| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.025, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 2.5e-05, |
| "grad_norm": 1.5854424238204956, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 23979292.0, |
| "loss/crossentropy": 2.746462106704712, |
| "loss/hidden": 2.294778823852539e-06, |
| "loss/logits": 0.0006695009651593864, |
| "loss/reg": 23979290.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 5e-05, |
| "grad_norm": 1.6366995573043823, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 23979292.0, |
| "loss/crossentropy": 2.6046388149261475, |
| "loss/hidden": 2.2202730178833008e-06, |
| "loss/logits": 0.0006655176403000951, |
| "loss/reg": 23979290.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 7.5e-05, |
| "grad_norm": 1.8284447193145752, |
| "learning_rate": 3e-06, |
| "loss": 23978872.0, |
| "loss/crossentropy": 2.878774404525757, |
| "loss/hidden": 2.3543834686279297e-06, |
| "loss/logits": 0.0006678127683699131, |
| "loss/reg": 23978870.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0001, |
| "grad_norm": 1.556563377380371, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 23978052.0, |
| "loss/crossentropy": 2.5428426265716553, |
| "loss/hidden": 2.2202730178833008e-06, |
| "loss/logits": 0.0006393545190803707, |
| "loss/reg": 23978050.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.000125, |
| "grad_norm": 1.5663527250289917, |
| "learning_rate": 5e-06, |
| "loss": 23977126.0, |
| "loss/crossentropy": 2.746645927429199, |
| "loss/hidden": 2.3245811462402344e-06, |
| "loss/logits": 0.0006400058045983315, |
| "loss/reg": 23977124.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00015, |
| "grad_norm": 1.4696992635726929, |
| "learning_rate": 6e-06, |
| "loss": 23975882.0, |
| "loss/crossentropy": 2.5563833713531494, |
| "loss/hidden": 2.5033950805664062e-06, |
| "loss/logits": 0.0006549001554958522, |
| "loss/reg": 23975880.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.000175, |
| "grad_norm": 1.4133743047714233, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 23974370.0, |
| "loss/crossentropy": 2.6262099742889404, |
| "loss/hidden": 3.11434268951416e-06, |
| "loss/logits": 0.0009181889472529292, |
| "loss/reg": 23974368.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0002, |
| "grad_norm": 1.2239353656768799, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 23972572.0, |
| "loss/crossentropy": 2.7176015377044678, |
| "loss/hidden": 5.334615707397461e-06, |
| "loss/logits": 0.0014323468785732985, |
| "loss/reg": 23972570.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.000225, |
| "grad_norm": 1.1190756559371948, |
| "learning_rate": 9e-06, |
| "loss": 23970580.0, |
| "loss/crossentropy": 2.3941187858581543, |
| "loss/hidden": 1.1801719665527344e-05, |
| "loss/logits": 0.002698513213545084, |
| "loss/reg": 23970578.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.00025, |
| "grad_norm": 1.1112234592437744, |
| "learning_rate": 1e-05, |
| "loss": 23968372.0, |
| "loss/crossentropy": 2.692230224609375, |
| "loss/hidden": 1.6450881958007812e-05, |
| "loss/logits": 0.0033866295125335455, |
| "loss/reg": 23968370.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.000275, |
| "grad_norm": 1.1117942333221436, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 23965842.0, |
| "loss/crossentropy": 2.483330249786377, |
| "loss/hidden": 2.300739288330078e-05, |
| "loss/logits": 0.004709391854703426, |
| "loss/reg": 23965840.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0003, |
| "grad_norm": 1.2483229637145996, |
| "learning_rate": 1.2e-05, |
| "loss": 23963038.0, |
| "loss/crossentropy": 2.868898391723633, |
| "loss/hidden": 3.24249267578125e-05, |
| "loss/logits": 0.00696477759629488, |
| "loss/reg": 23963036.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.000325, |
| "grad_norm": 1.1655009984970093, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 23960020.0, |
| "loss/crossentropy": 2.664245128631592, |
| "loss/hidden": 0.00238037109375, |
| "loss/logits": 0.009140372276306152, |
| "loss/reg": 23960018.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00035, |
| "grad_norm": 1.134067177772522, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 23956590.0, |
| "loss/crossentropy": 2.340970516204834, |
| "loss/hidden": 0.01171875, |
| "loss/logits": 0.012021241709589958, |
| "loss/reg": 23956588.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.000375, |
| "grad_norm": 1.3018617630004883, |
| "learning_rate": 1.5e-05, |
| "loss": 23952838.0, |
| "loss/crossentropy": 2.885324239730835, |
| "loss/hidden": 0.016357421875, |
| "loss/logits": 0.01563672348856926, |
| "loss/reg": 23952836.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0004, |
| "grad_norm": 1.1381428241729736, |
| "grad_norm_var": 0.053035585449344086, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 23948930.0, |
| "loss/crossentropy": 2.795009136199951, |
| "loss/hidden": 0.0093994140625, |
| "loss/logits": 0.012821242213249207, |
| "loss/reg": 23948928.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.000425, |
| "grad_norm": 1.1428611278533936, |
| "grad_norm_var": 0.05142304695606151, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 23944570.0, |
| "loss/crossentropy": 2.582778215408325, |
| "loss/hidden": 0.01171875, |
| "loss/logits": 0.013820314779877663, |
| "loss/reg": 23944568.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00045, |
| "grad_norm": 1.2635234594345093, |
| "grad_norm_var": 0.044517881443493876, |
| "learning_rate": 1.8e-05, |
| "loss": 23939936.0, |
| "loss/crossentropy": 2.978813648223877, |
| "loss/hidden": 0.0140380859375, |
| "loss/logits": 0.017139241099357605, |
| "loss/reg": 23939934.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.000475, |
| "grad_norm": 1.1270564794540405, |
| "grad_norm_var": 0.025814427140172333, |
| "learning_rate": 1.9e-05, |
| "loss": 23935036.0, |
| "loss/crossentropy": 2.894636392593384, |
| "loss/hidden": 0.0047607421875, |
| "loss/logits": 0.01289904024451971, |
| "loss/reg": 23935034.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 1.114075779914856, |
| "grad_norm_var": 0.020309137022476817, |
| "learning_rate": 2e-05, |
| "loss": 23929816.0, |
| "loss/crossentropy": 2.8091928958892822, |
| "loss/hidden": 0.0023956298828125, |
| "loss/logits": 0.01356520690023899, |
| "loss/reg": 23929814.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.000525, |
| "grad_norm": 1.0825225114822388, |
| "grad_norm_var": 0.013124059103994945, |
| "learning_rate": 2.1e-05, |
| "loss": 23924240.0, |
| "loss/crossentropy": 2.545332670211792, |
| "loss/hidden": 0.00238037109375, |
| "loss/logits": 0.011450954712927341, |
| "loss/reg": 23924238.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00055, |
| "grad_norm": 0.9934895634651184, |
| "grad_norm_var": 0.010042275575547288, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 23918148.0, |
| "loss/crossentropy": 2.438739776611328, |
| "loss/hidden": 0.0023956298828125, |
| "loss/logits": 0.009597850032150745, |
| "loss/reg": 23918146.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.000575, |
| "grad_norm": 1.2488293647766113, |
| "grad_norm_var": 0.0063549960248830304, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 23911768.0, |
| "loss/crossentropy": 3.0437679290771484, |
| "loss/hidden": 0.0023956298828125, |
| "loss/logits": 0.010121545754373074, |
| "loss/reg": 23911764.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0006, |
| "grad_norm": 1.0638293027877808, |
| "grad_norm_var": 0.006547273197725412, |
| "learning_rate": 2.4e-05, |
| "loss": 23904968.0, |
| "loss/crossentropy": 2.6823315620422363, |
| "loss/hidden": 0.0023956298828125, |
| "loss/logits": 0.011773628182709217, |
| "loss/reg": 23904966.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.000625, |
| "grad_norm": 1.1260216236114502, |
| "grad_norm_var": 0.0065236064676203725, |
| "learning_rate": 2.5e-05, |
| "loss": 23897788.0, |
| "loss/crossentropy": 2.694211483001709, |
| "loss/hidden": 0.0023956298828125, |
| "loss/logits": 0.010089674033224583, |
| "loss/reg": 23897786.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00065, |
| "grad_norm": 1.115420937538147, |
| "grad_norm_var": 0.006503945962090644, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 23890298.0, |
| "loss/crossentropy": 2.920772075653076, |
| "loss/hidden": 0.0047607421875, |
| "loss/logits": 0.013391964137554169, |
| "loss/reg": 23890296.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.000675, |
| "grad_norm": 1.1204029321670532, |
| "grad_norm_var": 0.006466351262282292, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 23882354.0, |
| "loss/crossentropy": 2.905461549758911, |
| "loss/hidden": 0.0047607421875, |
| "loss/logits": 0.013330795802175999, |
| "loss/reg": 23882352.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0007, |
| "grad_norm": 1.0363003015518188, |
| "grad_norm_var": 0.006471530545566025, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 23873970.0, |
| "loss/crossentropy": 2.6746280193328857, |
| "loss/hidden": 0.0023956298828125, |
| "loss/logits": 0.012038306333124638, |
| "loss/reg": 23873968.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.000725, |
| "grad_norm": 1.0674101114273071, |
| "grad_norm_var": 0.006685345648087789, |
| "learning_rate": 2.9e-05, |
| "loss": 23865218.0, |
| "loss/crossentropy": 2.785122871398926, |
| "loss/hidden": 0.007110595703125, |
| "loss/logits": 0.012699823826551437, |
| "loss/reg": 23865216.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 1.1132105588912964, |
| "grad_norm_var": 0.006700495384649096, |
| "learning_rate": 3e-05, |
| "loss": 23855898.0, |
| "loss/crossentropy": 2.806924343109131, |
| "loss/hidden": 0.007110595703125, |
| "loss/logits": 0.013277316465973854, |
| "loss/reg": 23855896.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.000775, |
| "grad_norm": 1.1099059581756592, |
| "grad_norm_var": 0.004564729466811205, |
| "learning_rate": 3.1e-05, |
| "loss": 23846260.0, |
| "loss/crossentropy": 2.5398411750793457, |
| "loss/hidden": 0.00946044921875, |
| "loss/logits": 0.017075439915060997, |
| "loss/reg": 23846258.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.0008, |
| "grad_norm": 1.1187163591384888, |
| "grad_norm_var": 0.004532095618039718, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 23835966.0, |
| "loss/crossentropy": 2.7458691596984863, |
| "loss/hidden": 0.007110595703125, |
| "loss/logits": 0.014956941828131676, |
| "loss/reg": 23835964.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.000825, |
| "grad_norm": 1.087096929550171, |
| "grad_norm_var": 0.004520956632664038, |
| "learning_rate": 3.3e-05, |
| "loss": 23825254.0, |
| "loss/crossentropy": 2.9982759952545166, |
| "loss/hidden": 0.0118408203125, |
| "loss/logits": 0.01633520796895027, |
| "loss/reg": 23825252.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00085, |
| "grad_norm": 1.1248656511306763, |
| "grad_norm_var": 0.0029164204742953446, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 23814122.0, |
| "loss/crossentropy": 2.938610076904297, |
| "loss/hidden": 0.0118408203125, |
| "loss/logits": 0.016870109364390373, |
| "loss/reg": 23814120.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.000875, |
| "grad_norm": 1.0480307340621948, |
| "grad_norm_var": 0.0030540199600813, |
| "learning_rate": 3.5e-05, |
| "loss": 23802424.0, |
| "loss/crossentropy": 2.4780776500701904, |
| "loss/hidden": 0.007110595703125, |
| "loss/logits": 0.013872872106730938, |
| "loss/reg": 23802422.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0009, |
| "grad_norm": 1.1236826181411743, |
| "grad_norm_var": 0.00308020941360246, |
| "learning_rate": 3.6e-05, |
| "loss": 23790394.0, |
| "loss/crossentropy": 2.87992787361145, |
| "loss/hidden": 0.0118408203125, |
| "loss/logits": 0.015783678740262985, |
| "loss/reg": 23790392.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.000925, |
| "grad_norm": 1.0299606323242188, |
| "grad_norm_var": 0.003366491791708848, |
| "learning_rate": 3.7e-05, |
| "loss": 23777894.0, |
| "loss/crossentropy": 2.445013999938965, |
| "loss/hidden": 0.00946044921875, |
| "loss/logits": 0.013436071574687958, |
| "loss/reg": 23777892.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00095, |
| "grad_norm": 1.0358997583389282, |
| "grad_norm_var": 0.0029023602377444392, |
| "learning_rate": 3.8e-05, |
| "loss": 23765000.0, |
| "loss/crossentropy": 2.51252818107605, |
| "loss/hidden": 0.0118408203125, |
| "loss/logits": 0.01649520918726921, |
| "loss/reg": 23764998.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.000975, |
| "grad_norm": 1.2583109140396118, |
| "grad_norm_var": 0.0030985333088665357, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 23751672.0, |
| "loss/crossentropy": 2.753805637359619, |
| "loss/hidden": 0.0118408203125, |
| "loss/logits": 0.017278429120779037, |
| "loss/reg": 23751670.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 1.084017276763916, |
| "grad_norm_var": 0.0030301656123009834, |
| "learning_rate": 4e-05, |
| "loss": 23737698.0, |
| "loss/crossentropy": 2.5898780822753906, |
| "loss/hidden": 0.01416015625, |
| "loss/logits": 0.01646130532026291, |
| "loss/reg": 23737696.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.001025, |
| "grad_norm": 1.1648989915847778, |
| "grad_norm_var": 0.0032597601013804934, |
| "learning_rate": 4.1e-05, |
| "loss": 23723434.0, |
| "loss/crossentropy": 2.980100154876709, |
| "loss/hidden": 0.018798828125, |
| "loss/logits": 0.019089510664343834, |
| "loss/reg": 23723432.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00105, |
| "grad_norm": 1.1199809312820435, |
| "grad_norm_var": 0.003268986651698924, |
| "learning_rate": 4.2e-05, |
| "loss": 23708500.0, |
| "loss/crossentropy": 2.625920534133911, |
| "loss/hidden": 0.01416015625, |
| "loss/logits": 0.016438281163573265, |
| "loss/reg": 23708498.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.001075, |
| "grad_norm": 1.1081212759017944, |
| "grad_norm_var": 0.003249372454232713, |
| "learning_rate": 4.3e-05, |
| "loss": 23693190.0, |
| "loss/crossentropy": 2.88944149017334, |
| "loss/hidden": 0.01416015625, |
| "loss/logits": 0.01906749978661537, |
| "loss/reg": 23693188.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0011, |
| "grad_norm": 1.0233378410339355, |
| "grad_norm_var": 0.0033732528132327743, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 23677338.0, |
| "loss/crossentropy": 2.740055561065674, |
| "loss/hidden": 0.0118408203125, |
| "loss/logits": 0.017147231847047806, |
| "loss/reg": 23677336.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.001125, |
| "grad_norm": 1.087576150894165, |
| "grad_norm_var": 0.003308109873993459, |
| "learning_rate": 4.5e-05, |
| "loss": 23661376.0, |
| "loss/crossentropy": 2.7993900775909424, |
| "loss/hidden": 0.018798828125, |
| "loss/logits": 0.018611574545502663, |
| "loss/reg": 23661374.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.00115, |
| "grad_norm": 1.096280574798584, |
| "grad_norm_var": 0.0033015097930141125, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 23644900.0, |
| "loss/crossentropy": 2.7295026779174805, |
| "loss/hidden": 0.018798828125, |
| "loss/logits": 0.02007441595196724, |
| "loss/reg": 23644898.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.001175, |
| "grad_norm": 1.0814288854599, |
| "grad_norm_var": 0.0033194895787734898, |
| "learning_rate": 4.7e-05, |
| "loss": 23627180.0, |
| "loss/crossentropy": 2.7218711376190186, |
| "loss/hidden": 0.018798828125, |
| "loss/logits": 0.01868956908583641, |
| "loss/reg": 23627178.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.0012, |
| "grad_norm": 1.1050183773040771, |
| "grad_norm_var": 0.0032961434967167117, |
| "learning_rate": 4.8e-05, |
| "loss": 23609456.0, |
| "loss/crossentropy": 2.707954168319702, |
| "loss/hidden": 0.0164794921875, |
| "loss/logits": 0.013874183408915997, |
| "loss/reg": 23609454.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.001225, |
| "grad_norm": 1.1689001321792603, |
| "grad_norm_var": 0.0035882950114247097, |
| "learning_rate": 4.9e-05, |
| "loss": 23591148.0, |
| "loss/crossentropy": 2.9289839267730713, |
| "loss/hidden": 0.018798828125, |
| "loss/logits": 0.01716800406575203, |
| "loss/reg": 23591146.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 1.0706809759140015, |
| "grad_norm_var": 0.003619381387810705, |
| "learning_rate": 5e-05, |
| "loss": 23572382.0, |
| "loss/crossentropy": 2.6460366249084473, |
| "loss/hidden": 0.018798828125, |
| "loss/logits": 0.02036178484559059, |
| "loss/reg": 23572380.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.001275, |
| "grad_norm": 1.0851693153381348, |
| "grad_norm_var": 0.003446348106013808, |
| "learning_rate": 5.1000000000000006e-05, |
| "loss": 23553052.0, |
| "loss/crossentropy": 2.9845404624938965, |
| "loss/hidden": 0.0211181640625, |
| "loss/logits": 0.01931852102279663, |
| "loss/reg": 23553050.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0013, |
| "grad_norm": 1.0980045795440674, |
| "grad_norm_var": 0.003415733047339131, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 23533346.0, |
| "loss/crossentropy": 2.7311153411865234, |
| "loss/hidden": 0.0164794921875, |
| "loss/logits": 0.01767030730843544, |
| "loss/reg": 23533344.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.001325, |
| "grad_norm": 1.0971754789352417, |
| "grad_norm_var": 0.0030605557130138905, |
| "learning_rate": 5.300000000000001e-05, |
| "loss": 23512176.0, |
| "loss/crossentropy": 2.7936437129974365, |
| "loss/hidden": 0.0211181640625, |
| "loss/logits": 0.02029425837099552, |
| "loss/reg": 23512174.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.00135, |
| "grad_norm": 1.021135926246643, |
| "grad_norm_var": 0.00321079419902374, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 23491054.0, |
| "loss/crossentropy": 2.4848897457122803, |
| "loss/hidden": 0.0257568359375, |
| "loss/logits": 0.021194741129875183, |
| "loss/reg": 23491052.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.001375, |
| "grad_norm": 1.2304422855377197, |
| "grad_norm_var": 0.002687346509064052, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 23469514.0, |
| "loss/crossentropy": 3.005992889404297, |
| "loss/hidden": 0.0302734375, |
| "loss/logits": 0.02294519916176796, |
| "loss/reg": 23469510.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.0014, |
| "grad_norm": 4.639296054840088, |
| "grad_norm_var": 0.7838620364237424, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 23447546.0, |
| "loss/crossentropy": 2.5872466564178467, |
| "loss/hidden": 0.0279541015625, |
| "loss/logits": 0.020812125876545906, |
| "loss/reg": 23447544.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.001425, |
| "grad_norm": 1.0491299629211426, |
| "grad_norm_var": 0.7871685268678944, |
| "learning_rate": 5.6999999999999996e-05, |
| "loss": 23425128.0, |
| "loss/crossentropy": 2.621213912963867, |
| "loss/hidden": 0.0279541015625, |
| "loss/logits": 0.021800218150019646, |
| "loss/reg": 23425126.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00145, |
| "grad_norm": 1.1081963777542114, |
| "grad_norm_var": 0.7874877279984801, |
| "learning_rate": 5.8e-05, |
| "loss": 23401876.0, |
| "loss/crossentropy": 2.71842622756958, |
| "loss/hidden": 0.03515625, |
| "loss/logits": 0.022813528776168823, |
| "loss/reg": 23401874.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.001475, |
| "grad_norm": 1.2238346338272095, |
| "grad_norm_var": 0.7851039329301315, |
| "learning_rate": 5.9e-05, |
| "loss": 23378118.0, |
| "loss/crossentropy": 2.5764827728271484, |
| "loss/hidden": 0.0302734375, |
| "loss/logits": 0.02273097261786461, |
| "loss/reg": 23378116.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 1.1989353895187378, |
| "grad_norm_var": 0.7799893316958446, |
| "learning_rate": 6e-05, |
| "loss": 23353888.0, |
| "loss/crossentropy": 3.1227993965148926, |
| "loss/hidden": 0.03515625, |
| "loss/logits": 0.02281002700328827, |
| "loss/reg": 23353884.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.001525, |
| "grad_norm": 1.268097996711731, |
| "grad_norm_var": 0.7760688893627872, |
| "learning_rate": 6.1e-05, |
| "loss": 23329136.0, |
| "loss/crossentropy": 2.5917367935180664, |
| "loss/hidden": 0.0302734375, |
| "loss/logits": 0.023834211751818657, |
| "loss/reg": 23329134.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00155, |
| "grad_norm": 1.2467041015625, |
| "grad_norm_var": 0.7724674225856768, |
| "learning_rate": 6.2e-05, |
| "loss": 23303966.0, |
| "loss/crossentropy": 2.958501100540161, |
| "loss/hidden": 0.037353515625, |
| "loss/logits": 0.02654830738902092, |
| "loss/reg": 23303964.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.001575, |
| "grad_norm": 1.2648345232009888, |
| "grad_norm_var": 0.7678612724909214, |
| "learning_rate": 6.3e-05, |
| "loss": 23278184.0, |
| "loss/crossentropy": 2.8347504138946533, |
| "loss/hidden": 0.042236328125, |
| "loss/logits": 0.025617901235818863, |
| "loss/reg": 23278182.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.0016, |
| "grad_norm": 1.1708258390426636, |
| "grad_norm_var": 0.7658312734634589, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 23251446.0, |
| "loss/crossentropy": 2.644226312637329, |
| "loss/hidden": 0.037353515625, |
| "loss/logits": 0.027889616787433624, |
| "loss/reg": 23251444.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.001625, |
| "grad_norm": 1.2009215354919434, |
| "grad_norm_var": 0.7650310583456122, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 23224268.0, |
| "loss/crossentropy": 2.9230639934539795, |
| "loss/hidden": 0.039794921875, |
| "loss/logits": 0.024060644209384918, |
| "loss/reg": 23224266.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00165, |
| "grad_norm": 1.1358376741409302, |
| "grad_norm_var": 0.7626670570699228, |
| "learning_rate": 6.6e-05, |
| "loss": 23196476.0, |
| "loss/crossentropy": 2.341249465942383, |
| "loss/hidden": 0.03515625, |
| "loss/logits": 0.01903577148914337, |
| "loss/reg": 23196474.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.001675, |
| "grad_norm": 1.3390766382217407, |
| "grad_norm_var": 0.7568027972321848, |
| "learning_rate": 6.7e-05, |
| "loss": 23168444.0, |
| "loss/crossentropy": 2.8148157596588135, |
| "loss/hidden": 0.037353515625, |
| "loss/logits": 0.023385722190141678, |
| "loss/reg": 23168442.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0017, |
| "grad_norm": 1.313673734664917, |
| "grad_norm_var": 0.7512190202529315, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 23139776.0, |
| "loss/crossentropy": 3.0276989936828613, |
| "loss/hidden": 0.039794921875, |
| "loss/logits": 0.028288207948207855, |
| "loss/reg": 23139772.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.001725, |
| "grad_norm": 1.213808536529541, |
| "grad_norm_var": 0.7472548934054827, |
| "learning_rate": 6.9e-05, |
| "loss": 23110438.0, |
| "loss/crossentropy": 2.642642021179199, |
| "loss/hidden": 0.04443359375, |
| "loss/logits": 0.03349403291940689, |
| "loss/reg": 23110436.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 1.2119991779327393, |
| "grad_norm_var": 0.7395327221097505, |
| "learning_rate": 7e-05, |
| "loss": 23080596.0, |
| "loss/crossentropy": 2.7086288928985596, |
| "loss/hidden": 0.042236328125, |
| "loss/logits": 0.026158086955547333, |
| "loss/reg": 23080594.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.001775, |
| "grad_norm": 1.190576434135437, |
| "grad_norm_var": 0.74067140104784, |
| "learning_rate": 7.1e-05, |
| "loss": 23050164.0, |
| "loss/crossentropy": 2.9833340644836426, |
| "loss/hidden": 0.046875, |
| "loss/logits": 0.03177064657211304, |
| "loss/reg": 23050162.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.0018, |
| "grad_norm": 1.0935430526733398, |
| "grad_norm_var": 0.0061142762548276035, |
| "learning_rate": 7.2e-05, |
| "loss": 23019096.0, |
| "loss/crossentropy": 2.696112871170044, |
| "loss/hidden": 0.042236328125, |
| "loss/logits": 0.021792840212583542, |
| "loss/reg": 23019094.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.001825, |
| "grad_norm": 1.1738063097000122, |
| "grad_norm_var": 0.004546633688142758, |
| "learning_rate": 7.3e-05, |
| "loss": 22985556.0, |
| "loss/crossentropy": 2.832942485809326, |
| "loss/hidden": 0.042236328125, |
| "loss/logits": 0.026215866208076477, |
| "loss/reg": 22985554.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00185, |
| "grad_norm": 1.1817190647125244, |
| "grad_norm_var": 0.003889763769933019, |
| "learning_rate": 7.4e-05, |
| "loss": 22952622.0, |
| "loss/crossentropy": 2.8479084968566895, |
| "loss/hidden": 0.039794921875, |
| "loss/logits": 0.02599533274769783, |
| "loss/reg": 22952620.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.001875, |
| "grad_norm": 1.1273410320281982, |
| "grad_norm_var": 0.00434854462148806, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 22917650.0, |
| "loss/crossentropy": 2.732118606567383, |
| "loss/hidden": 0.04443359375, |
| "loss/logits": 0.032023150473833084, |
| "loss/reg": 22917648.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0019, |
| "grad_norm": 1.7657408714294434, |
| "grad_norm_var": 0.02372529214584578, |
| "learning_rate": 7.6e-05, |
| "loss": 22883134.0, |
| "loss/crossentropy": 2.9701271057128906, |
| "loss/hidden": 0.049072265625, |
| "loss/logits": 0.03243165463209152, |
| "loss/reg": 22883132.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.001925, |
| "grad_norm": 1.1680349111557007, |
| "grad_norm_var": 0.024024990856963236, |
| "learning_rate": 7.7e-05, |
| "loss": 22848052.0, |
| "loss/crossentropy": 2.720111131668091, |
| "loss/hidden": 0.046875, |
| "loss/logits": 0.026185277849435806, |
| "loss/reg": 22848050.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.00195, |
| "grad_norm": 1.423579216003418, |
| "grad_norm_var": 0.026199649085415368, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 22811198.0, |
| "loss/crossentropy": 3.158296823501587, |
| "loss/hidden": 0.049072265625, |
| "loss/logits": 0.030892210081219673, |
| "loss/reg": 22811194.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.001975, |
| "grad_norm": 1.2518665790557861, |
| "grad_norm_var": 0.026181842559342527, |
| "learning_rate": 7.900000000000001e-05, |
| "loss": 22774578.0, |
| "loss/crossentropy": 2.735158681869507, |
| "loss/hidden": 0.049072265625, |
| "loss/logits": 0.030565477907657623, |
| "loss/reg": 22774576.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 1.08651602268219, |
| "grad_norm_var": 0.02748967104789794, |
| "learning_rate": 8e-05, |
| "loss": 22737422.0, |
| "loss/crossentropy": 2.5904459953308105, |
| "loss/hidden": 0.049560546875, |
| "loss/logits": 0.025047684088349342, |
| "loss/reg": 22737420.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.002025, |
| "grad_norm": 1.248820424079895, |
| "grad_norm_var": 0.027368305419468147, |
| "learning_rate": 8.1e-05, |
| "loss": 22699552.0, |
| "loss/crossentropy": 2.743410587310791, |
| "loss/hidden": 0.05126953125, |
| "loss/logits": 0.03643161803483963, |
| "loss/reg": 22699550.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.00205, |
| "grad_norm": 1.4418671131134033, |
| "grad_norm_var": 0.028752282496747956, |
| "learning_rate": 8.2e-05, |
| "loss": 22661236.0, |
| "loss/crossentropy": 2.8847553730010986, |
| "loss/hidden": 0.060791015625, |
| "loss/logits": 0.030914129689335823, |
| "loss/reg": 22661234.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.002075, |
| "grad_norm": 1.2557183504104614, |
| "grad_norm_var": 0.028357671354033442, |
| "learning_rate": 8.3e-05, |
| "loss": 22621418.0, |
| "loss/crossentropy": 2.889835834503174, |
| "loss/hidden": 0.0537109375, |
| "loss/logits": 0.0339287668466568, |
| "loss/reg": 22621416.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0021, |
| "grad_norm": 1.502173662185669, |
| "grad_norm_var": 0.0319453250976359, |
| "learning_rate": 8.4e-05, |
| "loss": 22581462.0, |
| "loss/crossentropy": 3.021746873855591, |
| "loss/hidden": 0.06494140625, |
| "loss/logits": 0.05287540704011917, |
| "loss/reg": 22581458.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.002125, |
| "grad_norm": 1.3276371955871582, |
| "grad_norm_var": 0.03188607801059575, |
| "learning_rate": 8.5e-05, |
| "loss": 22540664.0, |
| "loss/crossentropy": 2.7517428398132324, |
| "loss/hidden": 0.055908203125, |
| "loss/logits": 0.031271547079086304, |
| "loss/reg": 22540662.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.00215, |
| "grad_norm": 1.3479669094085693, |
| "grad_norm_var": 0.0318416680217787, |
| "learning_rate": 8.6e-05, |
| "loss": 22499866.0, |
| "loss/crossentropy": 2.769167184829712, |
| "loss/hidden": 0.05810546875, |
| "loss/logits": 0.030875004827976227, |
| "loss/reg": 22499864.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.002175, |
| "grad_norm": 1.1761291027069092, |
| "grad_norm_var": 0.03203984196921249, |
| "learning_rate": 8.7e-05, |
| "loss": 22458496.0, |
| "loss/crossentropy": 2.667074680328369, |
| "loss/hidden": 0.053955078125, |
| "loss/logits": 0.030924052000045776, |
| "loss/reg": 22458494.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.0022, |
| "grad_norm": 1.2423137426376343, |
| "grad_norm_var": 0.029609932106258913, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 22416698.0, |
| "loss/crossentropy": 2.8491878509521484, |
| "loss/hidden": 0.058349609375, |
| "loss/logits": 0.03164982795715332, |
| "loss/reg": 22416696.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.002225, |
| "grad_norm": 1.3169476985931396, |
| "grad_norm_var": 0.028576010957647627, |
| "learning_rate": 8.900000000000001e-05, |
| "loss": 22374172.0, |
| "loss/crossentropy": 2.9881014823913574, |
| "loss/hidden": 0.0654296875, |
| "loss/logits": 0.03372816741466522, |
| "loss/reg": 22374170.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 1.2177362442016602, |
| "grad_norm_var": 0.02806974807590302, |
| "learning_rate": 9e-05, |
| "loss": 22331298.0, |
| "loss/crossentropy": 2.6472673416137695, |
| "loss/hidden": 0.058349609375, |
| "loss/logits": 0.03545096144080162, |
| "loss/reg": 22331296.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.002275, |
| "grad_norm": 1.3880282640457153, |
| "grad_norm_var": 0.02609769625732549, |
| "learning_rate": 9.1e-05, |
| "loss": 22287586.0, |
| "loss/crossentropy": 2.3834619522094727, |
| "loss/hidden": 0.0673828125, |
| "loss/logits": 0.04082828015089035, |
| "loss/reg": 22287584.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.0023, |
| "grad_norm": 1.2139270305633545, |
| "grad_norm_var": 0.012522276427923392, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 22243676.0, |
| "loss/crossentropy": 2.514038562774658, |
| "loss/hidden": 0.06298828125, |
| "loss/logits": 0.03664637729525566, |
| "loss/reg": 22243674.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.002325, |
| "grad_norm": 1.218690276145935, |
| "grad_norm_var": 0.011871866332385631, |
| "learning_rate": 9.300000000000001e-05, |
| "loss": 22196858.0, |
| "loss/crossentropy": 2.7747766971588135, |
| "loss/hidden": 0.055908203125, |
| "loss/logits": 0.03466580808162689, |
| "loss/reg": 22196856.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.00235, |
| "grad_norm": 1.425099492073059, |
| "grad_norm_var": 0.01189883541788858, |
| "learning_rate": 9.4e-05, |
| "loss": 22150936.0, |
| "loss/crossentropy": 2.751585006713867, |
| "loss/hidden": 0.06982421875, |
| "loss/logits": 0.039963848888874054, |
| "loss/reg": 22150934.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.002375, |
| "grad_norm": 1.3297115564346313, |
| "grad_norm_var": 0.011867869002290376, |
| "learning_rate": 9.5e-05, |
| "loss": 22104462.0, |
| "loss/crossentropy": 2.548816204071045, |
| "loss/hidden": 0.058349609375, |
| "loss/logits": 0.03424321860074997, |
| "loss/reg": 22104460.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.0024, |
| "grad_norm": 1.436805009841919, |
| "grad_norm_var": 0.009743193140733731, |
| "learning_rate": 9.6e-05, |
| "loss": 22057430.0, |
| "loss/crossentropy": 3.3089489936828613, |
| "loss/hidden": 0.0673828125, |
| "loss/logits": 0.03730063512921333, |
| "loss/reg": 22057426.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.002425, |
| "grad_norm": 1.4323129653930664, |
| "grad_norm_var": 0.01015260899747735, |
| "learning_rate": 9.7e-05, |
| "loss": 22009754.0, |
| "loss/crossentropy": 2.8683815002441406, |
| "loss/hidden": 0.07177734375, |
| "loss/logits": 0.05274114012718201, |
| "loss/reg": 22009752.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.00245, |
| "grad_norm": 1.6781682968139648, |
| "grad_norm_var": 0.01718073408620716, |
| "learning_rate": 9.8e-05, |
| "loss": 21961312.0, |
| "loss/crossentropy": 2.790402889251709, |
| "loss/hidden": 0.08154296875, |
| "loss/logits": 0.04316875338554382, |
| "loss/reg": 21961310.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.002475, |
| "grad_norm": 2.6317436695098877, |
| "grad_norm_var": 0.11926252206686451, |
| "learning_rate": 9.900000000000001e-05, |
| "loss": 21912664.0, |
| "loss/crossentropy": 3.310865640640259, |
| "loss/hidden": 0.09716796875, |
| "loss/logits": 0.05726727843284607, |
| "loss/reg": 21912660.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 1.9610826969146729, |
| "grad_norm_var": 0.1368204017719316, |
| "learning_rate": 0.0001, |
| "loss": 21863312.0, |
| "loss/crossentropy": 2.5899853706359863, |
| "loss/hidden": 0.08349609375, |
| "loss/logits": 0.05911686643958092, |
| "loss/reg": 21863310.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.002525, |
| "grad_norm": 1.293816089630127, |
| "grad_norm_var": 0.13748435611130952, |
| "learning_rate": 0.0001, |
| "loss": 21813370.0, |
| "loss/crossentropy": 2.7261266708374023, |
| "loss/hidden": 0.083984375, |
| "loss/logits": 0.04105532914400101, |
| "loss/reg": 21813368.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.00255, |
| "grad_norm": 1.3441261053085327, |
| "grad_norm_var": 0.13754106604874797, |
| "learning_rate": 0.0001, |
| "loss": 21760636.0, |
| "loss/crossentropy": 2.932553768157959, |
| "loss/hidden": 0.09765625, |
| "loss/logits": 0.044218409806489944, |
| "loss/reg": 21760634.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.002575, |
| "grad_norm": 1.2986276149749756, |
| "grad_norm_var": 0.133896905014646, |
| "learning_rate": 0.0001, |
| "loss": 21707878.0, |
| "loss/crossentropy": 2.7184226512908936, |
| "loss/hidden": 0.0810546875, |
| "loss/logits": 0.043878279626369476, |
| "loss/reg": 21707876.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.0026, |
| "grad_norm": 1.268932819366455, |
| "grad_norm_var": 0.13315324037138362, |
| "learning_rate": 0.0001, |
| "loss": 21656358.0, |
| "loss/crossentropy": 2.5228793621063232, |
| "loss/hidden": 0.07470703125, |
| "loss/logits": 0.03703644871711731, |
| "loss/reg": 21656356.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.002625, |
| "grad_norm": 2.0396835803985596, |
| "grad_norm_var": 0.1514380152717677, |
| "learning_rate": 0.0001, |
| "loss": 21604872.0, |
| "loss/crossentropy": 2.7191953659057617, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.05202098190784454, |
| "loss/reg": 21604870.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.00265, |
| "grad_norm": 1.3344433307647705, |
| "grad_norm_var": 0.14772341480061418, |
| "learning_rate": 0.0001, |
| "loss": 21553338.0, |
| "loss/crossentropy": 2.982604742050171, |
| "loss/hidden": 0.0908203125, |
| "loss/logits": 0.04185960069298744, |
| "loss/reg": 21553336.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.002675, |
| "grad_norm": 1.3174992799758911, |
| "grad_norm_var": 0.14926077853220499, |
| "learning_rate": 0.0001, |
| "loss": 21501964.0, |
| "loss/crossentropy": 2.64125657081604, |
| "loss/hidden": 0.10009765625, |
| "loss/logits": 0.05030299350619316, |
| "loss/reg": 21501962.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0027, |
| "grad_norm": 1.5658462047576904, |
| "grad_norm_var": 0.1429190673878617, |
| "learning_rate": 0.0001, |
| "loss": 21449158.0, |
| "loss/crossentropy": 2.9215292930603027, |
| "loss/hidden": 0.0927734375, |
| "loss/logits": 0.04710224270820618, |
| "loss/reg": 21449156.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.002725, |
| "grad_norm": 9.12102222442627, |
| "grad_norm_var": 3.711476850571243, |
| "learning_rate": 0.0001, |
| "loss": 21397206.0, |
| "loss/crossentropy": 3.46460223197937, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.33538657426834106, |
| "loss/reg": 21397202.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 1.3513977527618408, |
| "grad_norm_var": 3.717759980418676, |
| "learning_rate": 0.0001, |
| "loss": 21344904.0, |
| "loss/crossentropy": 2.5775856971740723, |
| "loss/hidden": 0.10888671875, |
| "loss/logits": 0.0566323846578598, |
| "loss/reg": 21344902.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.002775, |
| "grad_norm": 1.5712535381317139, |
| "grad_norm_var": 3.6990037032145606, |
| "learning_rate": 0.0001, |
| "loss": 21293542.0, |
| "loss/crossentropy": 2.57499623298645, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.06146764010190964, |
| "loss/reg": 21293540.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.0028, |
| "grad_norm": 1.2029298543930054, |
| "grad_norm_var": 3.7212451226956627, |
| "learning_rate": 0.0001, |
| "loss": 21240130.0, |
| "loss/crossentropy": 2.7808516025543213, |
| "loss/hidden": 0.09130859375, |
| "loss/logits": 0.04128836840391159, |
| "loss/reg": 21240128.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.002825, |
| "grad_norm": 1.5115809440612793, |
| "grad_norm_var": 3.7153651768024964, |
| "learning_rate": 0.0001, |
| "loss": 21186670.0, |
| "loss/crossentropy": 2.824909210205078, |
| "loss/hidden": 0.0927734375, |
| "loss/logits": 0.055429115891456604, |
| "loss/reg": 21186668.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.00285, |
| "grad_norm": 1.4047253131866455, |
| "grad_norm_var": 3.7328935183591323, |
| "learning_rate": 0.0001, |
| "loss": 21134034.0, |
| "loss/crossentropy": 2.7012040615081787, |
| "loss/hidden": 0.09033203125, |
| "loss/logits": 0.045089542865753174, |
| "loss/reg": 21134032.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.002875, |
| "grad_norm": 1.1647084951400757, |
| "grad_norm_var": 3.746507312766105, |
| "learning_rate": 0.0001, |
| "loss": 21081756.0, |
| "loss/crossentropy": 2.6434690952301025, |
| "loss/hidden": 0.0888671875, |
| "loss/logits": 0.043579697608947754, |
| "loss/reg": 21081754.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0029, |
| "grad_norm": 1.2545047998428345, |
| "grad_norm_var": 3.77402667840472, |
| "learning_rate": 0.0001, |
| "loss": 21029456.0, |
| "loss/crossentropy": 2.691488265991211, |
| "loss/hidden": 0.0888671875, |
| "loss/logits": 0.043844159692525864, |
| "loss/reg": 21029454.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.002925, |
| "grad_norm": 1.178173542022705, |
| "grad_norm_var": 3.7838672420797472, |
| "learning_rate": 0.0001, |
| "loss": 20977194.0, |
| "loss/crossentropy": 2.6559066772460938, |
| "loss/hidden": 0.09521484375, |
| "loss/logits": 0.05716244503855705, |
| "loss/reg": 20977192.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.00295, |
| "grad_norm": 1.2668310403823853, |
| "grad_norm_var": 3.7896664013965315, |
| "learning_rate": 0.0001, |
| "loss": 20922972.0, |
| "loss/crossentropy": 2.740365743637085, |
| "loss/hidden": 0.0810546875, |
| "loss/logits": 0.041061073541641235, |
| "loss/reg": 20922970.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.002975, |
| "grad_norm": 1.2409541606903076, |
| "grad_norm_var": 3.7942354219228878, |
| "learning_rate": 0.0001, |
| "loss": 20868506.0, |
| "loss/crossentropy": 2.961731195449829, |
| "loss/hidden": 0.0859375, |
| "loss/logits": 0.044006288051605225, |
| "loss/reg": 20868504.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 1.228840947151184, |
| "grad_norm_var": 3.797507002491694, |
| "learning_rate": 0.0001, |
| "loss": 20815554.0, |
| "loss/crossentropy": 2.8881726264953613, |
| "loss/hidden": 0.1025390625, |
| "loss/logits": 0.045989107340574265, |
| "loss/reg": 20815552.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.003025, |
| "grad_norm": 1.2890663146972656, |
| "grad_norm_var": 3.814702938559192, |
| "learning_rate": 0.0001, |
| "loss": 20762824.0, |
| "loss/crossentropy": 2.6567206382751465, |
| "loss/hidden": 0.09521484375, |
| "loss/logits": 0.05085486173629761, |
| "loss/reg": 20762822.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.00305, |
| "grad_norm": 1.5960758924484253, |
| "grad_norm_var": 3.802296234755437, |
| "learning_rate": 0.0001, |
| "loss": 20708376.0, |
| "loss/crossentropy": 3.12703537940979, |
| "loss/hidden": 0.1015625, |
| "loss/logits": 0.049035705626010895, |
| "loss/reg": 20708372.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.003075, |
| "grad_norm": 1.2626285552978516, |
| "grad_norm_var": 3.806227243092966, |
| "learning_rate": 0.0001, |
| "loss": 20655212.0, |
| "loss/crossentropy": 2.5815978050231934, |
| "loss/hidden": 0.0908203125, |
| "loss/logits": 0.03751040995121002, |
| "loss/reg": 20655210.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.0031, |
| "grad_norm": 1.4448137283325195, |
| "grad_norm_var": 3.8113355638748563, |
| "learning_rate": 0.0001, |
| "loss": 20601744.0, |
| "loss/crossentropy": 2.7269201278686523, |
| "loss/hidden": 0.10400390625, |
| "loss/logits": 0.05382286012172699, |
| "loss/reg": 20601742.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.003125, |
| "grad_norm": 1.2743065357208252, |
| "grad_norm_var": 0.01898643087393239, |
| "learning_rate": 0.0001, |
| "loss": 20548514.0, |
| "loss/crossentropy": 2.9527673721313477, |
| "loss/hidden": 0.09521484375, |
| "loss/logits": 0.04568991810083389, |
| "loss/reg": 20548512.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.00315, |
| "grad_norm": 1.2714648246765137, |
| "grad_norm_var": 0.01913292417152377, |
| "learning_rate": 0.0001, |
| "loss": 20495564.0, |
| "loss/crossentropy": 2.7679290771484375, |
| "loss/hidden": 0.09521484375, |
| "loss/logits": 0.04457134008407593, |
| "loss/reg": 20495562.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.003175, |
| "grad_norm": 1.1398032903671265, |
| "grad_norm_var": 0.016467563806870977, |
| "learning_rate": 0.0001, |
| "loss": 20442526.0, |
| "loss/crossentropy": 2.5426719188690186, |
| "loss/hidden": 0.0888671875, |
| "loss/logits": 0.037348851561546326, |
| "loss/reg": 20442524.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.0032, |
| "grad_norm": 1.1865832805633545, |
| "grad_norm_var": 0.016686489366070607, |
| "learning_rate": 0.0001, |
| "loss": 20389632.0, |
| "loss/crossentropy": 2.7126035690307617, |
| "loss/hidden": 0.08837890625, |
| "loss/logits": 0.04238874465227127, |
| "loss/reg": 20389630.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.003225, |
| "grad_norm": 1.1992998123168945, |
| "grad_norm_var": 0.013750721558917196, |
| "learning_rate": 0.0001, |
| "loss": 20336756.0, |
| "loss/crossentropy": 2.4656715393066406, |
| "loss/hidden": 0.0927734375, |
| "loss/logits": 0.04687324911355972, |
| "loss/reg": 20336754.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 1.591745376586914, |
| "grad_norm_var": 0.019167251110399994, |
| "learning_rate": 0.0001, |
| "loss": 20283658.0, |
| "loss/crossentropy": 2.8413236141204834, |
| "loss/hidden": 0.1015625, |
| "loss/logits": 0.0509529635310173, |
| "loss/reg": 20283656.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.003275, |
| "grad_norm": 1.7687066793441772, |
| "grad_norm_var": 0.0321306713998105, |
| "learning_rate": 0.0001, |
| "loss": 20228520.0, |
| "loss/crossentropy": 2.6996963024139404, |
| "loss/hidden": 0.1015625, |
| "loss/logits": 0.050136156380176544, |
| "loss/reg": 20228518.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0033, |
| "grad_norm": 1.398307204246521, |
| "grad_norm_var": 0.0320788978008153, |
| "learning_rate": 0.0001, |
| "loss": 20174934.0, |
| "loss/crossentropy": 2.7786731719970703, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.056733570992946625, |
| "loss/reg": 20174932.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.003325, |
| "grad_norm": 1.3080873489379883, |
| "grad_norm_var": 0.030441473964627146, |
| "learning_rate": 0.0001, |
| "loss": 20121414.0, |
| "loss/crossentropy": 2.492565631866455, |
| "loss/hidden": 0.1025390625, |
| "loss/logits": 0.05269423872232437, |
| "loss/reg": 20121412.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.00335, |
| "grad_norm": 1.2051341533660889, |
| "grad_norm_var": 0.03129543343044545, |
| "learning_rate": 0.0001, |
| "loss": 20066430.0, |
| "loss/crossentropy": 2.7192039489746094, |
| "loss/hidden": 0.09521484375, |
| "loss/logits": 0.04698524624109268, |
| "loss/reg": 20066428.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.003375, |
| "grad_norm": 1.3290663957595825, |
| "grad_norm_var": 0.030642147459965942, |
| "learning_rate": 0.0001, |
| "loss": 20012506.0, |
| "loss/crossentropy": 2.8672525882720947, |
| "loss/hidden": 0.09765625, |
| "loss/logits": 0.0463830940425396, |
| "loss/reg": 20012504.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.0034, |
| "grad_norm": 1.30889892578125, |
| "grad_norm_var": 0.02982019196275818, |
| "learning_rate": 0.0001, |
| "loss": 19958618.0, |
| "loss/crossentropy": 2.684338092803955, |
| "loss/hidden": 0.09521484375, |
| "loss/logits": 0.04849115014076233, |
| "loss/reg": 19958616.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.003425, |
| "grad_norm": 1.4370224475860596, |
| "grad_norm_var": 0.030018383781239243, |
| "learning_rate": 0.0001, |
| "loss": 19904722.0, |
| "loss/crossentropy": 2.377598762512207, |
| "loss/hidden": 0.1064453125, |
| "loss/logits": 0.04614124447107315, |
| "loss/reg": 19904720.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.00345, |
| "grad_norm": 1.2927887439727783, |
| "grad_norm_var": 0.026124641293548242, |
| "learning_rate": 0.0001, |
| "loss": 19850890.0, |
| "loss/crossentropy": 2.8107824325561523, |
| "loss/hidden": 0.1044921875, |
| "loss/logits": 0.04102378338575363, |
| "loss/reg": 19850888.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.003475, |
| "grad_norm": 1.2370617389678955, |
| "grad_norm_var": 0.026424700169928646, |
| "learning_rate": 0.0001, |
| "loss": 19797298.0, |
| "loss/crossentropy": 2.5717079639434814, |
| "loss/hidden": 0.099609375, |
| "loss/logits": 0.05525430664420128, |
| "loss/reg": 19797296.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 1.6020715236663818, |
| "grad_norm_var": 0.030229503557688987, |
| "learning_rate": 0.0001, |
| "loss": 19743302.0, |
| "loss/crossentropy": 3.2840170860290527, |
| "loss/hidden": 0.12255859375, |
| "loss/logits": 0.0607365220785141, |
| "loss/reg": 19743298.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.003525, |
| "grad_norm": 1.2904223203659058, |
| "grad_norm_var": 0.030089756158917604, |
| "learning_rate": 0.0001, |
| "loss": 19687738.0, |
| "loss/crossentropy": 2.3972127437591553, |
| "loss/hidden": 0.099609375, |
| "loss/logits": 0.04840380698442459, |
| "loss/reg": 19687736.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.00355, |
| "grad_norm": 1.4341049194335938, |
| "grad_norm_var": 0.03008538363282387, |
| "learning_rate": 0.0001, |
| "loss": 19633738.0, |
| "loss/crossentropy": 2.512824296951294, |
| "loss/hidden": 0.1064453125, |
| "loss/logits": 0.0616685189306736, |
| "loss/reg": 19633736.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.003575, |
| "grad_norm": 1.321701169013977, |
| "grad_norm_var": 0.0268597015042511, |
| "learning_rate": 0.0001, |
| "loss": 19579358.0, |
| "loss/crossentropy": 2.532776117324829, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.053171977400779724, |
| "loss/reg": 19579356.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.0036, |
| "grad_norm": 1.2932393550872803, |
| "grad_norm_var": 0.02497033448003183, |
| "learning_rate": 0.0001, |
| "loss": 19525614.0, |
| "loss/crossentropy": 2.7050604820251465, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.05065223202109337, |
| "loss/reg": 19525612.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.003625, |
| "grad_norm": 1.6670914888381958, |
| "grad_norm_var": 0.02761950289613537, |
| "learning_rate": 0.0001, |
| "loss": 19471726.0, |
| "loss/crossentropy": 3.175818681716919, |
| "loss/hidden": 0.12255859375, |
| "loss/logits": 0.059824831783771515, |
| "loss/reg": 19471722.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.00365, |
| "grad_norm": 1.669080138206482, |
| "grad_norm_var": 0.02991537021935873, |
| "learning_rate": 0.0001, |
| "loss": 19417332.0, |
| "loss/crossentropy": 3.160999298095703, |
| "loss/hidden": 0.1064453125, |
| "loss/logits": 0.0447954386472702, |
| "loss/reg": 19417328.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.003675, |
| "grad_norm": 1.6329416036605835, |
| "grad_norm_var": 0.024577218400994562, |
| "learning_rate": 0.0001, |
| "loss": 19362994.0, |
| "loss/crossentropy": 2.6171441078186035, |
| "loss/hidden": 0.1162109375, |
| "loss/logits": 0.06706968694925308, |
| "loss/reg": 19362992.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.0037, |
| "grad_norm": 1.8200608491897583, |
| "grad_norm_var": 0.03550432157051085, |
| "learning_rate": 0.0001, |
| "loss": 19308968.0, |
| "loss/crossentropy": 2.8265509605407715, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.05379139631986618, |
| "loss/reg": 19308966.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.003725, |
| "grad_norm": 1.4770478010177612, |
| "grad_norm_var": 0.03458606072885762, |
| "learning_rate": 0.0001, |
| "loss": 19253440.0, |
| "loss/crossentropy": 2.368999481201172, |
| "loss/hidden": 0.12060546875, |
| "loss/logits": 0.061977319419384, |
| "loss/reg": 19253438.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 1.3455870151519775, |
| "grad_norm_var": 0.03144671611566139, |
| "learning_rate": 0.0001, |
| "loss": 19199202.0, |
| "loss/crossentropy": 2.6737399101257324, |
| "loss/hidden": 0.11328125, |
| "loss/logits": 0.041911885142326355, |
| "loss/reg": 19199200.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.003775, |
| "grad_norm": 1.4803009033203125, |
| "grad_norm_var": 0.030490327620056174, |
| "learning_rate": 0.0001, |
| "loss": 19145438.0, |
| "loss/crossentropy": 2.828763008117676, |
| "loss/hidden": 0.12255859375, |
| "loss/logits": 0.06406622380018234, |
| "loss/reg": 19145436.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.0038, |
| "grad_norm": 1.502377986907959, |
| "grad_norm_var": 0.02901352585353904, |
| "learning_rate": 0.0001, |
| "loss": 19089612.0, |
| "loss/crossentropy": 3.127129316329956, |
| "loss/hidden": 0.123046875, |
| "loss/logits": 0.055032793432474136, |
| "loss/reg": 19089608.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.003825, |
| "grad_norm": 1.751934289932251, |
| "grad_norm_var": 0.033871822986112396, |
| "learning_rate": 0.0001, |
| "loss": 19035182.0, |
| "loss/crossentropy": 2.866281509399414, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.0661492794752121, |
| "loss/reg": 19035180.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.00385, |
| "grad_norm": 1.3410906791687012, |
| "grad_norm_var": 0.03275648001013085, |
| "learning_rate": 0.0001, |
| "loss": 18981036.0, |
| "loss/crossentropy": 2.321021795272827, |
| "loss/hidden": 0.11083984375, |
| "loss/logits": 0.05458283796906471, |
| "loss/reg": 18981034.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.003875, |
| "grad_norm": 1.3823696374893188, |
| "grad_norm_var": 0.029143984317305655, |
| "learning_rate": 0.0001, |
| "loss": 18926840.0, |
| "loss/crossentropy": 2.425764560699463, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.04776450991630554, |
| "loss/reg": 18926838.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0039, |
| "grad_norm": 2.480062961578369, |
| "grad_norm_var": 0.0891887833302062, |
| "learning_rate": 0.0001, |
| "loss": 18872570.0, |
| "loss/crossentropy": 2.6627087593078613, |
| "loss/hidden": 0.11865234375, |
| "loss/logits": 0.052156057208776474, |
| "loss/reg": 18872568.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.003925, |
| "grad_norm": 2.14629864692688, |
| "grad_norm_var": 0.10471164434709544, |
| "learning_rate": 0.0001, |
| "loss": 18818126.0, |
| "loss/crossentropy": 2.6637701988220215, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.058607276529073715, |
| "loss/reg": 18818124.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.00395, |
| "grad_norm": 1.4461016654968262, |
| "grad_norm_var": 0.10444075430808931, |
| "learning_rate": 0.0001, |
| "loss": 18764118.0, |
| "loss/crossentropy": 2.579051971435547, |
| "loss/hidden": 0.1181640625, |
| "loss/logits": 0.049537718296051025, |
| "loss/reg": 18764116.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.003975, |
| "grad_norm": 1.8066216707229614, |
| "grad_norm_var": 0.10050819563091687, |
| "learning_rate": 0.0001, |
| "loss": 18710078.0, |
| "loss/crossentropy": 2.7758357524871826, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.06919336318969727, |
| "loss/reg": 18710076.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 1.2947094440460205, |
| "grad_norm_var": 0.10044033447050389, |
| "learning_rate": 0.0001, |
| "loss": 18656092.0, |
| "loss/crossentropy": 2.577569007873535, |
| "loss/hidden": 0.111328125, |
| "loss/logits": 0.04529140144586563, |
| "loss/reg": 18656090.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.004025, |
| "grad_norm": 1.2442281246185303, |
| "grad_norm_var": 0.11010166357437257, |
| "learning_rate": 0.0001, |
| "loss": 18600634.0, |
| "loss/crossentropy": 2.6058568954467773, |
| "loss/hidden": 0.11376953125, |
| "loss/logits": 0.04296538978815079, |
| "loss/reg": 18600632.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.00405, |
| "grad_norm": 1.238753318786621, |
| "grad_norm_var": 0.11850373143602529, |
| "learning_rate": 0.0001, |
| "loss": 18546590.0, |
| "loss/crossentropy": 2.848583459854126, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.04048790782690048, |
| "loss/reg": 18546588.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.004075, |
| "grad_norm": 1.2021461725234985, |
| "grad_norm_var": 0.1274584846210776, |
| "learning_rate": 0.0001, |
| "loss": 18492182.0, |
| "loss/crossentropy": 2.683387517929077, |
| "loss/hidden": 0.10888671875, |
| "loss/logits": 0.05293326824903488, |
| "loss/reg": 18492180.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.0041, |
| "grad_norm": 1.3073196411132812, |
| "grad_norm_var": 0.1261094481476486, |
| "learning_rate": 0.0001, |
| "loss": 18438208.0, |
| "loss/crossentropy": 2.8019156455993652, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.05101510137319565, |
| "loss/reg": 18438206.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.004125, |
| "grad_norm": 1.3466458320617676, |
| "grad_norm_var": 0.1280570014558648, |
| "learning_rate": 0.0001, |
| "loss": 18383844.0, |
| "loss/crossentropy": 2.914206027984619, |
| "loss/hidden": 0.1181640625, |
| "loss/logits": 0.05748095363378525, |
| "loss/reg": 18383842.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.00415, |
| "grad_norm": 1.3539113998413086, |
| "grad_norm_var": 0.12786798777855538, |
| "learning_rate": 0.0001, |
| "loss": 18329448.0, |
| "loss/crossentropy": 2.554934501647949, |
| "loss/hidden": 0.11767578125, |
| "loss/logits": 0.06395170092582703, |
| "loss/reg": 18329446.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.004175, |
| "grad_norm": 1.3374489545822144, |
| "grad_norm_var": 0.12990535124368588, |
| "learning_rate": 0.0001, |
| "loss": 18275214.0, |
| "loss/crossentropy": 2.657240629196167, |
| "loss/hidden": 0.111328125, |
| "loss/logits": 0.048241015523672104, |
| "loss/reg": 18275212.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.0042, |
| "grad_norm": 1.3718914985656738, |
| "grad_norm_var": 0.13112607550112684, |
| "learning_rate": 0.0001, |
| "loss": 18221442.0, |
| "loss/crossentropy": 2.409538507461548, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.04052230715751648, |
| "loss/reg": 18221440.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.004225, |
| "grad_norm": 1.225712537765503, |
| "grad_norm_var": 0.13098245397278915, |
| "learning_rate": 0.0001, |
| "loss": 18167330.0, |
| "loss/crossentropy": 2.777529716491699, |
| "loss/hidden": 0.12255859375, |
| "loss/logits": 0.06095242500305176, |
| "loss/reg": 18167328.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 1.6373209953308105, |
| "grad_norm_var": 0.1313622855512598, |
| "learning_rate": 0.0001, |
| "loss": 18111558.0, |
| "loss/crossentropy": 3.0446035861968994, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.06305646896362305, |
| "loss/reg": 18111554.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.004275, |
| "grad_norm": 1.216644287109375, |
| "grad_norm_var": 0.13543162792952987, |
| "learning_rate": 0.0001, |
| "loss": 18057268.0, |
| "loss/crossentropy": 2.627608060836792, |
| "loss/hidden": 0.111328125, |
| "loss/logits": 0.05334187299013138, |
| "loss/reg": 18057266.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.0043, |
| "grad_norm": 1.428633451461792, |
| "grad_norm_var": 0.0641143128722513, |
| "learning_rate": 0.0001, |
| "loss": 18003362.0, |
| "loss/crossentropy": 2.7025094032287598, |
| "loss/hidden": 0.126953125, |
| "loss/logits": 0.06098884344100952, |
| "loss/reg": 18003360.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.004325, |
| "grad_norm": 1.2322746515274048, |
| "grad_norm_var": 0.026934781647535287, |
| "learning_rate": 0.0001, |
| "loss": 17949652.0, |
| "loss/crossentropy": 2.638559103012085, |
| "loss/hidden": 0.10693359375, |
| "loss/logits": 0.049217596650123596, |
| "loss/reg": 17949650.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.00435, |
| "grad_norm": 1.338496446609497, |
| "grad_norm_var": 0.026360686408602875, |
| "learning_rate": 0.0001, |
| "loss": 17895770.0, |
| "loss/crossentropy": 2.855954647064209, |
| "loss/hidden": 0.1181640625, |
| "loss/logits": 0.05508654564619064, |
| "loss/reg": 17895768.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.004375, |
| "grad_norm": 1.5095555782318115, |
| "grad_norm_var": 0.013747278412734865, |
| "learning_rate": 0.0001, |
| "loss": 17841828.0, |
| "loss/crossentropy": 2.9633593559265137, |
| "loss/hidden": 0.126953125, |
| "loss/logits": 0.060760144144296646, |
| "loss/reg": 17841826.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.0044, |
| "grad_norm": 1.2842398881912231, |
| "grad_norm_var": 0.01380388929903802, |
| "learning_rate": 0.0001, |
| "loss": 17786558.0, |
| "loss/crossentropy": 2.632108688354492, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.05042888596653938, |
| "loss/reg": 17786556.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.004425, |
| "grad_norm": 1.378301978111267, |
| "grad_norm_var": 0.013399412076441722, |
| "learning_rate": 0.0001, |
| "loss": 17732774.0, |
| "loss/crossentropy": 2.973829984664917, |
| "loss/hidden": 0.111328125, |
| "loss/logits": 0.05111613869667053, |
| "loss/reg": 17732772.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.00445, |
| "grad_norm": 1.690528154373169, |
| "grad_norm_var": 0.02017252483054405, |
| "learning_rate": 0.0001, |
| "loss": 17679170.0, |
| "loss/crossentropy": 2.578587532043457, |
| "loss/hidden": 0.126953125, |
| "loss/logits": 0.06507566571235657, |
| "loss/reg": 17679168.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.004475, |
| "grad_norm": 1.2591404914855957, |
| "grad_norm_var": 0.019127973174063822, |
| "learning_rate": 0.0001, |
| "loss": 17625080.0, |
| "loss/crossentropy": 2.7367069721221924, |
| "loss/hidden": 0.11767578125, |
| "loss/logits": 0.0503883957862854, |
| "loss/reg": 17625078.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 1.3769865036010742, |
| "grad_norm_var": 0.018850205552983113, |
| "learning_rate": 0.0001, |
| "loss": 17571554.0, |
| "loss/crossentropy": 2.7446157932281494, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.07044874131679535, |
| "loss/reg": 17571552.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.004525, |
| "grad_norm": 1.142971158027649, |
| "grad_norm_var": 0.022192098571084708, |
| "learning_rate": 0.0001, |
| "loss": 17518048.0, |
| "loss/crossentropy": 2.6257646083831787, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.04560330510139465, |
| "loss/reg": 17518046.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.00455, |
| "grad_norm": 1.3016066551208496, |
| "grad_norm_var": 0.02241603312496251, |
| "learning_rate": 0.0001, |
| "loss": 17464022.0, |
| "loss/crossentropy": 2.944082260131836, |
| "loss/hidden": 0.1201171875, |
| "loss/logits": 0.054708562791347504, |
| "loss/reg": 17464020.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.004575, |
| "grad_norm": 1.2301819324493408, |
| "grad_norm_var": 0.02343245307657919, |
| "learning_rate": 0.0001, |
| "loss": 17408696.0, |
| "loss/crossentropy": 2.7266287803649902, |
| "loss/hidden": 0.11572265625, |
| "loss/logits": 0.056787606328725815, |
| "loss/reg": 17408694.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.0046, |
| "grad_norm": 1.4518113136291504, |
| "grad_norm_var": 0.02404861912121016, |
| "learning_rate": 0.0001, |
| "loss": 17354430.0, |
| "loss/crossentropy": 2.5430450439453125, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.0794079378247261, |
| "loss/reg": 17354428.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.004625, |
| "grad_norm": 1.278257966041565, |
| "grad_norm_var": 0.023304700780639998, |
| "learning_rate": 0.0001, |
| "loss": 17300120.0, |
| "loss/crossentropy": 2.6483333110809326, |
| "loss/hidden": 0.125, |
| "loss/logits": 0.05337969958782196, |
| "loss/reg": 17300118.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.00465, |
| "grad_norm": 1.182318091392517, |
| "grad_norm_var": 0.019408121528592424, |
| "learning_rate": 0.0001, |
| "loss": 17246254.0, |
| "loss/crossentropy": 2.454202651977539, |
| "loss/hidden": 0.11376953125, |
| "loss/logits": 0.05566149204969406, |
| "loss/reg": 17246252.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.004675, |
| "grad_norm": 1.2396453619003296, |
| "grad_norm_var": 0.019089339686502584, |
| "learning_rate": 0.0001, |
| "loss": 17192456.0, |
| "loss/crossentropy": 2.77221417427063, |
| "loss/hidden": 0.11572265625, |
| "loss/logits": 0.05261695012450218, |
| "loss/reg": 17192454.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0047, |
| "grad_norm": 1.2326287031173706, |
| "grad_norm_var": 0.018986192206223034, |
| "learning_rate": 0.0001, |
| "loss": 17138794.0, |
| "loss/crossentropy": 2.837191581726074, |
| "loss/hidden": 0.111328125, |
| "loss/logits": 0.04671577736735344, |
| "loss/reg": 17138792.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.004725, |
| "grad_norm": 1.3114184141159058, |
| "grad_norm_var": 0.018446054341770216, |
| "learning_rate": 0.0001, |
| "loss": 17084936.0, |
| "loss/crossentropy": 2.7469685077667236, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.05689527839422226, |
| "loss/reg": 17084934.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 1.202352523803711, |
| "grad_norm_var": 0.019368684588443822, |
| "learning_rate": 0.0001, |
| "loss": 17031480.0, |
| "loss/crossentropy": 2.839620590209961, |
| "loss/hidden": 0.11572265625, |
| "loss/logits": 0.05177593231201172, |
| "loss/reg": 17031478.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.004775, |
| "grad_norm": 1.2085026502609253, |
| "grad_norm_var": 0.017303843894135144, |
| "learning_rate": 0.0001, |
| "loss": 16977832.0, |
| "loss/crossentropy": 2.6213932037353516, |
| "loss/hidden": 0.11328125, |
| "loss/logits": 0.05087145045399666, |
| "loss/reg": 16977830.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.0048, |
| "grad_norm": 1.312438726425171, |
| "grad_norm_var": 0.017301126868439334, |
| "learning_rate": 0.0001, |
| "loss": 16924376.0, |
| "loss/crossentropy": 2.664512872695923, |
| "loss/hidden": 0.125, |
| "loss/logits": 0.05937638878822327, |
| "loss/reg": 16924374.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.004825, |
| "grad_norm": 1.2645244598388672, |
| "grad_norm_var": 0.016921480076736786, |
| "learning_rate": 0.0001, |
| "loss": 16870958.0, |
| "loss/crossentropy": 2.697314739227295, |
| "loss/hidden": 0.1181640625, |
| "loss/logits": 0.06033787503838539, |
| "loss/reg": 16870956.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.00485, |
| "grad_norm": 1.2122687101364136, |
| "grad_norm_var": 0.00585698158105572, |
| "learning_rate": 0.0001, |
| "loss": 16816176.0, |
| "loss/crossentropy": 2.724501132965088, |
| "loss/hidden": 0.11767578125, |
| "loss/logits": 0.05155929923057556, |
| "loss/reg": 16816174.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.004875, |
| "grad_norm": 1.268837809562683, |
| "grad_norm_var": 0.005857945178340114, |
| "learning_rate": 0.0001, |
| "loss": 16762343.0, |
| "loss/crossentropy": 2.6175894737243652, |
| "loss/hidden": 0.10888671875, |
| "loss/logits": 0.050487220287323, |
| "loss/reg": 16762340.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0049, |
| "grad_norm": 1.4376094341278076, |
| "grad_norm_var": 0.007004579944968666, |
| "learning_rate": 0.0001, |
| "loss": 16708609.0, |
| "loss/crossentropy": 2.911649465560913, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.0706474781036377, |
| "loss/reg": 16708606.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.004925, |
| "grad_norm": 1.2329065799713135, |
| "grad_norm_var": 0.006018797997194032, |
| "learning_rate": 0.0001, |
| "loss": 16655230.0, |
| "loss/crossentropy": 2.5988035202026367, |
| "loss/hidden": 0.125, |
| "loss/logits": 0.06360973417758942, |
| "loss/reg": 16655227.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.00495, |
| "grad_norm": 1.3495337963104248, |
| "grad_norm_var": 0.006345441717197294, |
| "learning_rate": 0.0001, |
| "loss": 16602087.0, |
| "loss/crossentropy": 2.508890151977539, |
| "loss/hidden": 0.1201171875, |
| "loss/logits": 0.05080607905983925, |
| "loss/reg": 16602084.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.004975, |
| "grad_norm": 1.319189429283142, |
| "grad_norm_var": 0.0062974004194898585, |
| "learning_rate": 0.0001, |
| "loss": 16549369.0, |
| "loss/crossentropy": 2.9947011470794678, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.06312789022922516, |
| "loss/reg": 16549366.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 5.087399482727051, |
| "grad_norm_var": 0.9149414153427258, |
| "learning_rate": 0.0001, |
| "loss": 16496365.0, |
| "loss/crossentropy": 2.5968453884124756, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.08010983467102051, |
| "loss/reg": 16496362.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.005025, |
| "grad_norm": 1.3334623575210571, |
| "grad_norm_var": 0.9134354065372629, |
| "learning_rate": 0.0001, |
| "loss": 16441998.0, |
| "loss/crossentropy": 3.0274949073791504, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.06672625243663788, |
| "loss/reg": 16441995.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.00505, |
| "grad_norm": 1.5618314743041992, |
| "grad_norm_var": 0.9057452108029392, |
| "learning_rate": 0.0001, |
| "loss": 16387711.0, |
| "loss/crossentropy": 3.1711623668670654, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.06717319786548615, |
| "loss/reg": 16387708.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.005075, |
| "grad_norm": 2.5483736991882324, |
| "grad_norm_var": 0.9610961134803181, |
| "learning_rate": 0.0001, |
| "loss": 16333579.0, |
| "loss/crossentropy": 2.680380344390869, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.06322912871837616, |
| "loss/reg": 16333576.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0051, |
| "grad_norm": 1.2988861799240112, |
| "grad_norm_var": 0.9579686015986401, |
| "learning_rate": 0.0001, |
| "loss": 16280504.0, |
| "loss/crossentropy": 2.927647590637207, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.05952024459838867, |
| "loss/reg": 16280501.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.005125, |
| "grad_norm": 2.0663909912109375, |
| "grad_norm_var": 0.9623440025545553, |
| "learning_rate": 0.0001, |
| "loss": 16227708.0, |
| "loss/crossentropy": 2.977742910385132, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.057787422090768814, |
| "loss/reg": 16227705.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.00515, |
| "grad_norm": 1.7247551679611206, |
| "grad_norm_var": 0.9468946056606867, |
| "learning_rate": 0.0001, |
| "loss": 16175112.0, |
| "loss/crossentropy": 2.61124324798584, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.06803576648235321, |
| "loss/reg": 16175109.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.005175, |
| "grad_norm": 1.4099866151809692, |
| "grad_norm_var": 0.9361828134096427, |
| "learning_rate": 0.0001, |
| "loss": 16122558.0, |
| "loss/crossentropy": 2.6480321884155273, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.06561914086341858, |
| "loss/reg": 16122555.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.0052, |
| "grad_norm": 1.29368257522583, |
| "grad_norm_var": 0.9372097199440715, |
| "learning_rate": 0.0001, |
| "loss": 16068349.0, |
| "loss/crossentropy": 2.5879201889038086, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.054698191583156586, |
| "loss/reg": 16068346.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.005225, |
| "grad_norm": 1.3361625671386719, |
| "grad_norm_var": 0.9332457675974954, |
| "learning_rate": 0.0001, |
| "loss": 16015307.0, |
| "loss/crossentropy": 2.8112809658050537, |
| "loss/hidden": 0.13671875, |
| "loss/logits": 0.061736419796943665, |
| "loss/reg": 16015304.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 1.4294019937515259, |
| "grad_norm_var": 0.9215631322410246, |
| "learning_rate": 0.0001, |
| "loss": 15962736.0, |
| "loss/crossentropy": 2.642965078353882, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.06178359314799309, |
| "loss/reg": 15962733.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.005275, |
| "grad_norm": 1.31008780002594, |
| "grad_norm_var": 0.9191267598983291, |
| "learning_rate": 0.0001, |
| "loss": 15909914.0, |
| "loss/crossentropy": 2.614102363586426, |
| "loss/hidden": 0.126953125, |
| "loss/logits": 0.06696215271949768, |
| "loss/reg": 15909911.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0053, |
| "grad_norm": 1.407581090927124, |
| "grad_norm_var": 0.9203687125975856, |
| "learning_rate": 0.0001, |
| "loss": 15857375.0, |
| "loss/crossentropy": 2.8371071815490723, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.06245514005422592, |
| "loss/reg": 15857372.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.005325, |
| "grad_norm": 1.233622670173645, |
| "grad_norm_var": 0.9203211059909447, |
| "learning_rate": 0.0001, |
| "loss": 15804969.0, |
| "loss/crossentropy": 2.7081260681152344, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.05777532234787941, |
| "loss/reg": 15804966.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.00535, |
| "grad_norm": 1.5727124214172363, |
| "grad_norm_var": 0.9120561180901433, |
| "learning_rate": 0.0001, |
| "loss": 15752524.0, |
| "loss/crossentropy": 2.6647605895996094, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.060189034789800644, |
| "loss/reg": 15752521.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.005375, |
| "grad_norm": 1.4179580211639404, |
| "grad_norm_var": 0.907047125573383, |
| "learning_rate": 0.0001, |
| "loss": 15700315.0, |
| "loss/crossentropy": 2.772528886795044, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.07045571506023407, |
| "loss/reg": 15700312.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.0054, |
| "grad_norm": 1.3946458101272583, |
| "grad_norm_var": 0.1170919037272635, |
| "learning_rate": 0.0001, |
| "loss": 15647816.0, |
| "loss/crossentropy": 2.8495371341705322, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.05053715780377388, |
| "loss/reg": 15647813.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.005425, |
| "grad_norm": 6.674625396728516, |
| "grad_norm_var": 1.7663798054851976, |
| "learning_rate": 0.0001, |
| "loss": 15595888.0, |
| "loss/crossentropy": 2.8965396881103516, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.06219835206866264, |
| "loss/reg": 15595885.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.00545, |
| "grad_norm": 1.5235565900802612, |
| "grad_norm_var": 1.7679677227890425, |
| "learning_rate": 0.0001, |
| "loss": 15544034.0, |
| "loss/crossentropy": 2.958644151687622, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.06609360873699188, |
| "loss/reg": 15544031.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.005475, |
| "grad_norm": 1.5045474767684937, |
| "grad_norm_var": 1.7392376853248037, |
| "learning_rate": 0.0001, |
| "loss": 15492036.0, |
| "loss/crossentropy": 2.811326742172241, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.05948233976960182, |
| "loss/reg": 15492033.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 1.3838304281234741, |
| "grad_norm_var": 1.7341556531409579, |
| "learning_rate": 0.0001, |
| "loss": 15438646.0, |
| "loss/crossentropy": 2.8927597999572754, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.05972151458263397, |
| "loss/reg": 15438643.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.005525, |
| "grad_norm": 1.3063217401504517, |
| "grad_norm_var": 1.7425278864985139, |
| "learning_rate": 0.0001, |
| "loss": 15386273.0, |
| "loss/crossentropy": 2.3805766105651855, |
| "loss/hidden": 0.126953125, |
| "loss/logits": 0.04874323680996895, |
| "loss/reg": 15386271.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.00555, |
| "grad_norm": 1.2621768712997437, |
| "grad_norm_var": 1.7571636051562562, |
| "learning_rate": 0.0001, |
| "loss": 15334386.0, |
| "loss/crossentropy": 2.8858351707458496, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.06341494619846344, |
| "loss/reg": 15334383.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.005575, |
| "grad_norm": 1.2603700160980225, |
| "grad_norm_var": 1.7646734092606573, |
| "learning_rate": 0.0001, |
| "loss": 15281238.0, |
| "loss/crossentropy": 2.628260374069214, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.06586393713951111, |
| "loss/reg": 15281235.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.0056, |
| "grad_norm": 1.2761486768722534, |
| "grad_norm_var": 1.7656587948421396, |
| "learning_rate": 0.0001, |
| "loss": 15229115.0, |
| "loss/crossentropy": 2.7330679893493652, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.07080426812171936, |
| "loss/reg": 15229112.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.005625, |
| "grad_norm": 1.246948003768921, |
| "grad_norm_var": 1.7705538921569173, |
| "learning_rate": 0.0001, |
| "loss": 15177146.0, |
| "loss/crossentropy": 2.811768054962158, |
| "loss/hidden": 0.13671875, |
| "loss/logits": 0.05987050384283066, |
| "loss/reg": 15177143.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.00565, |
| "grad_norm": 1.367579460144043, |
| "grad_norm_var": 1.77302564571926, |
| "learning_rate": 0.0001, |
| "loss": 15125624.0, |
| "loss/crossentropy": 2.9393317699432373, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.0645347386598587, |
| "loss/reg": 15125621.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.005675, |
| "grad_norm": 1.5183837413787842, |
| "grad_norm_var": 1.7650078348296498, |
| "learning_rate": 0.0001, |
| "loss": 15074144.0, |
| "loss/crossentropy": 3.0256075859069824, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.07241727411746979, |
| "loss/reg": 15074141.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.0057, |
| "grad_norm": 1.2703989744186401, |
| "grad_norm_var": 1.771705267911678, |
| "learning_rate": 0.0001, |
| "loss": 15022850.0, |
| "loss/crossentropy": 2.36106014251709, |
| "loss/hidden": 0.1318359375, |
| "loss/logits": 0.05644441395998001, |
| "loss/reg": 15022848.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.005725, |
| "grad_norm": 1.409714698791504, |
| "grad_norm_var": 1.7626729598809598, |
| "learning_rate": 0.0001, |
| "loss": 14970714.0, |
| "loss/crossentropy": 2.8809123039245605, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.06301996856927872, |
| "loss/reg": 14970711.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 1.2583949565887451, |
| "grad_norm_var": 1.7746796266014728, |
| "learning_rate": 0.0001, |
| "loss": 14918887.0, |
| "loss/crossentropy": 2.6043992042541504, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.05820544809103012, |
| "loss/reg": 14918884.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.005775, |
| "grad_norm": 1.2716870307922363, |
| "grad_norm_var": 1.7813658014995668, |
| "learning_rate": 0.0001, |
| "loss": 14867479.0, |
| "loss/crossentropy": 2.569526433944702, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.06259813904762268, |
| "loss/reg": 14867476.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.0058, |
| "grad_norm": 1.2171908617019653, |
| "grad_norm_var": 1.790158559658471, |
| "learning_rate": 0.0001, |
| "loss": 14816213.0, |
| "loss/crossentropy": 2.667618989944458, |
| "loss/hidden": 0.126953125, |
| "loss/logits": 0.0575670450925827, |
| "loss/reg": 14816210.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.005825, |
| "grad_norm": 1.410304307937622, |
| "grad_norm_var": 0.010830172135893908, |
| "learning_rate": 0.0001, |
| "loss": 14765031.0, |
| "loss/crossentropy": 2.635871410369873, |
| "loss/hidden": 0.13671875, |
| "loss/logits": 0.0656755268573761, |
| "loss/reg": 14765028.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.00585, |
| "grad_norm": 1.335420846939087, |
| "grad_norm_var": 0.00851244398751252, |
| "learning_rate": 0.0001, |
| "loss": 14714231.0, |
| "loss/crossentropy": 2.4875986576080322, |
| "loss/hidden": 0.126953125, |
| "loss/logits": 0.05866444483399391, |
| "loss/reg": 14714229.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.005875, |
| "grad_norm": 1.3414114713668823, |
| "grad_norm_var": 0.006405513254640916, |
| "learning_rate": 0.0001, |
| "loss": 14663123.0, |
| "loss/crossentropy": 2.6054978370666504, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.056889429688453674, |
| "loss/reg": 14663120.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0059, |
| "grad_norm": 1.5171602964401245, |
| "grad_norm_var": 0.008633209556131227, |
| "learning_rate": 0.0001, |
| "loss": 14612035.0, |
| "loss/crossentropy": 2.5035834312438965, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.0759689137339592, |
| "loss/reg": 14612032.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.005925, |
| "grad_norm": 1.2348196506500244, |
| "grad_norm_var": 0.00917229347243141, |
| "learning_rate": 0.0001, |
| "loss": 14561254.0, |
| "loss/crossentropy": 2.778177261352539, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.06729499250650406, |
| "loss/reg": 14561251.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.00595, |
| "grad_norm": 1.2025796175003052, |
| "grad_norm_var": 0.009892555749424521, |
| "learning_rate": 0.0001, |
| "loss": 14510966.0, |
| "loss/crossentropy": 2.3543541431427, |
| "loss/hidden": 0.12255859375, |
| "loss/logits": 0.04802260547876358, |
| "loss/reg": 14510964.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.005975, |
| "grad_norm": 1.2842696905136108, |
| "grad_norm_var": 0.009734550063452651, |
| "learning_rate": 0.0001, |
| "loss": 14459453.0, |
| "loss/crossentropy": 2.7179393768310547, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.059902362525463104, |
| "loss/reg": 14459450.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 1.3977264165878296, |
| "grad_norm_var": 0.009904555856142316, |
| "learning_rate": 0.0001, |
| "loss": 14409158.0, |
| "loss/crossentropy": 2.774381160736084, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.06660085171461105, |
| "loss/reg": 14409155.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.006025, |
| "grad_norm": 1.242409110069275, |
| "grad_norm_var": 0.009956256263565659, |
| "learning_rate": 0.0001, |
| "loss": 14358507.0, |
| "loss/crossentropy": 2.4397199153900146, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.052751779556274414, |
| "loss/reg": 14358505.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.00605, |
| "grad_norm": 1.370653510093689, |
| "grad_norm_var": 0.009972263753752427, |
| "learning_rate": 0.0001, |
| "loss": 14308243.0, |
| "loss/crossentropy": 2.6412172317504883, |
| "loss/hidden": 0.1328125, |
| "loss/logits": 0.05721309036016464, |
| "loss/reg": 14308240.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.006075, |
| "grad_norm": 1.1662626266479492, |
| "grad_norm_var": 0.008884484399011624, |
| "learning_rate": 0.0001, |
| "loss": 14257844.0, |
| "loss/crossentropy": 2.665437936782837, |
| "loss/hidden": 0.126953125, |
| "loss/logits": 0.05664648860692978, |
| "loss/reg": 14257841.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0061, |
| "grad_norm": 1.8995401859283447, |
| "grad_norm_var": 0.030456366114738623, |
| "learning_rate": 0.0001, |
| "loss": 14207718.0, |
| "loss/crossentropy": 2.67364501953125, |
| "loss/hidden": 0.1650390625, |
| "loss/logits": 0.07564130425453186, |
| "loss/reg": 14207715.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.006125, |
| "grad_norm": 1.334937572479248, |
| "grad_norm_var": 0.03018526073676237, |
| "learning_rate": 0.0001, |
| "loss": 14157583.0, |
| "loss/crossentropy": 2.4179587364196777, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.056714944541454315, |
| "loss/reg": 14157581.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.00615, |
| "grad_norm": 2.0209946632385254, |
| "grad_norm_var": 0.057950556152923126, |
| "learning_rate": 0.0001, |
| "loss": 14107935.0, |
| "loss/crossentropy": 2.9014651775360107, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.07615067064762115, |
| "loss/reg": 14107932.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.006175, |
| "grad_norm": 1.383357286453247, |
| "grad_norm_var": 0.056961484836246495, |
| "learning_rate": 0.0001, |
| "loss": 14057996.0, |
| "loss/crossentropy": 2.815054416656494, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.05734875425696373, |
| "loss/reg": 14057993.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.0062, |
| "grad_norm": 1.391110897064209, |
| "grad_norm_var": 0.05467214053676338, |
| "learning_rate": 0.0001, |
| "loss": 14008416.0, |
| "loss/crossentropy": 2.761909246444702, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.061668697744607925, |
| "loss/reg": 14008413.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.006225, |
| "grad_norm": 1.3255330324172974, |
| "grad_norm_var": 0.055098733464872866, |
| "learning_rate": 0.0001, |
| "loss": 13958851.0, |
| "loss/crossentropy": 2.9398627281188965, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.06547331809997559, |
| "loss/reg": 13958848.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 1.4229283332824707, |
| "grad_norm_var": 0.05478870379254559, |
| "learning_rate": 0.0001, |
| "loss": 13908190.0, |
| "loss/crossentropy": 2.458496570587158, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.05823984369635582, |
| "loss/reg": 13908188.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.006275, |
| "grad_norm": 1.3032993078231812, |
| "grad_norm_var": 0.055220308814244605, |
| "learning_rate": 0.0001, |
| "loss": 13858638.0, |
| "loss/crossentropy": 2.803184986114502, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.054867275059223175, |
| "loss/reg": 13858635.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0063, |
| "grad_norm": 1.5352330207824707, |
| "grad_norm_var": 0.05550834707216972, |
| "learning_rate": 0.0001, |
| "loss": 13809093.0, |
| "loss/crossentropy": 2.876777172088623, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.06620798259973526, |
| "loss/reg": 13809090.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.006325, |
| "grad_norm": 1.2868770360946655, |
| "grad_norm_var": 0.05448103356127281, |
| "learning_rate": 0.0001, |
| "loss": 13759784.0, |
| "loss/crossentropy": 2.796433448791504, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.06459345668554306, |
| "loss/reg": 13759781.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.00635, |
| "grad_norm": 1.4136613607406616, |
| "grad_norm_var": 0.051414498940635885, |
| "learning_rate": 0.0001, |
| "loss": 13710695.0, |
| "loss/crossentropy": 3.004969596862793, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.06056927144527435, |
| "loss/reg": 13710692.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.006375, |
| "grad_norm": 1.3518521785736084, |
| "grad_norm_var": 0.05044378331290407, |
| "learning_rate": 0.0001, |
| "loss": 13661589.0, |
| "loss/crossentropy": 2.7409610748291016, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.05814550817012787, |
| "loss/reg": 13661586.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 1.1929123401641846, |
| "grad_norm_var": 0.05388954025350093, |
| "learning_rate": 0.0001, |
| "loss": 13612754.0, |
| "loss/crossentropy": 2.747603416442871, |
| "loss/hidden": 0.1318359375, |
| "loss/logits": 0.05141870677471161, |
| "loss/reg": 13612751.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.006425, |
| "grad_norm": 1.3743230104446411, |
| "grad_norm_var": 0.05193978415803689, |
| "learning_rate": 0.0001, |
| "loss": 13563942.0, |
| "loss/crossentropy": 2.7724549770355225, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.05072151497006416, |
| "loss/reg": 13563939.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.00645, |
| "grad_norm": 1.2566577196121216, |
| "grad_norm_var": 0.053552813791624014, |
| "learning_rate": 0.0001, |
| "loss": 13515261.0, |
| "loss/crossentropy": 2.636183500289917, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.06476832181215286, |
| "loss/reg": 13515258.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.006475, |
| "grad_norm": 1.3123027086257935, |
| "grad_norm_var": 0.050018670666301016, |
| "learning_rate": 0.0001, |
| "loss": 13466797.0, |
| "loss/crossentropy": 2.955885410308838, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.059732139110565186, |
| "loss/reg": 13466794.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 1.560585856437683, |
| "grad_norm_var": 0.03576856501614785, |
| "learning_rate": 0.0001, |
| "loss": 13418454.0, |
| "loss/crossentropy": 2.534496545791626, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.08581315726041794, |
| "loss/reg": 13418451.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.006525, |
| "grad_norm": 1.2515180110931396, |
| "grad_norm_var": 0.03697342980280212, |
| "learning_rate": 0.0001, |
| "loss": 13370145.0, |
| "loss/crossentropy": 2.733609437942505, |
| "loss/hidden": 0.134765625, |
| "loss/logits": 0.054118528962135315, |
| "loss/reg": 13370142.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.00655, |
| "grad_norm": 1.178729772567749, |
| "grad_norm_var": 0.011454339563150739, |
| "learning_rate": 0.0001, |
| "loss": 13321868.0, |
| "loss/crossentropy": 2.6014256477355957, |
| "loss/hidden": 0.12890625, |
| "loss/logits": 0.05922620743513107, |
| "loss/reg": 13321865.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.006575, |
| "grad_norm": 1.3556615114212036, |
| "grad_norm_var": 0.011365455420405545, |
| "learning_rate": 0.0001, |
| "loss": 13273696.0, |
| "loss/crossentropy": 2.8918492794036865, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.06422768533229828, |
| "loss/reg": 13273693.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.0066, |
| "grad_norm": 1.3636914491653442, |
| "grad_norm_var": 0.01124230956236237, |
| "learning_rate": 0.0001, |
| "loss": 13225507.0, |
| "loss/crossentropy": 2.80137038230896, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.0683981329202652, |
| "loss/reg": 13225504.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.006625, |
| "grad_norm": 1.5487805604934692, |
| "grad_norm_var": 0.01384150300905252, |
| "learning_rate": 0.0001, |
| "loss": 13177538.0, |
| "loss/crossentropy": 2.9500300884246826, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.06856787204742432, |
| "loss/reg": 13177535.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.00665, |
| "grad_norm": 1.3983412981033325, |
| "grad_norm_var": 0.013662542915044599, |
| "learning_rate": 0.0001, |
| "loss": 13129641.0, |
| "loss/crossentropy": 2.6822879314422607, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.0812714546918869, |
| "loss/reg": 13129638.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.006675, |
| "grad_norm": 1.3333851099014282, |
| "grad_norm_var": 0.013510610942991679, |
| "learning_rate": 0.0001, |
| "loss": 13081750.0, |
| "loss/crossentropy": 2.658419370651245, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.0646916851401329, |
| "loss/reg": 13081747.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0067, |
| "grad_norm": 1.5055360794067383, |
| "grad_norm_var": 0.012860622027157771, |
| "learning_rate": 0.0001, |
| "loss": 13034133.0, |
| "loss/crossentropy": 2.8443102836608887, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.07461690157651901, |
| "loss/reg": 13034130.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.006725, |
| "grad_norm": 1.323675274848938, |
| "grad_norm_var": 0.012609536410041677, |
| "learning_rate": 0.0001, |
| "loss": 12985076.0, |
| "loss/crossentropy": 2.678550958633423, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.061058469116687775, |
| "loss/reg": 12985073.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 1.6064231395721436, |
| "grad_norm_var": 0.01637269751022714, |
| "learning_rate": 0.0001, |
| "loss": 12937573.0, |
| "loss/crossentropy": 2.397972345352173, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.07134551554918289, |
| "loss/reg": 12937571.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.006775, |
| "grad_norm": 1.2385890483856201, |
| "grad_norm_var": 0.017443236680171736, |
| "learning_rate": 0.0001, |
| "loss": 12888905.0, |
| "loss/crossentropy": 2.6373982429504395, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.0653548389673233, |
| "loss/reg": 12888902.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.0068, |
| "grad_norm": 1.247679352760315, |
| "grad_norm_var": 0.01639181827019905, |
| "learning_rate": 0.0001, |
| "loss": 12841632.0, |
| "loss/crossentropy": 2.9294281005859375, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.06420879811048508, |
| "loss/reg": 12841629.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.006825, |
| "grad_norm": 1.2457160949707031, |
| "grad_norm_var": 0.017282703668287998, |
| "learning_rate": 0.0001, |
| "loss": 12793025.0, |
| "loss/crossentropy": 2.591928720474243, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.06234884262084961, |
| "loss/reg": 12793022.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.00685, |
| "grad_norm": 1.4415357112884521, |
| "grad_norm_var": 0.01692193809214325, |
| "learning_rate": 0.0001, |
| "loss": 12745643.0, |
| "loss/crossentropy": 2.8222391605377197, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.06979811191558838, |
| "loss/reg": 12745640.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.006875, |
| "grad_norm": 1.3460314273834229, |
| "grad_norm_var": 0.016735771796449465, |
| "learning_rate": 0.0001, |
| "loss": 12698444.0, |
| "loss/crossentropy": 2.6770355701446533, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.0641147643327713, |
| "loss/reg": 12698441.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0069, |
| "grad_norm": 1.37147855758667, |
| "grad_norm_var": 0.014206163414568056, |
| "learning_rate": 0.0001, |
| "loss": 12651290.0, |
| "loss/crossentropy": 2.907212018966675, |
| "loss/hidden": 0.150390625, |
| "loss/logits": 0.0651848316192627, |
| "loss/reg": 12651287.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.006925, |
| "grad_norm": 1.2531616687774658, |
| "grad_norm_var": 0.014182602173533058, |
| "learning_rate": 0.0001, |
| "loss": 12604421.0, |
| "loss/crossentropy": 2.603314161300659, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.061645396053791046, |
| "loss/reg": 12604418.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.00695, |
| "grad_norm": 1.2453293800354004, |
| "grad_norm_var": 0.012851030356757024, |
| "learning_rate": 0.0001, |
| "loss": 12557653.0, |
| "loss/crossentropy": 2.649489164352417, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.07151087373495102, |
| "loss/reg": 12557650.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.006975, |
| "grad_norm": 1.3640412092208862, |
| "grad_norm_var": 0.012846031605303131, |
| "learning_rate": 0.0001, |
| "loss": 12511062.0, |
| "loss/crossentropy": 2.8236021995544434, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.0749356746673584, |
| "loss/reg": 12511059.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 1.4946240186691284, |
| "grad_norm_var": 0.013901852310667984, |
| "learning_rate": 0.0001, |
| "loss": 12464600.0, |
| "loss/crossentropy": 2.7980360984802246, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.0737333670258522, |
| "loss/reg": 12464597.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.007025, |
| "grad_norm": 1.1885918378829956, |
| "grad_norm_var": 0.013557435000826424, |
| "learning_rate": 0.0001, |
| "loss": 12418278.0, |
| "loss/crossentropy": 2.5716607570648193, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.0630015879869461, |
| "loss/reg": 12418275.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.00705, |
| "grad_norm": 1.2317140102386475, |
| "grad_norm_var": 0.014224476107298756, |
| "learning_rate": 0.0001, |
| "loss": 12371951.0, |
| "loss/crossentropy": 2.624586820602417, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.06658768653869629, |
| "loss/reg": 12371948.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.007075, |
| "grad_norm": 1.2487077713012695, |
| "grad_norm_var": 0.01474554530936505, |
| "learning_rate": 0.0001, |
| "loss": 12325238.0, |
| "loss/crossentropy": 2.5505199432373047, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.05641041696071625, |
| "loss/reg": 12325235.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0071, |
| "grad_norm": 1.246119499206543, |
| "grad_norm_var": 0.013037463668912074, |
| "learning_rate": 0.0001, |
| "loss": 12278784.0, |
| "loss/crossentropy": 2.8936548233032227, |
| "loss/hidden": 0.150390625, |
| "loss/logits": 0.06394726037979126, |
| "loss/reg": 12278781.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.007125, |
| "grad_norm": 1.3816927671432495, |
| "grad_norm_var": 0.013289123045005565, |
| "learning_rate": 0.0001, |
| "loss": 12232824.0, |
| "loss/crossentropy": 2.8195226192474365, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.06303142011165619, |
| "loss/reg": 12232821.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.00715, |
| "grad_norm": 1.4366105794906616, |
| "grad_norm_var": 0.00865077711116342, |
| "learning_rate": 0.0001, |
| "loss": 12186775.0, |
| "loss/crossentropy": 2.888385057449341, |
| "loss/hidden": 0.16015625, |
| "loss/logits": 0.07798555493354797, |
| "loss/reg": 12186772.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.007175, |
| "grad_norm": 1.3869065046310425, |
| "grad_norm_var": 0.008586732103005978, |
| "learning_rate": 0.0001, |
| "loss": 12141003.0, |
| "loss/crossentropy": 2.5025510787963867, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.060528963804244995, |
| "loss/reg": 12141000.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.0072, |
| "grad_norm": 1.2768436670303345, |
| "grad_norm_var": 0.008356251779483573, |
| "learning_rate": 0.0001, |
| "loss": 12095420.0, |
| "loss/crossentropy": 2.7354958057403564, |
| "loss/hidden": 0.150390625, |
| "loss/logits": 0.06157786399126053, |
| "loss/reg": 12095417.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.007225, |
| "grad_norm": 1.5967066287994385, |
| "grad_norm_var": 0.012465127782396657, |
| "learning_rate": 0.0001, |
| "loss": 12049617.0, |
| "loss/crossentropy": 2.6356041431427, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.06536837667226791, |
| "loss/reg": 12049614.0, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 2.3392114639282227, |
| "grad_norm_var": 0.07445745042612652, |
| "learning_rate": 0.0001, |
| "loss": 12003760.0, |
| "loss/crossentropy": 2.8234245777130127, |
| "loss/hidden": 0.16015625, |
| "loss/logits": 0.06355369091033936, |
| "loss/reg": 12003757.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.007275, |
| "grad_norm": 1.4052925109863281, |
| "grad_norm_var": 0.0742466735540986, |
| "learning_rate": 0.0001, |
| "loss": 11957367.0, |
| "loss/crossentropy": 2.770820379257202, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.06595819443464279, |
| "loss/reg": 11957364.0, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0073, |
| "grad_norm": 1.5095019340515137, |
| "grad_norm_var": 0.07483534345019074, |
| "learning_rate": 0.0001, |
| "loss": 11911583.0, |
| "loss/crossentropy": 2.622516632080078, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.061189621686935425, |
| "loss/reg": 11911580.0, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.007325, |
| "grad_norm": 1.5426888465881348, |
| "grad_norm_var": 0.0739112332161359, |
| "learning_rate": 0.0001, |
| "loss": 11866278.0, |
| "loss/crossentropy": 2.597724199295044, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.08662068843841553, |
| "loss/reg": 11866275.0, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.00735, |
| "grad_norm": 1.3020626306533813, |
| "grad_norm_var": 0.07270857663244973, |
| "learning_rate": 0.0001, |
| "loss": 11820649.0, |
| "loss/crossentropy": 2.547603130340576, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.06915931403636932, |
| "loss/reg": 11820646.0, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.007375, |
| "grad_norm": 1.222300410270691, |
| "grad_norm_var": 0.07529500665479721, |
| "learning_rate": 0.0001, |
| "loss": 11775242.0, |
| "loss/crossentropy": 2.5956859588623047, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.07406603544950485, |
| "loss/reg": 11775239.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.0074, |
| "grad_norm": 1.4893590211868286, |
| "grad_norm_var": 0.0752482832359417, |
| "learning_rate": 0.0001, |
| "loss": 11730212.0, |
| "loss/crossentropy": 2.9076459407806396, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.062755286693573, |
| "loss/reg": 11730209.0, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.007425, |
| "grad_norm": 1.3485422134399414, |
| "grad_norm_var": 0.07179973599680659, |
| "learning_rate": 0.0001, |
| "loss": 11684183.0, |
| "loss/crossentropy": 2.72701358795166, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.05923382192850113, |
| "loss/reg": 11684180.0, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.00745, |
| "grad_norm": 1.3736414909362793, |
| "grad_norm_var": 0.0692067443544459, |
| "learning_rate": 0.0001, |
| "loss": 11639334.0, |
| "loss/crossentropy": 2.9518539905548096, |
| "loss/hidden": 0.1650390625, |
| "loss/logits": 0.06741394102573395, |
| "loss/reg": 11639331.0, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.007475, |
| "grad_norm": 1.6101781129837036, |
| "grad_norm_var": 0.06795414107035427, |
| "learning_rate": 0.0001, |
| "loss": 11594727.0, |
| "loss/crossentropy": 2.747162103652954, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.06554447114467621, |
| "loss/reg": 11594724.0, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 1.4444247484207153, |
| "grad_norm_var": 0.06457889165781641, |
| "learning_rate": 0.0001, |
| "loss": 11550294.0, |
| "loss/crossentropy": 2.7491886615753174, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.07129132002592087, |
| "loss/reg": 11550291.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.007525, |
| "grad_norm": 1.2323545217514038, |
| "grad_norm_var": 0.06791276356974355, |
| "learning_rate": 0.0001, |
| "loss": 11505964.0, |
| "loss/crossentropy": 2.6432175636291504, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.07246769219636917, |
| "loss/reg": 11505961.0, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.00755, |
| "grad_norm": 1.1970010995864868, |
| "grad_norm_var": 0.07256104194224325, |
| "learning_rate": 0.0001, |
| "loss": 11461953.0, |
| "loss/crossentropy": 2.41015887260437, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.05817747861146927, |
| "loss/reg": 11461951.0, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.007575, |
| "grad_norm": 1.467490553855896, |
| "grad_norm_var": 0.07223727446386707, |
| "learning_rate": 0.0001, |
| "loss": 11418255.0, |
| "loss/crossentropy": 2.652329683303833, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.09269724786281586, |
| "loss/reg": 11418252.0, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.0076, |
| "grad_norm": 1.3236180543899536, |
| "grad_norm_var": 0.071232680179969, |
| "learning_rate": 0.0001, |
| "loss": 11374562.0, |
| "loss/crossentropy": 2.699312448501587, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.07041790336370468, |
| "loss/reg": 11374559.0, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.007625, |
| "grad_norm": 1.314737319946289, |
| "grad_norm_var": 0.07116650667664613, |
| "learning_rate": 0.0001, |
| "loss": 11330972.0, |
| "loss/crossentropy": 2.7268805503845215, |
| "loss/hidden": 0.16015625, |
| "loss/logits": 0.06517157703638077, |
| "loss/reg": 11330969.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.00765, |
| "grad_norm": 1.2834835052490234, |
| "grad_norm_var": 0.014975220680485238, |
| "learning_rate": 0.0001, |
| "loss": 11287487.0, |
| "loss/crossentropy": 2.823366403579712, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.07153861224651337, |
| "loss/reg": 11287484.0, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.007675, |
| "grad_norm": 1.2672278881072998, |
| "grad_norm_var": 0.015685656899695685, |
| "learning_rate": 0.0001, |
| "loss": 11243797.0, |
| "loss/crossentropy": 2.671966552734375, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.05726707726716995, |
| "loss/reg": 11243794.0, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.0077, |
| "grad_norm": 1.3546096086502075, |
| "grad_norm_var": 0.014315213293349179, |
| "learning_rate": 0.0001, |
| "loss": 11200662.0, |
| "loss/crossentropy": 2.774587631225586, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.07468734681606293, |
| "loss/reg": 11200659.0, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.007725, |
| "grad_norm": 1.2348955869674683, |
| "grad_norm_var": 0.012774061477813762, |
| "learning_rate": 0.0001, |
| "loss": 11157187.0, |
| "loss/crossentropy": 2.704190731048584, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.06237147003412247, |
| "loss/reg": 11157184.0, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 1.2152798175811768, |
| "grad_norm_var": 0.013702489883450871, |
| "learning_rate": 0.0001, |
| "loss": 11114156.0, |
| "loss/crossentropy": 2.44256591796875, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.0657656192779541, |
| "loss/reg": 11114154.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.007775, |
| "grad_norm": 1.467839002609253, |
| "grad_norm_var": 0.013741780260100178, |
| "learning_rate": 0.0001, |
| "loss": 11071337.0, |
| "loss/crossentropy": 2.494431257247925, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.060574792325496674, |
| "loss/reg": 11071335.0, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.0078, |
| "grad_norm": 4.799531936645508, |
| "grad_norm_var": 0.7593957077764041, |
| "learning_rate": 0.0001, |
| "loss": 11028225.0, |
| "loss/crossentropy": 2.7325127124786377, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.10913005471229553, |
| "loss/reg": 11028222.0, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.007825, |
| "grad_norm": 1.5475250482559204, |
| "grad_norm_var": 0.7563018417659354, |
| "learning_rate": 0.0001, |
| "loss": 10985258.0, |
| "loss/crossentropy": 2.623095750808716, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.06868870556354523, |
| "loss/reg": 10985255.0, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.00785, |
| "grad_norm": 1.5860449075698853, |
| "grad_norm_var": 0.7535360858724476, |
| "learning_rate": 0.0001, |
| "loss": 10942147.0, |
| "loss/crossentropy": 2.978168249130249, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.08529134094715118, |
| "loss/reg": 10942144.0, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.007875, |
| "grad_norm": 1.559672474861145, |
| "grad_norm_var": 0.7535201699426369, |
| "learning_rate": 0.0001, |
| "loss": 10899102.0, |
| "loss/crossentropy": 2.6341423988342285, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.07178042829036713, |
| "loss/reg": 10899099.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.0079, |
| "grad_norm": 1.6394689083099365, |
| "grad_norm_var": 0.7523464772438392, |
| "learning_rate": 0.0001, |
| "loss": 10856200.0, |
| "loss/crossentropy": 2.9553263187408447, |
| "loss/hidden": 0.171875, |
| "loss/logits": 0.07007050514221191, |
| "loss/reg": 10856197.0, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.007925, |
| "grad_norm": 1.447920560836792, |
| "grad_norm_var": 0.7448800554003959, |
| "learning_rate": 0.0001, |
| "loss": 10813541.0, |
| "loss/crossentropy": 2.946011781692505, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.07400735467672348, |
| "loss/reg": 10813538.0, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.00795, |
| "grad_norm": 1.0697530508041382, |
| "grad_norm_var": 0.7528422723171038, |
| "learning_rate": 0.0001, |
| "loss": 10770241.0, |
| "loss/crossentropy": 2.3608431816101074, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.06279103457927704, |
| "loss/reg": 10770239.0, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.007975, |
| "grad_norm": 1.2852470874786377, |
| "grad_norm_var": 0.7581061855543741, |
| "learning_rate": 0.0001, |
| "loss": 10727912.0, |
| "loss/crossentropy": 2.5171866416931152, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.06151127815246582, |
| "loss/reg": 10727909.0, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 1.2004117965698242, |
| "grad_norm_var": 0.7633866135850361, |
| "learning_rate": 0.0001, |
| "loss": 10685688.0, |
| "loss/crossentropy": 2.453835964202881, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.0553789883852005, |
| "loss/reg": 10685686.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.008025, |
| "grad_norm": 1.3304163217544556, |
| "grad_norm_var": 0.762848267366391, |
| "learning_rate": 0.0001, |
| "loss": 10643868.0, |
| "loss/crossentropy": 2.4853131771087646, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.060194939374923706, |
| "loss/reg": 10643866.0, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.00805, |
| "grad_norm": 1.3564056158065796, |
| "grad_norm_var": 0.7602919368464993, |
| "learning_rate": 0.0001, |
| "loss": 10602117.0, |
| "loss/crossentropy": 2.878424882888794, |
| "loss/hidden": 0.1650390625, |
| "loss/logits": 0.08001019805669785, |
| "loss/reg": 10602114.0, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.008075, |
| "grad_norm": 1.5387823581695557, |
| "grad_norm_var": 0.7533900521384792, |
| "learning_rate": 0.0001, |
| "loss": 10560412.0, |
| "loss/crossentropy": 2.9458792209625244, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.07027631998062134, |
| "loss/reg": 10560409.0, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.0081, |
| "grad_norm": 1.539499282836914, |
| "grad_norm_var": 0.749425127298521, |
| "learning_rate": 0.0001, |
| "loss": 10518908.0, |
| "loss/crossentropy": 3.2355947494506836, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.08225306868553162, |
| "loss/reg": 10518905.0, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.008125, |
| "grad_norm": 1.4031704664230347, |
| "grad_norm_var": 0.7426965121305542, |
| "learning_rate": 0.0001, |
| "loss": 10477454.0, |
| "loss/crossentropy": 2.7750797271728516, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.07553953677415848, |
| "loss/reg": 10477451.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.00815, |
| "grad_norm": 1.2704875469207764, |
| "grad_norm_var": 0.7398770379418408, |
| "learning_rate": 0.0001, |
| "loss": 10436184.0, |
| "loss/crossentropy": 2.783022165298462, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.06672142446041107, |
| "loss/reg": 10436181.0, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.008175, |
| "grad_norm": 1.1805408000946045, |
| "grad_norm_var": 0.751157056906436, |
| "learning_rate": 0.0001, |
| "loss": 10395058.0, |
| "loss/crossentropy": 2.4957735538482666, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.06673818826675415, |
| "loss/reg": 10395056.0, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.0082, |
| "grad_norm": 1.2916992902755737, |
| "grad_norm_var": 0.028283719007375415, |
| "learning_rate": 0.0001, |
| "loss": 10353862.0, |
| "loss/crossentropy": 2.9765448570251465, |
| "loss/hidden": 0.1611328125, |
| "loss/logits": 0.07167674601078033, |
| "loss/reg": 10353859.0, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.008225, |
| "grad_norm": 1.4934465885162354, |
| "grad_norm_var": 0.02733384582056472, |
| "learning_rate": 0.0001, |
| "loss": 10312590.0, |
| "loss/crossentropy": 2.790802478790283, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.07411827147006989, |
| "loss/reg": 10312587.0, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 1.258028268814087, |
| "grad_norm_var": 0.02535583257836317, |
| "learning_rate": 0.0001, |
| "loss": 10271773.0, |
| "loss/crossentropy": 2.9188411235809326, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.0773385763168335, |
| "loss/reg": 10271770.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.008275, |
| "grad_norm": 1.4528026580810547, |
| "grad_norm_var": 0.02331792709637733, |
| "learning_rate": 0.0001, |
| "loss": 10231186.0, |
| "loss/crossentropy": 2.6370275020599365, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.062418196350336075, |
| "loss/reg": 10231183.0, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0083, |
| "grad_norm": 1.2306187152862549, |
| "grad_norm_var": 0.018524003616722892, |
| "learning_rate": 0.0001, |
| "loss": 10190540.0, |
| "loss/crossentropy": 2.61413311958313, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.06008683890104294, |
| "loss/reg": 10190537.0, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.008325, |
| "grad_norm": 1.5792001485824585, |
| "grad_norm_var": 0.021589486437513253, |
| "learning_rate": 0.0001, |
| "loss": 10150061.0, |
| "loss/crossentropy": 2.899104118347168, |
| "loss/hidden": 0.171875, |
| "loss/logits": 0.07638096809387207, |
| "loss/reg": 10150058.0, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.00835, |
| "grad_norm": 1.3788844347000122, |
| "grad_norm_var": 0.01631885867243265, |
| "learning_rate": 0.0001, |
| "loss": 10109258.0, |
| "loss/crossentropy": 2.6897454261779785, |
| "loss/hidden": 0.1650390625, |
| "loss/logits": 0.0693337470293045, |
| "loss/reg": 10109255.0, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.008375, |
| "grad_norm": 1.7988924980163574, |
| "grad_norm_var": 0.027561925774394307, |
| "learning_rate": 0.0001, |
| "loss": 10068907.0, |
| "loss/crossentropy": 2.754288911819458, |
| "loss/hidden": 0.1865234375, |
| "loss/logits": 0.07164829969406128, |
| "loss/reg": 10068904.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.0084, |
| "grad_norm": 1.3495973348617554, |
| "grad_norm_var": 0.025103091369277724, |
| "learning_rate": 0.0001, |
| "loss": 10028749.0, |
| "loss/crossentropy": 2.725576639175415, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.07103095948696136, |
| "loss/reg": 10028746.0, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.008425, |
| "grad_norm": 1.410449504852295, |
| "grad_norm_var": 0.024725893438143014, |
| "learning_rate": 0.0001, |
| "loss": 9988687.0, |
| "loss/crossentropy": 2.6426005363464355, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.07541201263666153, |
| "loss/reg": 9988684.0, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.00845, |
| "grad_norm": 1.5643993616104126, |
| "grad_norm_var": 0.02599108028098437, |
| "learning_rate": 0.0001, |
| "loss": 9948401.0, |
| "loss/crossentropy": 2.8818442821502686, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.06993991136550903, |
| "loss/reg": 9948398.0, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.008475, |
| "grad_norm": 1.3496482372283936, |
| "grad_norm_var": 0.02526368216931374, |
| "learning_rate": 0.0001, |
| "loss": 9908155.0, |
| "loss/crossentropy": 2.8442389965057373, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.0711495652794838, |
| "loss/reg": 9908152.0, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 1.764139175415039, |
| "grad_norm_var": 0.032312549873250634, |
| "learning_rate": 0.0001, |
| "loss": 9868146.0, |
| "loss/crossentropy": 2.986398220062256, |
| "loss/hidden": 0.181640625, |
| "loss/logits": 0.08195482194423676, |
| "loss/reg": 9868143.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.008525, |
| "grad_norm": 1.231662392616272, |
| "grad_norm_var": 0.03461588632583433, |
| "learning_rate": 0.0001, |
| "loss": 9827914.0, |
| "loss/crossentropy": 2.597038745880127, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.059737734496593475, |
| "loss/reg": 9827911.0, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.00855, |
| "grad_norm": 1.28449547290802, |
| "grad_norm_var": 0.03436238526742977, |
| "learning_rate": 0.0001, |
| "loss": 9788249.0, |
| "loss/crossentropy": 2.788884162902832, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.06188058853149414, |
| "loss/reg": 9788246.0, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.008575, |
| "grad_norm": 1.2977092266082764, |
| "grad_norm_var": 0.031578571949970484, |
| "learning_rate": 0.0001, |
| "loss": 9748577.0, |
| "loss/crossentropy": 2.685559034347534, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.05644197016954422, |
| "loss/reg": 9748574.0, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.0086, |
| "grad_norm": 1.30158531665802, |
| "grad_norm_var": 0.03141427117849241, |
| "learning_rate": 0.0001, |
| "loss": 9709206.0, |
| "loss/crossentropy": 2.511206865310669, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.06572791188955307, |
| "loss/reg": 9709203.0, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.008625, |
| "grad_norm": 1.5106379985809326, |
| "grad_norm_var": 0.031597434429175016, |
| "learning_rate": 0.0001, |
| "loss": 9670055.0, |
| "loss/crossentropy": 2.707446575164795, |
| "loss/hidden": 0.1611328125, |
| "loss/logits": 0.0631917342543602, |
| "loss/reg": 9670052.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.00865, |
| "grad_norm": 1.240299940109253, |
| "grad_norm_var": 0.03200625868757682, |
| "learning_rate": 0.0001, |
| "loss": 9630905.0, |
| "loss/crossentropy": 2.964033603668213, |
| "loss/hidden": 0.154296875, |
| "loss/logits": 0.05968720465898514, |
| "loss/reg": 9630902.0, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.008675, |
| "grad_norm": 1.293732762336731, |
| "grad_norm_var": 0.03292515789337713, |
| "learning_rate": 0.0001, |
| "loss": 9591949.0, |
| "loss/crossentropy": 2.7571427822113037, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.06747777760028839, |
| "loss/reg": 9591946.0, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0087, |
| "grad_norm": 1.4188120365142822, |
| "grad_norm_var": 0.030596887888174013, |
| "learning_rate": 0.0001, |
| "loss": 9553180.0, |
| "loss/crossentropy": 2.637974739074707, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.06675441563129425, |
| "loss/reg": 9553177.0, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.008725, |
| "grad_norm": 1.5369672775268555, |
| "grad_norm_var": 0.02983095605712875, |
| "learning_rate": 0.0001, |
| "loss": 9513629.0, |
| "loss/crossentropy": 3.1814582347869873, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.06379696726799011, |
| "loss/reg": 9513626.0, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 2.4517834186553955, |
| "grad_norm_var": 0.09578724553663373, |
| "learning_rate": 0.0001, |
| "loss": 9473729.0, |
| "loss/crossentropy": 2.876161813735962, |
| "loss/hidden": 0.1865234375, |
| "loss/logits": 0.07904592156410217, |
| "loss/reg": 9473726.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.008775, |
| "grad_norm": 1.2984338998794556, |
| "grad_norm_var": 0.09068244886104485, |
| "learning_rate": 0.0001, |
| "loss": 9434606.0, |
| "loss/crossentropy": 2.6049013137817383, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.0777474120259285, |
| "loss/reg": 9434603.0, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.0088, |
| "grad_norm": 1.3467525243759155, |
| "grad_norm_var": 0.09072351209017923, |
| "learning_rate": 0.0001, |
| "loss": 9395830.0, |
| "loss/crossentropy": 2.937755584716797, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.0749131590127945, |
| "loss/reg": 9395827.0, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.008825, |
| "grad_norm": 1.3239003419876099, |
| "grad_norm_var": 0.0917213050697472, |
| "learning_rate": 0.0001, |
| "loss": 9356927.0, |
| "loss/crossentropy": 2.7258141040802, |
| "loss/hidden": 0.1591796875, |
| "loss/logits": 0.05831865966320038, |
| "loss/reg": 9356924.0, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.00885, |
| "grad_norm": 1.2075529098510742, |
| "grad_norm_var": 0.09428143447274563, |
| "learning_rate": 0.0001, |
| "loss": 9318495.0, |
| "loss/crossentropy": 2.5584659576416016, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.06568388640880585, |
| "loss/reg": 9318492.0, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.008875, |
| "grad_norm": 1.262475609779358, |
| "grad_norm_var": 0.09567440645214731, |
| "learning_rate": 0.0001, |
| "loss": 9279912.0, |
| "loss/crossentropy": 2.567420482635498, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.06195951998233795, |
| "loss/reg": 9279909.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0089, |
| "grad_norm": 1.3166685104370117, |
| "grad_norm_var": 0.08784644221088438, |
| "learning_rate": 0.0001, |
| "loss": 9241708.0, |
| "loss/crossentropy": 2.7136054039001465, |
| "loss/hidden": 0.1650390625, |
| "loss/logits": 0.06915077567100525, |
| "loss/reg": 9241705.0, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.008925, |
| "grad_norm": 1.3084465265274048, |
| "grad_norm_var": 0.08654047823965337, |
| "learning_rate": 0.0001, |
| "loss": 9203715.0, |
| "loss/crossentropy": 2.670734405517578, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.07514587044715881, |
| "loss/reg": 9203712.0, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.00895, |
| "grad_norm": 1.3154995441436768, |
| "grad_norm_var": 0.0861230095386943, |
| "learning_rate": 0.0001, |
| "loss": 9165827.0, |
| "loss/crossentropy": 2.532402753829956, |
| "loss/hidden": 0.1650390625, |
| "loss/logits": 0.07370884716510773, |
| "loss/reg": 9165824.0, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.008975, |
| "grad_norm": 1.1215200424194336, |
| "grad_norm_var": 0.09051207166652937, |
| "learning_rate": 0.0001, |
| "loss": 9128111.0, |
| "loss/crossentropy": 2.537588596343994, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.05851493775844574, |
| "loss/reg": 9128108.0, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 1.28538179397583, |
| "grad_norm_var": 0.0907215332557539, |
| "learning_rate": 0.0001, |
| "loss": 9090374.0, |
| "loss/crossentropy": 2.8996965885162354, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.08524011075496674, |
| "loss/reg": 9090371.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.009025, |
| "grad_norm": 1.3671303987503052, |
| "grad_norm_var": 0.08969899874427222, |
| "learning_rate": 0.0001, |
| "loss": 9052845.0, |
| "loss/crossentropy": 2.6090426445007324, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.07099277526140213, |
| "loss/reg": 9052842.0, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.00905, |
| "grad_norm": 1.4142473936080933, |
| "grad_norm_var": 0.08832778170339163, |
| "learning_rate": 0.0001, |
| "loss": 9015424.0, |
| "loss/crossentropy": 2.8942418098449707, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.06831037253141403, |
| "loss/reg": 9015421.0, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.009075, |
| "grad_norm": 1.3467235565185547, |
| "grad_norm_var": 0.08781017211369473, |
| "learning_rate": 0.0001, |
| "loss": 8977683.0, |
| "loss/crossentropy": 2.631746292114258, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.07015922665596008, |
| "loss/reg": 8977680.0, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.0091, |
| "grad_norm": 1.2741317749023438, |
| "grad_norm_var": 0.08866186281169106, |
| "learning_rate": 0.0001, |
| "loss": 8940348.0, |
| "loss/crossentropy": 2.687042713165283, |
| "loss/hidden": 0.1611328125, |
| "loss/logits": 0.06735162436962128, |
| "loss/reg": 8940345.0, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.009125, |
| "grad_norm": 1.4505982398986816, |
| "grad_norm_var": 0.08739073144151342, |
| "learning_rate": 0.0001, |
| "loss": 8903278.0, |
| "loss/crossentropy": 3.0909032821655273, |
| "loss/hidden": 0.1865234375, |
| "loss/logits": 0.0898236632347107, |
| "loss/reg": 8903275.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.00915, |
| "grad_norm": 1.3080906867980957, |
| "grad_norm_var": 0.005811198225498456, |
| "learning_rate": 0.0001, |
| "loss": 8866015.0, |
| "loss/crossentropy": 2.7376997470855713, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.06429491937160492, |
| "loss/reg": 8866012.0, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.009175, |
| "grad_norm": 1.4328027963638306, |
| "grad_norm_var": 0.006746355768591587, |
| "learning_rate": 0.0001, |
| "loss": 8829231.0, |
| "loss/crossentropy": 2.8941452503204346, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.07320351153612137, |
| "loss/reg": 8829228.0, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.0092, |
| "grad_norm": 1.188072681427002, |
| "grad_norm_var": 0.007703699139087187, |
| "learning_rate": 0.0001, |
| "loss": 8792526.0, |
| "loss/crossentropy": 2.722829818725586, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.0764017105102539, |
| "loss/reg": 8792523.0, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.009225, |
| "grad_norm": 1.2252023220062256, |
| "grad_norm_var": 0.008099373320596648, |
| "learning_rate": 0.0001, |
| "loss": 8755789.0, |
| "loss/crossentropy": 2.4984652996063232, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.07053535431623459, |
| "loss/reg": 8755787.0, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 1.3226901292800903, |
| "grad_norm_var": 0.007485145918059289, |
| "learning_rate": 0.0001, |
| "loss": 8719278.0, |
| "loss/crossentropy": 2.465644598007202, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.06345571577548981, |
| "loss/reg": 8719276.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.009275, |
| "grad_norm": 1.3341246843338013, |
| "grad_norm_var": 0.007364116187156355, |
| "learning_rate": 0.0001, |
| "loss": 8682618.0, |
| "loss/crossentropy": 2.4528372287750244, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.06416066735982895, |
| "loss/reg": 8682616.0, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.0093, |
| "grad_norm": 1.3086628913879395, |
| "grad_norm_var": 0.007364428209979223, |
| "learning_rate": 0.0001, |
| "loss": 8646319.0, |
| "loss/crossentropy": 2.6269278526306152, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.06139897555112839, |
| "loss/reg": 8646316.0, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.009325, |
| "grad_norm": 1.2240703105926514, |
| "grad_norm_var": 0.007857327806585523, |
| "learning_rate": 0.0001, |
| "loss": 8610255.0, |
| "loss/crossentropy": 2.818922519683838, |
| "loss/hidden": 0.1630859375, |
| "loss/logits": 0.059509314596652985, |
| "loss/reg": 8610252.0, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.00935, |
| "grad_norm": 1.66130530834198, |
| "grad_norm_var": 0.015703045901657477, |
| "learning_rate": 0.0001, |
| "loss": 8574172.0, |
| "loss/crossentropy": 2.747607946395874, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.07515455782413483, |
| "loss/reg": 8574169.0, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.009375, |
| "grad_norm": 1.4012459516525269, |
| "grad_norm_var": 0.012853361482359974, |
| "learning_rate": 0.0001, |
| "loss": 8538154.0, |
| "loss/crossentropy": 3.1483116149902344, |
| "loss/hidden": 0.1767578125, |
| "loss/logits": 0.06820759922266006, |
| "loss/reg": 8538151.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.0094, |
| "grad_norm": 1.407920479774475, |
| "grad_norm_var": 0.012792774197204674, |
| "learning_rate": 0.0001, |
| "loss": 8502215.0, |
| "loss/crossentropy": 2.919933795928955, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.07294712215662003, |
| "loss/reg": 8502212.0, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.009425, |
| "grad_norm": 2.2570180892944336, |
| "grad_norm_var": 0.06382208200550246, |
| "learning_rate": 0.0001, |
| "loss": 8466393.0, |
| "loss/crossentropy": 3.442944288253784, |
| "loss/hidden": 0.2001953125, |
| "loss/logits": 0.08192779868841171, |
| "loss/reg": 8466390.0, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.00945, |
| "grad_norm": 1.2385762929916382, |
| "grad_norm_var": 0.06564683958297331, |
| "learning_rate": 0.0001, |
| "loss": 8430430.0, |
| "loss/crossentropy": 2.5357556343078613, |
| "loss/hidden": 0.166015625, |
| "loss/logits": 0.05833496153354645, |
| "loss/reg": 8430427.0, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.009475, |
| "grad_norm": 1.6625579595565796, |
| "grad_norm_var": 0.06968714784525123, |
| "learning_rate": 0.0001, |
| "loss": 8394895.0, |
| "loss/crossentropy": 2.8720271587371826, |
| "loss/hidden": 0.1982421875, |
| "loss/logits": 0.07314753532409668, |
| "loss/reg": 8394892.0, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 1.3794528245925903, |
| "grad_norm_var": 0.06835215585852689, |
| "learning_rate": 0.0001, |
| "loss": 8359635.0, |
| "loss/crossentropy": 2.6433968544006348, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.057292331010103226, |
| "loss/reg": 8359632.5, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.009525, |
| "grad_norm": 1.2923458814620972, |
| "grad_norm_var": 0.06938041703969967, |
| "learning_rate": 0.0001, |
| "loss": 8324459.5, |
| "loss/crossentropy": 2.6968252658843994, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.06733741611242294, |
| "loss/reg": 8324457.0, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.00955, |
| "grad_norm": 1.5118707418441772, |
| "grad_norm_var": 0.06906398416367564, |
| "learning_rate": 0.0001, |
| "loss": 8289572.0, |
| "loss/crossentropy": 2.8361148834228516, |
| "loss/hidden": 0.1884765625, |
| "loss/logits": 0.07186661660671234, |
| "loss/reg": 8289568.5, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.009575, |
| "grad_norm": 1.2040274143218994, |
| "grad_norm_var": 0.07218846481904488, |
| "learning_rate": 0.0001, |
| "loss": 8254647.0, |
| "loss/crossentropy": 2.549699544906616, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.06833600252866745, |
| "loss/reg": 8254644.5, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 1.2194902896881104, |
| "grad_norm_var": 0.07130501502773139, |
| "learning_rate": 0.0001, |
| "loss": 8219960.0, |
| "loss/crossentropy": 2.5008010864257812, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.06810353696346283, |
| "loss/reg": 8219957.5, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.009625, |
| "grad_norm": 1.6944211721420288, |
| "grad_norm_var": 0.07314989344470361, |
| "learning_rate": 0.0001, |
| "loss": 8185385.0, |
| "loss/crossentropy": 2.962953567504883, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.1040625125169754, |
| "loss/reg": 8185381.5, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.00965, |
| "grad_norm": 1.3931043148040771, |
| "grad_norm_var": 0.07231159381160618, |
| "learning_rate": 0.0001, |
| "loss": 8150493.0, |
| "loss/crossentropy": 2.2730705738067627, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.06591656804084778, |
| "loss/reg": 8150490.5, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.009675, |
| "grad_norm": 1.247336745262146, |
| "grad_norm_var": 0.07411613868884098, |
| "learning_rate": 0.0001, |
| "loss": 8115972.5, |
| "loss/crossentropy": 2.6262216567993164, |
| "loss/hidden": 0.169921875, |
| "loss/logits": 0.07050032168626785, |
| "loss/reg": 8115970.0, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.0097, |
| "grad_norm": 1.2609894275665283, |
| "grad_norm_var": 0.07511821558432245, |
| "learning_rate": 0.0001, |
| "loss": 8081480.0, |
| "loss/crossentropy": 2.3553314208984375, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.05998353660106659, |
| "loss/reg": 8081477.5, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.009725, |
| "grad_norm": 1.3658751249313354, |
| "grad_norm_var": 0.07227376211723939, |
| "learning_rate": 0.0001, |
| "loss": 8047147.0, |
| "loss/crossentropy": 2.7511541843414307, |
| "loss/hidden": 0.177734375, |
| "loss/logits": 0.06458113342523575, |
| "loss/reg": 8047144.0, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 1.3427902460098267, |
| "grad_norm_var": 0.06963411114551304, |
| "learning_rate": 0.0001, |
| "loss": 8013014.5, |
| "loss/crossentropy": 2.741870641708374, |
| "loss/hidden": 0.181640625, |
| "loss/logits": 0.06407298147678375, |
| "loss/reg": 8013012.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.009775, |
| "grad_norm": 1.4418491125106812, |
| "grad_norm_var": 0.06958181291310073, |
| "learning_rate": 0.0001, |
| "loss": 7978738.5, |
| "loss/crossentropy": 2.737600564956665, |
| "loss/hidden": 0.1884765625, |
| "loss/logits": 0.06659260392189026, |
| "loss/reg": 7978735.5, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.0098, |
| "grad_norm": 1.4318194389343262, |
| "grad_norm_var": 0.06953926156382385, |
| "learning_rate": 0.0001, |
| "loss": 7944718.5, |
| "loss/crossentropy": 2.7584660053253174, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.08844783902168274, |
| "loss/reg": 7944715.0, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.009825, |
| "grad_norm": 1.3763504028320312, |
| "grad_norm_var": 0.021368524636284057, |
| "learning_rate": 0.0001, |
| "loss": 7910741.5, |
| "loss/crossentropy": 2.660393238067627, |
| "loss/hidden": 0.1845703125, |
| "loss/logits": 0.0740346610546112, |
| "loss/reg": 7910738.5, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.00985, |
| "grad_norm": 1.4071277379989624, |
| "grad_norm_var": 0.01998991306205061, |
| "learning_rate": 0.0001, |
| "loss": 7876987.0, |
| "loss/crossentropy": 2.875865936279297, |
| "loss/hidden": 0.19140625, |
| "loss/logits": 0.08417264372110367, |
| "loss/reg": 7876983.5, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.009875, |
| "grad_norm": 1.3935880661010742, |
| "grad_norm_var": 0.014717555533152371, |
| "learning_rate": 0.0001, |
| "loss": 7842971.0, |
| "loss/crossentropy": 2.7700774669647217, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.07605085521936417, |
| "loss/reg": 7842967.5, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.0099, |
| "grad_norm": 1.5361367464065552, |
| "grad_norm_var": 0.016393989495809967, |
| "learning_rate": 0.0001, |
| "loss": 7809434.0, |
| "loss/crossentropy": 2.6684157848358154, |
| "loss/hidden": 0.19140625, |
| "loss/logits": 0.06940533220767975, |
| "loss/reg": 7809431.0, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.009925, |
| "grad_norm": 1.607358455657959, |
| "grad_norm_var": 0.018811725564341943, |
| "learning_rate": 0.0001, |
| "loss": 7776049.0, |
| "loss/crossentropy": 3.0041184425354004, |
| "loss/hidden": 0.2021484375, |
| "loss/logits": 0.08851586282253265, |
| "loss/reg": 7776045.5, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.00995, |
| "grad_norm": 1.3532912731170654, |
| "grad_norm_var": 0.018063163846052475, |
| "learning_rate": 0.0001, |
| "loss": 7742622.5, |
| "loss/crossentropy": 2.8445279598236084, |
| "loss/hidden": 0.1845703125, |
| "loss/logits": 0.07520662248134613, |
| "loss/reg": 7742619.0, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.009975, |
| "grad_norm": 1.3147300481796265, |
| "grad_norm_var": 0.016051284081685442, |
| "learning_rate": 0.0001, |
| "loss": 7709319.0, |
| "loss/crossentropy": 2.692878007888794, |
| "loss/hidden": 0.19140625, |
| "loss/logits": 0.08449488878250122, |
| "loss/reg": 7709316.0, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 1.397094964981079, |
| "grad_norm_var": 0.013768503213804214, |
| "learning_rate": 0.0001, |
| "loss": 7676072.0, |
| "loss/crossentropy": 2.591867446899414, |
| "loss/hidden": 0.1865234375, |
| "loss/logits": 0.0794595330953598, |
| "loss/reg": 7676069.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.010025, |
| "grad_norm": 1.5196943283081055, |
| "grad_norm_var": 0.00905608507461763, |
| "learning_rate": 0.0001, |
| "loss": 7642853.0, |
| "loss/crossentropy": 2.926584482192993, |
| "loss/hidden": 0.1943359375, |
| "loss/logits": 0.07917599380016327, |
| "loss/reg": 7642849.5, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.01005, |
| "grad_norm": 1.5146349668502808, |
| "grad_norm_var": 0.009878455139768741, |
| "learning_rate": 0.0001, |
| "loss": 7609861.0, |
| "loss/crossentropy": 3.100090742111206, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.08454690873622894, |
| "loss/reg": 7609857.5, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.010075, |
| "grad_norm": 1.4379841089248657, |
| "grad_norm_var": 0.008093640045690147, |
| "learning_rate": 0.0001, |
| "loss": 7576818.0, |
| "loss/crossentropy": 2.7622506618499756, |
| "loss/hidden": 0.1943359375, |
| "loss/logits": 0.08928876370191574, |
| "loss/reg": 7576814.5, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.0101, |
| "grad_norm": 1.9222147464752197, |
| "grad_norm_var": 0.021503875158050315, |
| "learning_rate": 0.0001, |
| "loss": 7544131.0, |
| "loss/crossentropy": 2.567054033279419, |
| "loss/hidden": 0.21484375, |
| "loss/logits": 0.09651297330856323, |
| "loss/reg": 7544128.0, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.010125, |
| "grad_norm": 1.8015581369400024, |
| "grad_norm_var": 0.02789056993373317, |
| "learning_rate": 0.0001, |
| "loss": 7511763.5, |
| "loss/crossentropy": 2.899754524230957, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.08384756743907928, |
| "loss/reg": 7511760.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.01015, |
| "grad_norm": 1.8093312978744507, |
| "grad_norm_var": 0.032499525271881186, |
| "learning_rate": 0.0001, |
| "loss": 7478970.5, |
| "loss/crossentropy": 2.972369909286499, |
| "loss/hidden": 0.2001953125, |
| "loss/logits": 0.0721152126789093, |
| "loss/reg": 7478967.0, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.010175, |
| "grad_norm": 3.623868942260742, |
| "grad_norm_var": 0.30834266581300224, |
| "learning_rate": 0.0001, |
| "loss": 7446550.5, |
| "loss/crossentropy": 3.00593900680542, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.13169603049755096, |
| "loss/reg": 7446547.0, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.0102, |
| "grad_norm": 1.5351911783218384, |
| "grad_norm_var": 0.30596306164395226, |
| "learning_rate": 0.0001, |
| "loss": 7413299.5, |
| "loss/crossentropy": 2.9634902477264404, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.07906673848628998, |
| "loss/reg": 7413296.0, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.010225, |
| "grad_norm": 1.6074159145355225, |
| "grad_norm_var": 0.30058008704784905, |
| "learning_rate": 0.0001, |
| "loss": 7380455.5, |
| "loss/crossentropy": 2.6868479251861572, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.08233708888292313, |
| "loss/reg": 7380452.5, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 1.6311511993408203, |
| "grad_norm_var": 0.29575051245640976, |
| "learning_rate": 0.0001, |
| "loss": 7347736.0, |
| "loss/crossentropy": 3.0754292011260986, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.092095747590065, |
| "loss/reg": 7347732.5, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.010275, |
| "grad_norm": 1.5608453750610352, |
| "grad_norm_var": 0.2909371182015187, |
| "learning_rate": 0.0001, |
| "loss": 7315290.0, |
| "loss/crossentropy": 2.6300535202026367, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.07370250672101974, |
| "loss/reg": 7315287.0, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0103, |
| "grad_norm": 1.6363860368728638, |
| "grad_norm_var": 0.28939792061849606, |
| "learning_rate": 0.0001, |
| "loss": 7282878.0, |
| "loss/crossentropy": 2.495676040649414, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.09690120816230774, |
| "loss/reg": 7282875.0, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.010325, |
| "grad_norm": 1.519830584526062, |
| "grad_norm_var": 0.2910109679019257, |
| "learning_rate": 0.0001, |
| "loss": 7250847.0, |
| "loss/crossentropy": 2.907433271408081, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.08057542145252228, |
| "loss/reg": 7250843.5, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.01035, |
| "grad_norm": 1.4764982461929321, |
| "grad_norm_var": 0.286279296059934, |
| "learning_rate": 0.0001, |
| "loss": 7219078.0, |
| "loss/crossentropy": 2.495234966278076, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.0714755579829216, |
| "loss/reg": 7219075.0, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.010375, |
| "grad_norm": 1.4259133338928223, |
| "grad_norm_var": 0.28124002976414414, |
| "learning_rate": 0.0001, |
| "loss": 7187560.5, |
| "loss/crossentropy": 2.728975534439087, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.06841155886650085, |
| "loss/reg": 7187557.5, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.0104, |
| "grad_norm": 1.3673322200775146, |
| "grad_norm_var": 0.2825519007510555, |
| "learning_rate": 0.0001, |
| "loss": 7156008.0, |
| "loss/crossentropy": 2.73136568069458, |
| "loss/hidden": 0.189453125, |
| "loss/logits": 0.07159998267889023, |
| "loss/reg": 7156005.0, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.010425, |
| "grad_norm": 1.2539957761764526, |
| "grad_norm_var": 0.2937720860260924, |
| "learning_rate": 0.0001, |
| "loss": 7124688.5, |
| "loss/crossentropy": 2.6796677112579346, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.07542067766189575, |
| "loss/reg": 7124685.5, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.01045, |
| "grad_norm": 1.386030912399292, |
| "grad_norm_var": 0.2979029800775377, |
| "learning_rate": 0.0001, |
| "loss": 7093473.5, |
| "loss/crossentropy": 2.629286527633667, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.07673083245754242, |
| "loss/reg": 7093470.5, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.010475, |
| "grad_norm": 1.5015889406204224, |
| "grad_norm_var": 0.29604213272403945, |
| "learning_rate": 0.0001, |
| "loss": 7062196.0, |
| "loss/crossentropy": 3.169019937515259, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.08491817116737366, |
| "loss/reg": 7062192.5, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 1.3452091217041016, |
| "grad_norm_var": 0.29907746640745925, |
| "learning_rate": 0.0001, |
| "loss": 7031249.5, |
| "loss/crossentropy": 2.554494857788086, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.0879049301147461, |
| "loss/reg": 7031246.5, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.010525, |
| "grad_norm": 1.341674566268921, |
| "grad_norm_var": 0.3033173775724302, |
| "learning_rate": 0.0001, |
| "loss": 7000481.5, |
| "loss/crossentropy": 2.814030885696411, |
| "loss/hidden": 0.2001953125, |
| "loss/logits": 0.08451371639966965, |
| "loss/reg": 7000478.0, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.01055, |
| "grad_norm": 1.4075064659118652, |
| "grad_norm_var": 0.3036075256393055, |
| "learning_rate": 0.0001, |
| "loss": 6969750.0, |
| "loss/crossentropy": 2.80719256401062, |
| "loss/hidden": 0.1943359375, |
| "loss/logits": 0.07341581583023071, |
| "loss/reg": 6969746.5, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.010575, |
| "grad_norm": 2.0001425743103027, |
| "grad_norm_var": 0.030503292781990154, |
| "learning_rate": 0.0001, |
| "loss": 6939113.0, |
| "loss/crossentropy": 2.8719654083251953, |
| "loss/hidden": 0.23828125, |
| "loss/logits": 0.11069092154502869, |
| "loss/reg": 6939109.5, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.0106, |
| "grad_norm": 1.3399561643600464, |
| "grad_norm_var": 0.03196416512706909, |
| "learning_rate": 0.0001, |
| "loss": 6908035.0, |
| "loss/crossentropy": 2.5573458671569824, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.07869204133749008, |
| "loss/reg": 6908032.0, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.010625, |
| "grad_norm": 1.417223572731018, |
| "grad_norm_var": 0.031186382045615288, |
| "learning_rate": 0.0001, |
| "loss": 6877190.5, |
| "loss/crossentropy": 2.964346170425415, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.08406369388103485, |
| "loss/reg": 6877187.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.01065, |
| "grad_norm": 1.2194716930389404, |
| "grad_norm_var": 0.033246365223378344, |
| "learning_rate": 0.0001, |
| "loss": 6846738.5, |
| "loss/crossentropy": 2.6852383613586426, |
| "loss/hidden": 0.19140625, |
| "loss/logits": 0.08158925175666809, |
| "loss/reg": 6846735.5, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.010675, |
| "grad_norm": 1.5298981666564941, |
| "grad_norm_var": 0.03284874095652406, |
| "learning_rate": 0.0001, |
| "loss": 6816436.5, |
| "loss/crossentropy": 2.7284371852874756, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.08474992215633392, |
| "loss/reg": 6816433.5, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0107, |
| "grad_norm": 1.4798898696899414, |
| "grad_norm_var": 0.03044939785077266, |
| "learning_rate": 0.0001, |
| "loss": 6786142.5, |
| "loss/crossentropy": 3.054154634475708, |
| "loss/hidden": 0.1982421875, |
| "loss/logits": 0.07488109171390533, |
| "loss/reg": 6786139.0, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.010725, |
| "grad_norm": 1.3538404703140259, |
| "grad_norm_var": 0.0303661243628594, |
| "learning_rate": 0.0001, |
| "loss": 6756060.0, |
| "loss/crossentropy": 2.5270814895629883, |
| "loss/hidden": 0.1943359375, |
| "loss/logits": 0.07577107846736908, |
| "loss/reg": 6756057.0, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 1.3469491004943848, |
| "grad_norm_var": 0.030575366473054484, |
| "learning_rate": 0.0001, |
| "loss": 6726036.0, |
| "loss/crossentropy": 2.5783021450042725, |
| "loss/hidden": 0.189453125, |
| "loss/logits": 0.06793719530105591, |
| "loss/reg": 6726033.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.010775, |
| "grad_norm": 1.4875595569610596, |
| "grad_norm_var": 0.030863222004755584, |
| "learning_rate": 0.0001, |
| "loss": 6696293.0, |
| "loss/crossentropy": 2.907043218612671, |
| "loss/hidden": 0.2021484375, |
| "loss/logits": 0.07748304307460785, |
| "loss/reg": 6696289.5, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.0108, |
| "grad_norm": 1.4858613014221191, |
| "grad_norm_var": 0.03085138337899019, |
| "learning_rate": 0.0001, |
| "loss": 6666552.5, |
| "loss/crossentropy": 3.409048080444336, |
| "loss/hidden": 0.2001953125, |
| "loss/logits": 0.07302901148796082, |
| "loss/reg": 6666548.5, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.010825, |
| "grad_norm": 1.296472191810608, |
| "grad_norm_var": 0.029961398858387098, |
| "learning_rate": 0.0001, |
| "loss": 6637012.0, |
| "loss/crossentropy": 2.660501718521118, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.07810290157794952, |
| "loss/reg": 6637009.0, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.01085, |
| "grad_norm": 1.286212682723999, |
| "grad_norm_var": 0.03121862342976834, |
| "learning_rate": 0.0001, |
| "loss": 6607651.5, |
| "loss/crossentropy": 2.7550857067108154, |
| "loss/hidden": 0.1943359375, |
| "loss/logits": 0.06891431659460068, |
| "loss/reg": 6607648.0, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.010875, |
| "grad_norm": 1.3449376821517944, |
| "grad_norm_var": 0.031204156461001217, |
| "learning_rate": 0.0001, |
| "loss": 6578012.5, |
| "loss/crossentropy": 2.7083327770233154, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.07967344671487808, |
| "loss/reg": 6578009.5, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0109, |
| "grad_norm": 1.4807077646255493, |
| "grad_norm_var": 0.031042439495325643, |
| "learning_rate": 0.0001, |
| "loss": 6548893.0, |
| "loss/crossentropy": 3.01952862739563, |
| "loss/hidden": 0.1884765625, |
| "loss/logits": 0.06882057338953018, |
| "loss/reg": 6548889.5, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.010925, |
| "grad_norm": 1.3358391523361206, |
| "grad_norm_var": 0.031110289621625438, |
| "learning_rate": 0.0001, |
| "loss": 6519929.5, |
| "loss/crossentropy": 2.733430862426758, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.07696013152599335, |
| "loss/reg": 6519926.5, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.01095, |
| "grad_norm": 1.3452327251434326, |
| "grad_norm_var": 0.03150438795774158, |
| "learning_rate": 0.0001, |
| "loss": 6490663.0, |
| "loss/crossentropy": 2.8212575912475586, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.08535327017307281, |
| "loss/reg": 6490659.5, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.010975, |
| "grad_norm": 1.3636789321899414, |
| "grad_norm_var": 0.007750455242761175, |
| "learning_rate": 0.0001, |
| "loss": 6461857.0, |
| "loss/crossentropy": 2.6902356147766113, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.07107086479663849, |
| "loss/reg": 6461854.0, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 1.1974213123321533, |
| "grad_norm_var": 0.009821301094535783, |
| "learning_rate": 0.0001, |
| "loss": 6432990.5, |
| "loss/crossentropy": 2.486919403076172, |
| "loss/hidden": 0.1845703125, |
| "loss/logits": 0.06680778414011002, |
| "loss/reg": 6432987.5, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.011025, |
| "grad_norm": 1.5477304458618164, |
| "grad_norm_var": 0.011651858510242086, |
| "learning_rate": 0.0001, |
| "loss": 6404415.0, |
| "loss/crossentropy": 2.839717388153076, |
| "loss/hidden": 0.2001953125, |
| "loss/logits": 0.08287405967712402, |
| "loss/reg": 6404411.5, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.01105, |
| "grad_norm": 1.3512002229690552, |
| "grad_norm_var": 0.009893071886873628, |
| "learning_rate": 0.0001, |
| "loss": 6375996.5, |
| "loss/crossentropy": 2.7745842933654785, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.07606562972068787, |
| "loss/reg": 6375993.0, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.011075, |
| "grad_norm": 1.2025905847549438, |
| "grad_norm_var": 0.01046549950436025, |
| "learning_rate": 0.0001, |
| "loss": 6347560.0, |
| "loss/crossentropy": 2.6066038608551025, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.0659542977809906, |
| "loss/reg": 6347557.5, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.0111, |
| "grad_norm": 1.643843650817871, |
| "grad_norm_var": 0.01456675863142601, |
| "learning_rate": 0.0001, |
| "loss": 6319336.0, |
| "loss/crossentropy": 2.732024669647217, |
| "loss/hidden": 0.185546875, |
| "loss/logits": 0.06361852586269379, |
| "loss/reg": 6319333.5, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.011125, |
| "grad_norm": 1.6402738094329834, |
| "grad_norm_var": 0.018719134512084186, |
| "learning_rate": 0.0001, |
| "loss": 6290964.0, |
| "loss/crossentropy": 2.6538524627685547, |
| "loss/hidden": 0.2021484375, |
| "loss/logits": 0.07512258738279343, |
| "loss/reg": 6290961.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.01115, |
| "grad_norm": 1.5652592182159424, |
| "grad_norm_var": 0.02023275201013135, |
| "learning_rate": 0.0001, |
| "loss": 6262839.0, |
| "loss/crossentropy": 2.8579986095428467, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.083011195063591, |
| "loss/reg": 6262835.5, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.011175, |
| "grad_norm": 1.6123555898666382, |
| "grad_norm_var": 0.022481266534340173, |
| "learning_rate": 0.0001, |
| "loss": 6234838.5, |
| "loss/crossentropy": 3.005380392074585, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.08437830954790115, |
| "loss/reg": 6234835.0, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.0112, |
| "grad_norm": 1.2739237546920776, |
| "grad_norm_var": 0.023391481859241735, |
| "learning_rate": 0.0001, |
| "loss": 6206024.0, |
| "loss/crossentropy": 2.741940975189209, |
| "loss/hidden": 0.1953125, |
| "loss/logits": 0.06752828508615494, |
| "loss/reg": 6206021.0, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.011225, |
| "grad_norm": 1.522573709487915, |
| "grad_norm_var": 0.023300356789412467, |
| "learning_rate": 0.0001, |
| "loss": 6177884.0, |
| "loss/crossentropy": 2.216064214706421, |
| "loss/hidden": 0.20703125, |
| "loss/logits": 0.07461199909448624, |
| "loss/reg": 6177881.5, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 1.307084321975708, |
| "grad_norm_var": 0.022956350253921145, |
| "learning_rate": 0.0001, |
| "loss": 6149811.0, |
| "loss/crossentropy": 2.932281255722046, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.08466456830501556, |
| "loss/reg": 6149807.5, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.011275, |
| "grad_norm": 1.392687201499939, |
| "grad_norm_var": 0.022615128779616533, |
| "learning_rate": 0.0001, |
| "loss": 6122085.5, |
| "loss/crossentropy": 2.664841890335083, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.08006204664707184, |
| "loss/reg": 6122082.5, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.0113, |
| "grad_norm": 1.3573030233383179, |
| "grad_norm_var": 0.022632213880003308, |
| "learning_rate": 0.0001, |
| "loss": 6094571.5, |
| "loss/crossentropy": 2.7726354598999023, |
| "loss/hidden": 0.1962890625, |
| "loss/logits": 0.0739869773387909, |
| "loss/reg": 6094568.0, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.011325, |
| "grad_norm": 1.4230579137802124, |
| "grad_norm_var": 0.022173275657228864, |
| "learning_rate": 0.0001, |
| "loss": 6066895.0, |
| "loss/crossentropy": 2.5842363834381104, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.08860860764980316, |
| "loss/reg": 6066892.0, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.01135, |
| "grad_norm": 1.2906672954559326, |
| "grad_norm_var": 0.022915244336950913, |
| "learning_rate": 0.0001, |
| "loss": 6039671.5, |
| "loss/crossentropy": 2.7816314697265625, |
| "loss/hidden": 0.1953125, |
| "loss/logits": 0.07493859529495239, |
| "loss/reg": 6039668.0, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.011375, |
| "grad_norm": 1.311158299446106, |
| "grad_norm_var": 0.02346964023905092, |
| "learning_rate": 0.0001, |
| "loss": 6012314.5, |
| "loss/crossentropy": 2.511643171310425, |
| "loss/hidden": 0.201171875, |
| "loss/logits": 0.0692434310913086, |
| "loss/reg": 6012311.5, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.0114, |
| "grad_norm": 1.2861177921295166, |
| "grad_norm_var": 0.021388846132447032, |
| "learning_rate": 0.0001, |
| "loss": 5985016.5, |
| "loss/crossentropy": 2.6709518432617188, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.07635175436735153, |
| "loss/reg": 5985013.5, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.011425, |
| "grad_norm": 1.4028676748275757, |
| "grad_norm_var": 0.020242752830213758, |
| "learning_rate": 0.0001, |
| "loss": 5958055.5, |
| "loss/crossentropy": 2.575296640396118, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.08447666466236115, |
| "loss/reg": 5958052.5, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.01145, |
| "grad_norm": 1.2848471403121948, |
| "grad_norm_var": 0.02105082755216371, |
| "learning_rate": 0.0001, |
| "loss": 5931081.0, |
| "loss/crossentropy": 2.576475143432617, |
| "loss/hidden": 0.181640625, |
| "loss/logits": 0.06291022896766663, |
| "loss/reg": 5931078.5, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.011475, |
| "grad_norm": 1.5777517557144165, |
| "grad_norm_var": 0.019608167648357207, |
| "learning_rate": 0.0001, |
| "loss": 5904256.5, |
| "loss/crossentropy": 2.444087028503418, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.06936931610107422, |
| "loss/reg": 5904253.5, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 1.4950165748596191, |
| "grad_norm_var": 0.016763681395289075, |
| "learning_rate": 0.0001, |
| "loss": 5877437.0, |
| "loss/crossentropy": 2.7429440021514893, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.0743948221206665, |
| "loss/reg": 5877434.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.011525, |
| "grad_norm": 1.2243472337722778, |
| "grad_norm_var": 0.015439695051961877, |
| "learning_rate": 0.0001, |
| "loss": 5850814.5, |
| "loss/crossentropy": 2.636840581893921, |
| "loss/hidden": 0.1875, |
| "loss/logits": 0.06128958985209465, |
| "loss/reg": 5850812.0, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.01155, |
| "grad_norm": 1.4954829216003418, |
| "grad_norm_var": 0.014164064120929727, |
| "learning_rate": 0.0001, |
| "loss": 5824456.0, |
| "loss/crossentropy": 2.7803921699523926, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.07527358829975128, |
| "loss/reg": 5824452.5, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.011575, |
| "grad_norm": 1.3304413557052612, |
| "grad_norm_var": 0.01081377074423718, |
| "learning_rate": 0.0001, |
| "loss": 5798308.0, |
| "loss/crossentropy": 2.6561946868896484, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.06966042518615723, |
| "loss/reg": 5798305.0, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.0116, |
| "grad_norm": 1.2039903402328491, |
| "grad_norm_var": 0.012047540996890878, |
| "learning_rate": 0.0001, |
| "loss": 5772175.5, |
| "loss/crossentropy": 2.5695714950561523, |
| "loss/hidden": 0.1923828125, |
| "loss/logits": 0.06850994378328323, |
| "loss/reg": 5772172.5, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.011625, |
| "grad_norm": 1.6904760599136353, |
| "grad_norm_var": 0.01724559128236501, |
| "learning_rate": 0.0001, |
| "loss": 5746112.0, |
| "loss/crossentropy": 2.851910352706909, |
| "loss/hidden": 0.27734375, |
| "loss/logits": 0.1252196878194809, |
| "loss/reg": 5746108.5, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.01165, |
| "grad_norm": 1.4688254594802856, |
| "grad_norm_var": 0.017317176263207775, |
| "learning_rate": 0.0001, |
| "loss": 5719481.5, |
| "loss/crossentropy": 2.927516222000122, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.08722692728042603, |
| "loss/reg": 5719478.0, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.011675, |
| "grad_norm": 1.2999849319458008, |
| "grad_norm_var": 0.017817235356935097, |
| "learning_rate": 0.0001, |
| "loss": 5693368.5, |
| "loss/crossentropy": 2.497384548187256, |
| "loss/hidden": 0.1875, |
| "loss/logits": 0.06252273917198181, |
| "loss/reg": 5693365.5, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0117, |
| "grad_norm": 1.3807448148727417, |
| "grad_norm_var": 0.017768461982079427, |
| "learning_rate": 0.0001, |
| "loss": 5667136.5, |
| "loss/crossentropy": 3.02346134185791, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.07955018430948257, |
| "loss/reg": 5667133.0, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.011725, |
| "grad_norm": 1.345306158065796, |
| "grad_norm_var": 0.01775549630107752, |
| "learning_rate": 0.0001, |
| "loss": 5641151.0, |
| "loss/crossentropy": 2.5943477153778076, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.08433480560779572, |
| "loss/reg": 5641148.0, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 2.015897274017334, |
| "grad_norm_var": 0.0419411652202759, |
| "learning_rate": 0.0001, |
| "loss": 5615308.5, |
| "loss/crossentropy": 3.060128927230835, |
| "loss/hidden": 0.2265625, |
| "loss/logits": 0.09130183607339859, |
| "loss/reg": 5615305.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.011775, |
| "grad_norm": 1.947918176651001, |
| "grad_norm_var": 0.057546961634034645, |
| "learning_rate": 0.0001, |
| "loss": 5589587.5, |
| "loss/crossentropy": 2.785301923751831, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.0809088796377182, |
| "loss/reg": 5589584.0, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.0118, |
| "grad_norm": 1.4415549039840698, |
| "grad_norm_var": 0.05533670723490687, |
| "learning_rate": 0.0001, |
| "loss": 5564089.5, |
| "loss/crossentropy": 2.92744517326355, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.07750119268894196, |
| "loss/reg": 5564086.0, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.011825, |
| "grad_norm": 1.3175444602966309, |
| "grad_norm_var": 0.05661619573567069, |
| "learning_rate": 0.0001, |
| "loss": 5538495.0, |
| "loss/crossentropy": 2.6798853874206543, |
| "loss/hidden": 0.201171875, |
| "loss/logits": 0.07629255950450897, |
| "loss/reg": 5538492.0, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.01185, |
| "grad_norm": 1.7042852640151978, |
| "grad_norm_var": 0.0572565750278736, |
| "learning_rate": 0.0001, |
| "loss": 5513250.5, |
| "loss/crossentropy": 3.1486897468566895, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.0755862146615982, |
| "loss/reg": 5513247.0, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.011875, |
| "grad_norm": 1.2919703722000122, |
| "grad_norm_var": 0.059254424978476966, |
| "learning_rate": 0.0001, |
| "loss": 5487853.5, |
| "loss/crossentropy": 2.893237590789795, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.08680570125579834, |
| "loss/reg": 5487850.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.0119, |
| "grad_norm": 1.2895604372024536, |
| "grad_norm_var": 0.061436441303387095, |
| "learning_rate": 0.0001, |
| "loss": 5462747.5, |
| "loss/crossentropy": 2.5355372428894043, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.08786862343549728, |
| "loss/reg": 5462744.5, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.011925, |
| "grad_norm": 1.5746833086013794, |
| "grad_norm_var": 0.057841835926181064, |
| "learning_rate": 0.0001, |
| "loss": 5437810.0, |
| "loss/crossentropy": 2.791339159011841, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.07790270447731018, |
| "loss/reg": 5437806.5, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.01195, |
| "grad_norm": 1.5507785081863403, |
| "grad_norm_var": 0.05809240668158863, |
| "learning_rate": 0.0001, |
| "loss": 5412910.5, |
| "loss/crossentropy": 2.694282293319702, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.07798787951469421, |
| "loss/reg": 5412907.5, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.011975, |
| "grad_norm": 1.3629002571105957, |
| "grad_norm_var": 0.0574639324627024, |
| "learning_rate": 0.0001, |
| "loss": 5388070.5, |
| "loss/crossentropy": 2.692976713180542, |
| "loss/hidden": 0.201171875, |
| "loss/logits": 0.06126507371664047, |
| "loss/reg": 5388067.5, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 1.363987684249878, |
| "grad_norm_var": 0.05290054794509344, |
| "learning_rate": 0.0001, |
| "loss": 5362784.0, |
| "loss/crossentropy": 2.7948617935180664, |
| "loss/hidden": 0.1982421875, |
| "loss/logits": 0.07575780153274536, |
| "loss/reg": 5362780.5, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.012025, |
| "grad_norm": 1.7268861532211304, |
| "grad_norm_var": 0.05389401997217688, |
| "learning_rate": 0.0001, |
| "loss": 5337572.5, |
| "loss/crossentropy": 2.7440524101257324, |
| "loss/hidden": 0.2373046875, |
| "loss/logits": 0.09026629477739334, |
| "loss/reg": 5337569.5, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.01205, |
| "grad_norm": 1.3456615209579468, |
| "grad_norm_var": 0.055439060623228835, |
| "learning_rate": 0.0001, |
| "loss": 5313065.0, |
| "loss/crossentropy": 2.948178768157959, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.08057942986488342, |
| "loss/reg": 5313061.5, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.012075, |
| "grad_norm": 1.7810407876968384, |
| "grad_norm_var": 0.05723505701914462, |
| "learning_rate": 0.0001, |
| "loss": 5288676.0, |
| "loss/crossentropy": 2.70149564743042, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.07805442810058594, |
| "loss/reg": 5288673.0, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0121, |
| "grad_norm": 1.3724489212036133, |
| "grad_norm_var": 0.05740173688735926, |
| "learning_rate": 0.0001, |
| "loss": 5264018.5, |
| "loss/crossentropy": 2.858067512512207, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.07276931405067444, |
| "loss/reg": 5264015.0, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.012125, |
| "grad_norm": 1.3056849241256714, |
| "grad_norm_var": 0.0584598499760979, |
| "learning_rate": 0.0001, |
| "loss": 5239710.5, |
| "loss/crossentropy": 2.771963596343994, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.08474366366863251, |
| "loss/reg": 5239707.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.01215, |
| "grad_norm": 1.2661532163619995, |
| "grad_norm_var": 0.044474168071272344, |
| "learning_rate": 0.0001, |
| "loss": 5215522.5, |
| "loss/crossentropy": 2.6953389644622803, |
| "loss/hidden": 0.203125, |
| "loss/logits": 0.0747906044125557, |
| "loss/reg": 5215519.5, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.012175, |
| "grad_norm": 1.2468206882476807, |
| "grad_norm_var": 0.031238611502943492, |
| "learning_rate": 0.0001, |
| "loss": 5191457.0, |
| "loss/crossentropy": 2.6519415378570557, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.08477511256933212, |
| "loss/reg": 5191454.0, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.0122, |
| "grad_norm": 1.2211402654647827, |
| "grad_norm_var": 0.03404925215350492, |
| "learning_rate": 0.0001, |
| "loss": 5167444.0, |
| "loss/crossentropy": 2.783219337463379, |
| "loss/hidden": 0.1953125, |
| "loss/logits": 0.06870489567518234, |
| "loss/reg": 5167440.5, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.012225, |
| "grad_norm": 1.2826435565948486, |
| "grad_norm_var": 0.03460260370794917, |
| "learning_rate": 0.0001, |
| "loss": 5143601.5, |
| "loss/crossentropy": 2.8013036251068115, |
| "loss/hidden": 0.203125, |
| "loss/logits": 0.06853881478309631, |
| "loss/reg": 5143598.0, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 1.482037901878357, |
| "grad_norm_var": 0.029203727925079134, |
| "learning_rate": 0.0001, |
| "loss": 5119596.5, |
| "loss/crossentropy": 3.088949203491211, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.08849765360355377, |
| "loss/reg": 5119593.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.012275, |
| "grad_norm": 1.2898473739624023, |
| "grad_norm_var": 0.029235728497339627, |
| "learning_rate": 0.0001, |
| "loss": 5095813.5, |
| "loss/crossentropy": 2.8333511352539062, |
| "loss/hidden": 0.203125, |
| "loss/logits": 0.07336239516735077, |
| "loss/reg": 5095810.0, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.0123, |
| "grad_norm": 1.3572802543640137, |
| "grad_norm_var": 0.02849001486780143, |
| "learning_rate": 0.0001, |
| "loss": 5072186.5, |
| "loss/crossentropy": 2.775811195373535, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.0848051905632019, |
| "loss/reg": 5072183.0, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.012325, |
| "grad_norm": 1.4413648843765259, |
| "grad_norm_var": 0.02664016681148092, |
| "learning_rate": 0.0001, |
| "loss": 5048636.0, |
| "loss/crossentropy": 2.879716634750366, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.07454962283372879, |
| "loss/reg": 5048632.5, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.01235, |
| "grad_norm": 1.294523000717163, |
| "grad_norm_var": 0.02558554102007804, |
| "learning_rate": 0.0001, |
| "loss": 5025143.5, |
| "loss/crossentropy": 2.7503137588500977, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.07119489461183548, |
| "loss/reg": 5025140.0, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.012375, |
| "grad_norm": 1.5049114227294922, |
| "grad_norm_var": 0.026450704360854616, |
| "learning_rate": 0.0001, |
| "loss": 5001721.5, |
| "loss/crossentropy": 2.5769202709198, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.07310409843921661, |
| "loss/reg": 5001718.5, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.0124, |
| "grad_norm": 1.5371195077896118, |
| "grad_norm_var": 0.027662423794416153, |
| "learning_rate": 0.0001, |
| "loss": 4978385.5, |
| "loss/crossentropy": 3.0871026515960693, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.0779845267534256, |
| "loss/reg": 4978382.0, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.012425, |
| "grad_norm": 1.2650963068008423, |
| "grad_norm_var": 0.021077338206120486, |
| "learning_rate": 0.0001, |
| "loss": 4955120.0, |
| "loss/crossentropy": 2.5726966857910156, |
| "loss/hidden": 0.201171875, |
| "loss/logits": 0.07523669302463531, |
| "loss/reg": 4955117.0, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.01245, |
| "grad_norm": 1.3636277914047241, |
| "grad_norm_var": 0.02102816404935949, |
| "learning_rate": 0.0001, |
| "loss": 4932099.5, |
| "loss/crossentropy": 2.7579758167266846, |
| "loss/hidden": 0.1943359375, |
| "loss/logits": 0.05860140174627304, |
| "loss/reg": 4932096.0, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.012475, |
| "grad_norm": 1.6063166856765747, |
| "grad_norm_var": 0.013493942781565653, |
| "learning_rate": 0.0001, |
| "loss": 4908983.5, |
| "loss/crossentropy": 2.981480360031128, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.07961302995681763, |
| "loss/reg": 4908980.0, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 1.9693169593811035, |
| "grad_norm_var": 0.03636730041201487, |
| "learning_rate": 0.0001, |
| "loss": 4886092.0, |
| "loss/crossentropy": 3.340043067932129, |
| "loss/hidden": 0.24609375, |
| "loss/logits": 0.08841927349567413, |
| "loss/reg": 4886088.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.012525, |
| "grad_norm": 1.312299132347107, |
| "grad_norm_var": 0.036284991033671984, |
| "learning_rate": 0.0001, |
| "loss": 4863000.0, |
| "loss/crossentropy": 2.7403998374938965, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.09443940222263336, |
| "loss/reg": 4862997.0, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.01255, |
| "grad_norm": 1.2781603336334229, |
| "grad_norm_var": 0.036075667545579414, |
| "learning_rate": 0.0001, |
| "loss": 4840273.0, |
| "loss/crossentropy": 2.6407599449157715, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.07989004254341125, |
| "loss/reg": 4840270.0, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.012575, |
| "grad_norm": 6.7000412940979, |
| "grad_norm_var": 1.780914466671555, |
| "learning_rate": 0.0001, |
| "loss": 4817724.0, |
| "loss/crossentropy": 2.6567792892456055, |
| "loss/hidden": 0.271484375, |
| "loss/logits": 0.08874352276325226, |
| "loss/reg": 4817721.0, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.0126, |
| "grad_norm": 2.0581181049346924, |
| "grad_norm_var": 1.766336065982956, |
| "learning_rate": 0.0001, |
| "loss": 4794940.0, |
| "loss/crossentropy": 2.8475656509399414, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.09141740947961807, |
| "loss/reg": 4794936.5, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.012625, |
| "grad_norm": 1.3745167255401611, |
| "grad_norm_var": 1.7605699842919658, |
| "learning_rate": 0.0001, |
| "loss": 4772514.0, |
| "loss/crossentropy": 2.3714818954467773, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.07352016866207123, |
| "loss/reg": 4772511.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.01265, |
| "grad_norm": 1.4215030670166016, |
| "grad_norm_var": 1.763382827462783, |
| "learning_rate": 0.0001, |
| "loss": 4750317.5, |
| "loss/crossentropy": 2.579083204269409, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.07584069669246674, |
| "loss/reg": 4750314.5, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.012675, |
| "grad_norm": 1.4343539476394653, |
| "grad_norm_var": 1.7548898322517554, |
| "learning_rate": 0.0001, |
| "loss": 4728179.5, |
| "loss/crossentropy": 2.5515999794006348, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.06930013746023178, |
| "loss/reg": 4728176.5, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.0127, |
| "grad_norm": 1.4264485836029053, |
| "grad_norm_var": 1.7510375581354138, |
| "learning_rate": 0.0001, |
| "loss": 4705633.0, |
| "loss/crossentropy": 2.970280408859253, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.07312826812267303, |
| "loss/reg": 4705629.5, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.012725, |
| "grad_norm": 1.388215184211731, |
| "grad_norm_var": 1.7538387698315436, |
| "learning_rate": 0.0001, |
| "loss": 4683271.0, |
| "loss/crossentropy": 2.546029806137085, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.0809093713760376, |
| "loss/reg": 4683268.0, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 1.3780884742736816, |
| "grad_norm_var": 1.7485494521073068, |
| "learning_rate": 0.0001, |
| "loss": 4660980.5, |
| "loss/crossentropy": 2.769453287124634, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.0858239084482193, |
| "loss/reg": 4660977.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.012775, |
| "grad_norm": 2.558802366256714, |
| "grad_norm_var": 1.7745861029243049, |
| "learning_rate": 0.0001, |
| "loss": 4638826.5, |
| "loss/crossentropy": 2.4789929389953613, |
| "loss/hidden": 0.2890625, |
| "loss/logits": 0.1389915645122528, |
| "loss/reg": 4638823.5, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.0128, |
| "grad_norm": 1.2396628856658936, |
| "grad_norm_var": 1.7936953038209895, |
| "learning_rate": 0.0001, |
| "loss": 4616996.0, |
| "loss/crossentropy": 2.5985193252563477, |
| "loss/hidden": 0.2060546875, |
| "loss/logits": 0.06800249218940735, |
| "loss/reg": 4616993.0, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.012825, |
| "grad_norm": 1.3219521045684814, |
| "grad_norm_var": 1.7893806080842751, |
| "learning_rate": 0.0001, |
| "loss": 4595028.0, |
| "loss/crossentropy": 2.645324230194092, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.06861410290002823, |
| "loss/reg": 4595025.0, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.01285, |
| "grad_norm": 1.592073917388916, |
| "grad_norm_var": 1.7773871235847991, |
| "learning_rate": 0.0001, |
| "loss": 4573509.0, |
| "loss/crossentropy": 2.564718246459961, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.08173391222953796, |
| "loss/reg": 4573506.0, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.012875, |
| "grad_norm": 1.5042678117752075, |
| "grad_norm_var": 1.7817447545165805, |
| "learning_rate": 0.0001, |
| "loss": 4551826.5, |
| "loss/crossentropy": 2.8682336807250977, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.08383598923683167, |
| "loss/reg": 4551823.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.0129, |
| "grad_norm": 1.389059066772461, |
| "grad_norm_var": 1.79528741154141, |
| "learning_rate": 0.0001, |
| "loss": 4529849.0, |
| "loss/crossentropy": 3.0098681449890137, |
| "loss/hidden": 0.2158203125, |
| "loss/logits": 0.08088910579681396, |
| "loss/reg": 4529845.5, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.012925, |
| "grad_norm": 1.309531331062317, |
| "grad_norm_var": 1.7954811930451737, |
| "learning_rate": 0.0001, |
| "loss": 4508461.0, |
| "loss/crossentropy": 2.684901475906372, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.08869834244251251, |
| "loss/reg": 4508458.0, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.01295, |
| "grad_norm": 1.457557201385498, |
| "grad_norm_var": 1.7841511546048072, |
| "learning_rate": 0.0001, |
| "loss": 4487228.5, |
| "loss/crossentropy": 2.9550857543945312, |
| "loss/hidden": 0.224609375, |
| "loss/logits": 0.08202479779720306, |
| "loss/reg": 4487225.0, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.012975, |
| "grad_norm": 3.853179454803467, |
| "grad_norm_var": 0.4486169793116943, |
| "learning_rate": 0.0001, |
| "loss": 4466185.5, |
| "loss/crossentropy": 2.403658151626587, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.09220259636640549, |
| "loss/reg": 4466182.5, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 1.5276105403900146, |
| "grad_norm_var": 0.43869758106292994, |
| "learning_rate": 0.0001, |
| "loss": 4444584.5, |
| "loss/crossentropy": 2.8231990337371826, |
| "loss/hidden": 0.2265625, |
| "loss/logits": 0.08691398054361343, |
| "loss/reg": 4444581.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.013025, |
| "grad_norm": 1.322237253189087, |
| "grad_norm_var": 0.44069145526669523, |
| "learning_rate": 0.0001, |
| "loss": 4423106.0, |
| "loss/crossentropy": 2.803400993347168, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.08093398809432983, |
| "loss/reg": 4423102.5, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.01305, |
| "grad_norm": 1.452327013015747, |
| "grad_norm_var": 0.4398825027601672, |
| "learning_rate": 0.0001, |
| "loss": 4401950.5, |
| "loss/crossentropy": 2.6843457221984863, |
| "loss/hidden": 0.2265625, |
| "loss/logits": 0.08903183043003082, |
| "loss/reg": 4401947.5, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.013075, |
| "grad_norm": 1.6516999006271362, |
| "grad_norm_var": 0.43702873350537935, |
| "learning_rate": 0.0001, |
| "loss": 4381061.0, |
| "loss/crossentropy": 3.0740303993225098, |
| "loss/hidden": 0.228515625, |
| "loss/logits": 0.07274787127971649, |
| "loss/reg": 4381057.5, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.0131, |
| "grad_norm": 1.4187898635864258, |
| "grad_norm_var": 0.4372589403424643, |
| "learning_rate": 0.0001, |
| "loss": 4360238.0, |
| "loss/crossentropy": 2.82788348197937, |
| "loss/hidden": 0.23046875, |
| "loss/logits": 0.08973852545022964, |
| "loss/reg": 4360234.5, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.013125, |
| "grad_norm": 1.463365912437439, |
| "grad_norm_var": 0.43501069298036155, |
| "learning_rate": 0.0001, |
| "loss": 4339327.5, |
| "loss/crossentropy": 2.939164876937866, |
| "loss/hidden": 0.23046875, |
| "loss/logits": 0.0972735732793808, |
| "loss/reg": 4339324.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.01315, |
| "grad_norm": 4.741021633148193, |
| "grad_norm_var": 1.0187937271477372, |
| "learning_rate": 0.0001, |
| "loss": 4318519.0, |
| "loss/crossentropy": 3.286886692047119, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.09269572794437408, |
| "loss/reg": 4318515.0, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.013175, |
| "grad_norm": 1.2664167881011963, |
| "grad_norm_var": 1.0032333211277396, |
| "learning_rate": 0.0001, |
| "loss": 4297953.5, |
| "loss/crossentropy": 2.5890755653381348, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.08711511641740799, |
| "loss/reg": 4297950.5, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.0132, |
| "grad_norm": 1.6065011024475098, |
| "grad_norm_var": 0.9851211125146219, |
| "learning_rate": 0.0001, |
| "loss": 4277420.5, |
| "loss/crossentropy": 2.9780211448669434, |
| "loss/hidden": 0.23046875, |
| "loss/logits": 0.09367707371711731, |
| "loss/reg": 4277417.0, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.013225, |
| "grad_norm": 1.2903633117675781, |
| "grad_norm_var": 0.9872173640655272, |
| "learning_rate": 0.0001, |
| "loss": 4256947.5, |
| "loss/crossentropy": 2.571004867553711, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.07217580080032349, |
| "loss/reg": 4256944.5, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 1.4145756959915161, |
| "grad_norm_var": 0.9941753773012881, |
| "learning_rate": 0.0001, |
| "loss": 4236667.5, |
| "loss/crossentropy": 2.5526325702667236, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.08166811615228653, |
| "loss/reg": 4236664.5, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.013275, |
| "grad_norm": 1.4388494491577148, |
| "grad_norm_var": 0.9969506731667859, |
| "learning_rate": 0.0001, |
| "loss": 4216488.0, |
| "loss/crossentropy": 2.7336323261260986, |
| "loss/hidden": 0.240234375, |
| "loss/logits": 0.09991507232189178, |
| "loss/reg": 4216485.0, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.0133, |
| "grad_norm": 1.9652752876281738, |
| "grad_norm_var": 0.9870756774410997, |
| "learning_rate": 0.0001, |
| "loss": 4196365.5, |
| "loss/crossentropy": 2.6363699436187744, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.10048555582761765, |
| "loss/reg": 4196362.5, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.013325, |
| "grad_norm": 1.3856992721557617, |
| "grad_norm_var": 0.9822164542156329, |
| "learning_rate": 0.0001, |
| "loss": 4175844.25, |
| "loss/crossentropy": 2.9322404861450195, |
| "loss/hidden": 0.228515625, |
| "loss/logits": 0.08981572091579437, |
| "loss/reg": 4175841.0, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.01335, |
| "grad_norm": 1.2873163223266602, |
| "grad_norm_var": 0.9924470245861812, |
| "learning_rate": 0.0001, |
| "loss": 4155624.5, |
| "loss/crossentropy": 2.6858222484588623, |
| "loss/hidden": 0.2265625, |
| "loss/logits": 0.08137323707342148, |
| "loss/reg": 4155621.5, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.013375, |
| "grad_norm": 1.7554881572723389, |
| "grad_norm_var": 0.6981941164815708, |
| "learning_rate": 0.0001, |
| "loss": 4135681.0, |
| "loss/crossentropy": 3.2991771697998047, |
| "loss/hidden": 0.255859375, |
| "loss/logits": 0.08769601583480835, |
| "loss/reg": 4135677.5, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.0134, |
| "grad_norm": 1.4162452220916748, |
| "grad_norm_var": 0.7013318424396686, |
| "learning_rate": 0.0001, |
| "loss": 4115880.0, |
| "loss/crossentropy": 2.9056992530822754, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.08392970263957977, |
| "loss/reg": 4115876.75, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.013425, |
| "grad_norm": 2.130401134490967, |
| "grad_norm_var": 0.7036273846389391, |
| "learning_rate": 0.0001, |
| "loss": 4095965.0, |
| "loss/crossentropy": 2.9354186058044434, |
| "loss/hidden": 0.25, |
| "loss/logits": 0.14074985682964325, |
| "loss/reg": 4095961.5, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.01345, |
| "grad_norm": 1.2602250576019287, |
| "grad_norm_var": 0.7130529767772941, |
| "learning_rate": 0.0001, |
| "loss": 4076281.75, |
| "loss/crossentropy": 2.497177839279175, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.0769551545381546, |
| "loss/reg": 4076279.0, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.013475, |
| "grad_norm": 1.3924604654312134, |
| "grad_norm_var": 0.7195541216876576, |
| "learning_rate": 0.0001, |
| "loss": 4056686.0, |
| "loss/crossentropy": 2.7258286476135254, |
| "loss/hidden": 0.23828125, |
| "loss/logits": 0.0933137983083725, |
| "loss/reg": 4056683.0, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 2.866610288619995, |
| "grad_norm_var": 0.7958819636931788, |
| "learning_rate": 0.0001, |
| "loss": 4036986.0, |
| "loss/crossentropy": 2.7008776664733887, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.10422666370868683, |
| "loss/reg": 4036982.75, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.013525, |
| "grad_norm": 1.7634690999984741, |
| "grad_norm_var": 0.7883389035979513, |
| "learning_rate": 0.0001, |
| "loss": 4017336.75, |
| "loss/crossentropy": 2.6597087383270264, |
| "loss/hidden": 0.2451171875, |
| "loss/logits": 0.07720647007226944, |
| "loss/reg": 4017333.75, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.01355, |
| "grad_norm": 1.2585740089416504, |
| "grad_norm_var": 0.1859604752024272, |
| "learning_rate": 0.0001, |
| "loss": 3997783.25, |
| "loss/crossentropy": 2.614624500274658, |
| "loss/hidden": 0.2216796875, |
| "loss/logits": 0.07174636423587799, |
| "loss/reg": 3997780.5, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.013575, |
| "grad_norm": 1.4309707880020142, |
| "grad_norm_var": 0.18047308329312628, |
| "learning_rate": 0.0001, |
| "loss": 3978639.25, |
| "loss/crossentropy": 2.5386619567871094, |
| "loss/hidden": 0.2373046875, |
| "loss/logits": 0.08103261142969131, |
| "loss/reg": 3978636.5, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.0136, |
| "grad_norm": 1.3246599435806274, |
| "grad_norm_var": 0.18534145648971023, |
| "learning_rate": 0.0001, |
| "loss": 3959602.5, |
| "loss/crossentropy": 2.798149824142456, |
| "loss/hidden": 0.2333984375, |
| "loss/logits": 0.0809333324432373, |
| "loss/reg": 3959599.5, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.013625, |
| "grad_norm": 1.5243966579437256, |
| "grad_norm_var": 0.1795293935396505, |
| "learning_rate": 0.0001, |
| "loss": 3940683.0, |
| "loss/crossentropy": 2.776409387588501, |
| "loss/hidden": 0.2333984375, |
| "loss/logits": 0.08192727714776993, |
| "loss/reg": 3940680.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.01365, |
| "grad_norm": 1.468361735343933, |
| "grad_norm_var": 0.1783736165268431, |
| "learning_rate": 0.0001, |
| "loss": 3921531.75, |
| "loss/crossentropy": 2.4836275577545166, |
| "loss/hidden": 0.23046875, |
| "loss/logits": 0.08119748532772064, |
| "loss/reg": 3921529.0, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.013675, |
| "grad_norm": 1.337628960609436, |
| "grad_norm_var": 0.18124706828382955, |
| "learning_rate": 0.0001, |
| "loss": 3902678.5, |
| "loss/crossentropy": 2.6022684574127197, |
| "loss/hidden": 0.2255859375, |
| "loss/logits": 0.07433047145605087, |
| "loss/reg": 3902675.75, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.0137, |
| "grad_norm": 2.9525156021118164, |
| "grad_norm_var": 0.29050926943303634, |
| "learning_rate": 0.0001, |
| "loss": 3883748.5, |
| "loss/crossentropy": 3.0395596027374268, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.08879055827856064, |
| "loss/reg": 3883745.25, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.013725, |
| "grad_norm": 1.4882827997207642, |
| "grad_norm_var": 0.2874194040817021, |
| "learning_rate": 0.0001, |
| "loss": 3864961.25, |
| "loss/crossentropy": 2.58965802192688, |
| "loss/hidden": 0.2490234375, |
| "loss/logits": 0.08821944892406464, |
| "loss/reg": 3864958.5, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 1.4185281991958618, |
| "grad_norm_var": 0.28186864307369486, |
| "learning_rate": 0.0001, |
| "loss": 3846269.25, |
| "loss/crossentropy": 2.8426175117492676, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.08191045373678207, |
| "loss/reg": 3846266.25, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.013775, |
| "grad_norm": 1.4052224159240723, |
| "grad_norm_var": 0.2857449192181614, |
| "learning_rate": 0.0001, |
| "loss": 3827682.5, |
| "loss/crossentropy": 2.516321897506714, |
| "loss/hidden": 0.2421875, |
| "loss/logits": 0.08413630723953247, |
| "loss/reg": 3827679.75, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.0138, |
| "grad_norm": 1.2860373258590698, |
| "grad_norm_var": 0.2909046111507085, |
| "learning_rate": 0.0001, |
| "loss": 3809231.0, |
| "loss/crossentropy": 2.62768816947937, |
| "loss/hidden": 0.2353515625, |
| "loss/logits": 0.08362919837236404, |
| "loss/reg": 3809228.0, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.013825, |
| "grad_norm": 1.4769209623336792, |
| "grad_norm_var": 0.27523757444729235, |
| "learning_rate": 0.0001, |
| "loss": 3790908.75, |
| "loss/crossentropy": 2.761497974395752, |
| "loss/hidden": 0.2216796875, |
| "loss/logits": 0.07314087450504303, |
| "loss/reg": 3790905.75, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.01385, |
| "grad_norm": 1.9931138753890991, |
| "grad_norm_var": 0.2752705712016956, |
| "learning_rate": 0.0001, |
| "loss": 3772703.75, |
| "loss/crossentropy": 2.827181577682495, |
| "loss/hidden": 0.283203125, |
| "loss/logits": 0.10602318495512009, |
| "loss/reg": 3772700.5, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.013875, |
| "grad_norm": 1.2844096422195435, |
| "grad_norm_var": 0.2796995446016789, |
| "learning_rate": 0.0001, |
| "loss": 3754508.5, |
| "loss/crossentropy": 2.7405292987823486, |
| "loss/hidden": 0.228515625, |
| "loss/logits": 0.08318239450454712, |
| "loss/reg": 3754505.5, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.0139, |
| "grad_norm": 1.3420419692993164, |
| "grad_norm_var": 0.17613307877471077, |
| "learning_rate": 0.0001, |
| "loss": 3736414.0, |
| "loss/crossentropy": 2.578117847442627, |
| "loss/hidden": 0.2421875, |
| "loss/logits": 0.08419770002365112, |
| "loss/reg": 3736411.25, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.013925, |
| "grad_norm": 1.2973062992095947, |
| "grad_norm_var": 0.17627234255116353, |
| "learning_rate": 0.0001, |
| "loss": 3718432.75, |
| "loss/crossentropy": 2.2757911682128906, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.07288214564323425, |
| "loss/reg": 3718430.25, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.01395, |
| "grad_norm": 1.2561750411987305, |
| "grad_norm_var": 0.1763557022681423, |
| "learning_rate": 0.0001, |
| "loss": 3700496.25, |
| "loss/crossentropy": 2.60646390914917, |
| "loss/hidden": 0.2255859375, |
| "loss/logits": 0.07780743390321732, |
| "loss/reg": 3700493.5, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.013975, |
| "grad_norm": 1.81352698802948, |
| "grad_norm_var": 0.18106793992723738, |
| "learning_rate": 0.0001, |
| "loss": 3682621.25, |
| "loss/crossentropy": 2.6849279403686523, |
| "loss/hidden": 0.234375, |
| "loss/logits": 0.08057552576065063, |
| "loss/reg": 3682618.25, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 1.5537123680114746, |
| "grad_norm_var": 0.1777148488587196, |
| "learning_rate": 0.0001, |
| "loss": 3664605.5, |
| "loss/crossentropy": 2.805983304977417, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.07649838924407959, |
| "loss/reg": 3664602.5, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.014025, |
| "grad_norm": 1.540603756904602, |
| "grad_norm_var": 0.17766267806223343, |
| "learning_rate": 0.0001, |
| "loss": 3646892.5, |
| "loss/crossentropy": 2.6105356216430664, |
| "loss/hidden": 0.23046875, |
| "loss/logits": 0.07570961117744446, |
| "loss/reg": 3646889.75, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.01405, |
| "grad_norm": 1.5530180931091309, |
| "grad_norm_var": 0.17710840643804027, |
| "learning_rate": 0.0001, |
| "loss": 3629055.75, |
| "loss/crossentropy": 2.653092861175537, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.08653946220874786, |
| "loss/reg": 3629052.75, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.014075, |
| "grad_norm": 1.3190182447433472, |
| "grad_norm_var": 0.1776879071100419, |
| "learning_rate": 0.0001, |
| "loss": 3611479.25, |
| "loss/crossentropy": 2.7003443241119385, |
| "loss/hidden": 0.2255859375, |
| "loss/logits": 0.07111045718193054, |
| "loss/reg": 3611476.25, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.0141, |
| "grad_norm": 1.4643640518188477, |
| "grad_norm_var": 0.04005026552640434, |
| "learning_rate": 0.0001, |
| "loss": 3594013.0, |
| "loss/crossentropy": 2.859811782836914, |
| "loss/hidden": 0.23046875, |
| "loss/logits": 0.08538360893726349, |
| "loss/reg": 3594010.0, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.014125, |
| "grad_norm": 1.521527886390686, |
| "grad_norm_var": 0.040208063583109244, |
| "learning_rate": 0.0001, |
| "loss": 3576612.25, |
| "loss/crossentropy": 3.092327833175659, |
| "loss/hidden": 0.251953125, |
| "loss/logits": 0.09326502680778503, |
| "loss/reg": 3576609.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.01415, |
| "grad_norm": 1.434308409690857, |
| "grad_norm_var": 0.040114602083886494, |
| "learning_rate": 0.0001, |
| "loss": 3559244.25, |
| "loss/crossentropy": 2.9351813793182373, |
| "loss/hidden": 0.2373046875, |
| "loss/logits": 0.08874573558568954, |
| "loss/reg": 3559241.0, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.014175, |
| "grad_norm": 1.3384209871292114, |
| "grad_norm_var": 0.04098233003794292, |
| "learning_rate": 0.0001, |
| "loss": 3541978.5, |
| "loss/crossentropy": 2.6194591522216797, |
| "loss/hidden": 0.24609375, |
| "loss/logits": 0.09305374324321747, |
| "loss/reg": 3541975.75, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.0142, |
| "grad_norm": 1.4581677913665771, |
| "grad_norm_var": 0.038677316348570835, |
| "learning_rate": 0.0001, |
| "loss": 3524741.75, |
| "loss/crossentropy": 2.8271114826202393, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.07766708731651306, |
| "loss/reg": 3524738.75, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.014225, |
| "grad_norm": 1.6008046865463257, |
| "grad_norm_var": 0.03962009932452961, |
| "learning_rate": 0.0001, |
| "loss": 3507646.5, |
| "loss/crossentropy": 2.831601619720459, |
| "loss/hidden": 0.234375, |
| "loss/logits": 0.09303203225135803, |
| "loss/reg": 3507643.5, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 1.3187658786773682, |
| "grad_norm_var": 0.022414717439751777, |
| "learning_rate": 0.0001, |
| "loss": 3490411.75, |
| "loss/crossentropy": 2.8522350788116455, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.08207453787326813, |
| "loss/reg": 3490408.75, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.014275, |
| "grad_norm": 1.2787755727767944, |
| "grad_norm_var": 0.02253621959062381, |
| "learning_rate": 0.0001, |
| "loss": 3473327.25, |
| "loss/crossentropy": 2.7266457080841064, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.08422526717185974, |
| "loss/reg": 3473324.25, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.0143, |
| "grad_norm": 1.3208447694778442, |
| "grad_norm_var": 0.02285008750252846, |
| "learning_rate": 0.0001, |
| "loss": 3456409.25, |
| "loss/crossentropy": 2.662264347076416, |
| "loss/hidden": 0.2265625, |
| "loss/logits": 0.07577495276927948, |
| "loss/reg": 3456406.25, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.014325, |
| "grad_norm": 1.1715630292892456, |
| "grad_norm_var": 0.026261412888491265, |
| "learning_rate": 0.0001, |
| "loss": 3439551.25, |
| "loss/crossentropy": 2.455469846725464, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.07150474935770035, |
| "loss/reg": 3439548.5, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.01435, |
| "grad_norm": 1.3703027963638306, |
| "grad_norm_var": 0.024369898722689184, |
| "learning_rate": 0.0001, |
| "loss": 3422793.25, |
| "loss/crossentropy": 2.8517136573791504, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.08173410594463348, |
| "loss/reg": 3422790.25, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.014375, |
| "grad_norm": 1.5435969829559326, |
| "grad_norm_var": 0.015520172739490808, |
| "learning_rate": 0.0001, |
| "loss": 3406115.0, |
| "loss/crossentropy": 3.0161821842193604, |
| "loss/hidden": 0.25390625, |
| "loss/logits": 0.0937599390745163, |
| "loss/reg": 3406111.75, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.0144, |
| "grad_norm": 1.330981969833374, |
| "grad_norm_var": 0.014775650730167436, |
| "learning_rate": 0.0001, |
| "loss": 3389492.5, |
| "loss/crossentropy": 2.882615327835083, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.09555500000715256, |
| "loss/reg": 3389489.25, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.014425, |
| "grad_norm": 2.3669495582580566, |
| "grad_norm_var": 0.07180858297566312, |
| "learning_rate": 0.0001, |
| "loss": 3373007.75, |
| "loss/crossentropy": 2.8147549629211426, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.16876861453056335, |
| "loss/reg": 3373004.5, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.01445, |
| "grad_norm": 1.4470807313919067, |
| "grad_norm_var": 0.07122385414334928, |
| "learning_rate": 0.0001, |
| "loss": 3355929.0, |
| "loss/crossentropy": 2.8622498512268066, |
| "loss/hidden": 0.240234375, |
| "loss/logits": 0.08209867775440216, |
| "loss/reg": 3355926.0, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.014475, |
| "grad_norm": 1.2843523025512695, |
| "grad_norm_var": 0.071929068026696, |
| "learning_rate": 0.0001, |
| "loss": 3339414.25, |
| "loss/crossentropy": 2.458169460296631, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.08410149067640305, |
| "loss/reg": 3339411.5, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 1.4458677768707275, |
| "grad_norm_var": 0.07192285707583412, |
| "learning_rate": 0.0001, |
| "loss": 3323023.0, |
| "loss/crossentropy": 2.719388008117676, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.08330346643924713, |
| "loss/reg": 3323020.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.014525, |
| "grad_norm": 1.4275226593017578, |
| "grad_norm_var": 0.07160394759432975, |
| "learning_rate": 0.0001, |
| "loss": 3306730.5, |
| "loss/crossentropy": 2.7679271697998047, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.09723743051290512, |
| "loss/reg": 3306727.5, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.01455, |
| "grad_norm": 1.2826558351516724, |
| "grad_norm_var": 0.07328067615778006, |
| "learning_rate": 0.0001, |
| "loss": 3290506.25, |
| "loss/crossentropy": 2.6306045055389404, |
| "loss/hidden": 0.2275390625, |
| "loss/logits": 0.07503408193588257, |
| "loss/reg": 3290503.25, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.014575, |
| "grad_norm": 1.4242373704910278, |
| "grad_norm_var": 0.07261681873861031, |
| "learning_rate": 0.0001, |
| "loss": 3274338.75, |
| "loss/crossentropy": 2.78100323677063, |
| "loss/hidden": 0.23828125, |
| "loss/logits": 0.07994034141302109, |
| "loss/reg": 3274335.75, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.0146, |
| "grad_norm": 1.2628331184387207, |
| "grad_norm_var": 0.07458122645065804, |
| "learning_rate": 0.0001, |
| "loss": 3258250.75, |
| "loss/crossentropy": 3.0057835578918457, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.07975363731384277, |
| "loss/reg": 3258247.5, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.014625, |
| "grad_norm": 1.549954891204834, |
| "grad_norm_var": 0.07358356766954817, |
| "learning_rate": 0.0001, |
| "loss": 3242258.0, |
| "loss/crossentropy": 2.63142466545105, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.10649195313453674, |
| "loss/reg": 3242255.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.01465, |
| "grad_norm": 1.525122880935669, |
| "grad_norm_var": 0.07327686326856994, |
| "learning_rate": 0.0001, |
| "loss": 3226354.75, |
| "loss/crossentropy": 2.832843780517578, |
| "loss/hidden": 0.2373046875, |
| "loss/logits": 0.08181619644165039, |
| "loss/reg": 3226351.75, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.014675, |
| "grad_norm": 1.5030587911605835, |
| "grad_norm_var": 0.07161322578152837, |
| "learning_rate": 0.0001, |
| "loss": 3210539.75, |
| "loss/crossentropy": 2.7488560676574707, |
| "loss/hidden": 0.2353515625, |
| "loss/logits": 0.08145460486412048, |
| "loss/reg": 3210536.75, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.0147, |
| "grad_norm": 1.4565194845199585, |
| "grad_norm_var": 0.07036292812022485, |
| "learning_rate": 0.0001, |
| "loss": 3194821.5, |
| "loss/crossentropy": 2.6408228874206543, |
| "loss/hidden": 0.228515625, |
| "loss/logits": 0.07606947422027588, |
| "loss/reg": 3194818.5, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.014725, |
| "grad_norm": 1.281499981880188, |
| "grad_norm_var": 0.0668604608876672, |
| "learning_rate": 0.0001, |
| "loss": 3178915.75, |
| "loss/crossentropy": 2.688347339630127, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.078568235039711, |
| "loss/reg": 3178912.75, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 1.8308510780334473, |
| "grad_norm_var": 0.07406197654711923, |
| "learning_rate": 0.0001, |
| "loss": 3163237.75, |
| "loss/crossentropy": 2.55411958694458, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.0765468031167984, |
| "loss/reg": 3163235.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.014775, |
| "grad_norm": 1.4850999116897583, |
| "grad_norm_var": 0.07391781135352975, |
| "learning_rate": 0.0001, |
| "loss": 3147509.5, |
| "loss/crossentropy": 2.748263359069824, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.08163708448410034, |
| "loss/reg": 3147506.5, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.0148, |
| "grad_norm": 1.3978896141052246, |
| "grad_norm_var": 0.07274298588645009, |
| "learning_rate": 0.0001, |
| "loss": 3131992.0, |
| "loss/crossentropy": 2.6332292556762695, |
| "loss/hidden": 0.240234375, |
| "loss/logits": 0.09071440249681473, |
| "loss/reg": 3131989.0, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.014825, |
| "grad_norm": 1.430786371231079, |
| "grad_norm_var": 0.019081542167886725, |
| "learning_rate": 0.0001, |
| "loss": 3116583.5, |
| "loss/crossentropy": 2.812922716140747, |
| "loss/hidden": 0.2431640625, |
| "loss/logits": 0.08107170462608337, |
| "loss/reg": 3116580.5, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.01485, |
| "grad_norm": 1.2611074447631836, |
| "grad_norm_var": 0.021060361085469975, |
| "learning_rate": 0.0001, |
| "loss": 3101238.75, |
| "loss/crossentropy": 2.510676145553589, |
| "loss/hidden": 0.2373046875, |
| "loss/logits": 0.0801520049571991, |
| "loss/reg": 3101236.0, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.014875, |
| "grad_norm": 1.4165650606155396, |
| "grad_norm_var": 0.019619102112795814, |
| "learning_rate": 0.0001, |
| "loss": 3085774.25, |
| "loss/crossentropy": 2.652660846710205, |
| "loss/hidden": 0.23046875, |
| "loss/logits": 0.07323139905929565, |
| "loss/reg": 3085771.25, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.0149, |
| "grad_norm": 1.5486196279525757, |
| "grad_norm_var": 0.02040939318021605, |
| "learning_rate": 0.0001, |
| "loss": 3070596.25, |
| "loss/crossentropy": 3.0037014484405518, |
| "loss/hidden": 0.240234375, |
| "loss/logits": 0.08134207129478455, |
| "loss/reg": 3070593.0, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.014925, |
| "grad_norm": 2.229902982711792, |
| "grad_norm_var": 0.05901652992943885, |
| "learning_rate": 0.0001, |
| "loss": 3055410.0, |
| "loss/crossentropy": 2.739027976989746, |
| "loss/hidden": 0.251953125, |
| "loss/logits": 0.08962370455265045, |
| "loss/reg": 3055407.0, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.01495, |
| "grad_norm": 1.3188974857330322, |
| "grad_norm_var": 0.05808258298563936, |
| "learning_rate": 0.0001, |
| "loss": 3040404.0, |
| "loss/crossentropy": 2.712406873703003, |
| "loss/hidden": 0.2294921875, |
| "loss/logits": 0.07478486746549606, |
| "loss/reg": 3040401.0, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.014975, |
| "grad_norm": 1.5142409801483154, |
| "grad_norm_var": 0.05773747832359177, |
| "learning_rate": 0.0001, |
| "loss": 3025186.0, |
| "loss/crossentropy": 2.8170316219329834, |
| "loss/hidden": 0.2451171875, |
| "loss/logits": 0.08254105597734451, |
| "loss/reg": 3025183.0, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 1.3381177186965942, |
| "grad_norm_var": 0.055702921218860446, |
| "learning_rate": 0.0001, |
| "loss": 3010206.0, |
| "loss/crossentropy": 2.7413084506988525, |
| "loss/hidden": 0.251953125, |
| "loss/logits": 0.09831676632165909, |
| "loss/reg": 3010203.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.015025, |
| "grad_norm": 1.5618406534194946, |
| "grad_norm_var": 0.055782178172492536, |
| "learning_rate": 0.0001, |
| "loss": 2995277.0, |
| "loss/crossentropy": 2.4582359790802, |
| "loss/hidden": 0.27734375, |
| "loss/logits": 0.11780478060245514, |
| "loss/reg": 2995274.0, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.01505, |
| "grad_norm": 1.6309309005737305, |
| "grad_norm_var": 0.056748034489492956, |
| "learning_rate": 0.0001, |
| "loss": 2980375.25, |
| "loss/crossentropy": 2.8804776668548584, |
| "loss/hidden": 0.251953125, |
| "loss/logits": 0.0939045250415802, |
| "loss/reg": 2980372.0, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.015075, |
| "grad_norm": 1.3542622327804565, |
| "grad_norm_var": 0.05832647038522296, |
| "learning_rate": 0.0001, |
| "loss": 2965587.5, |
| "loss/crossentropy": 3.06191349029541, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.08932670205831528, |
| "loss/reg": 2965584.25, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.0151, |
| "grad_norm": 1.4876539707183838, |
| "grad_norm_var": 0.058191733008463524, |
| "learning_rate": 0.0001, |
| "loss": 2950823.75, |
| "loss/crossentropy": 2.732553243637085, |
| "loss/hidden": 0.265625, |
| "loss/logits": 0.09219442307949066, |
| "loss/reg": 2950820.75, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.015125, |
| "grad_norm": 1.2470945119857788, |
| "grad_norm_var": 0.05929336958462266, |
| "learning_rate": 0.0001, |
| "loss": 2936088.0, |
| "loss/crossentropy": 2.6155288219451904, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.08115285634994507, |
| "loss/reg": 2936085.25, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.01515, |
| "grad_norm": 1.7692153453826904, |
| "grad_norm_var": 0.05683950277813204, |
| "learning_rate": 0.0001, |
| "loss": 2921443.25, |
| "loss/crossentropy": 2.7273483276367188, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.09945277124643326, |
| "loss/reg": 2921440.25, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.015175, |
| "grad_norm": 1.5427806377410889, |
| "grad_norm_var": 0.056936588678564465, |
| "learning_rate": 0.0001, |
| "loss": 2906859.5, |
| "loss/crossentropy": 3.0468246936798096, |
| "loss/hidden": 0.255859375, |
| "loss/logits": 0.09627999365329742, |
| "loss/reg": 2906856.25, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.0152, |
| "grad_norm": 1.4511830806732178, |
| "grad_norm_var": 0.05636636159272825, |
| "learning_rate": 0.0001, |
| "loss": 2892392.5, |
| "loss/crossentropy": 2.790069103240967, |
| "loss/hidden": 0.25, |
| "loss/logits": 0.08508322387933731, |
| "loss/reg": 2892389.5, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.015225, |
| "grad_norm": 1.4937046766281128, |
| "grad_norm_var": 0.05597903137950902, |
| "learning_rate": 0.0001, |
| "loss": 2877963.0, |
| "loss/crossentropy": 2.611232280731201, |
| "loss/hidden": 0.240234375, |
| "loss/logits": 0.08240636438131332, |
| "loss/reg": 2877960.25, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 2.138087272644043, |
| "grad_norm_var": 0.07489950951997214, |
| "learning_rate": 0.0001, |
| "loss": 2863604.5, |
| "loss/crossentropy": 2.896639108657837, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.0961982011795044, |
| "loss/reg": 2863601.25, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.015275, |
| "grad_norm": 1.2840114831924438, |
| "grad_norm_var": 0.07862449480424415, |
| "learning_rate": 0.0001, |
| "loss": 2849164.0, |
| "loss/crossentropy": 2.6192398071289062, |
| "loss/hidden": 0.2421875, |
| "loss/logits": 0.08849970996379852, |
| "loss/reg": 2849161.25, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.0153, |
| "grad_norm": 1.2716068029403687, |
| "grad_norm_var": 0.08372666868291573, |
| "learning_rate": 0.0001, |
| "loss": 2835042.25, |
| "loss/crossentropy": 2.512448310852051, |
| "loss/hidden": 0.2421875, |
| "loss/logits": 0.07710960507392883, |
| "loss/reg": 2835039.5, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.015325, |
| "grad_norm": 1.9938853979110718, |
| "grad_norm_var": 0.06548489885640431, |
| "learning_rate": 0.0001, |
| "loss": 2821004.25, |
| "loss/crossentropy": 2.9246652126312256, |
| "loss/hidden": 0.265625, |
| "loss/logits": 0.10318771004676819, |
| "loss/reg": 2821001.0, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.01535, |
| "grad_norm": 1.4347602128982544, |
| "grad_norm_var": 0.06314236410329238, |
| "learning_rate": 0.0001, |
| "loss": 2807005.5, |
| "loss/crossentropy": 2.6279404163360596, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.10144165903329849, |
| "loss/reg": 2807002.5, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.015375, |
| "grad_norm": 1.769286036491394, |
| "grad_norm_var": 0.06660102528055797, |
| "learning_rate": 0.0001, |
| "loss": 2793071.5, |
| "loss/crossentropy": 2.8249123096466064, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.09426470100879669, |
| "loss/reg": 2793068.5, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.0154, |
| "grad_norm": 1.3249356746673584, |
| "grad_norm_var": 0.06698082224293239, |
| "learning_rate": 0.0001, |
| "loss": 2779191.5, |
| "loss/crossentropy": 2.7158865928649902, |
| "loss/hidden": 0.244140625, |
| "loss/logits": 0.08663683384656906, |
| "loss/reg": 2779188.5, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.015425, |
| "grad_norm": 1.5031248331069946, |
| "grad_norm_var": 0.0670816945975289, |
| "learning_rate": 0.0001, |
| "loss": 2765357.5, |
| "loss/crossentropy": 2.7861616611480713, |
| "loss/hidden": 0.279296875, |
| "loss/logits": 0.10829424858093262, |
| "loss/reg": 2765354.25, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.01545, |
| "grad_norm": 1.2966117858886719, |
| "grad_norm_var": 0.07017142176690493, |
| "learning_rate": 0.0001, |
| "loss": 2751362.75, |
| "loss/crossentropy": 2.721094846725464, |
| "loss/hidden": 0.2353515625, |
| "loss/logits": 0.07855356484651566, |
| "loss/reg": 2751359.75, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.015475, |
| "grad_norm": 1.241206169128418, |
| "grad_norm_var": 0.07350839274628147, |
| "learning_rate": 0.0001, |
| "loss": 2737241.0, |
| "loss/crossentropy": 2.6991958618164062, |
| "loss/hidden": 0.2353515625, |
| "loss/logits": 0.08357013761997223, |
| "loss/reg": 2737238.0, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 1.4571808576583862, |
| "grad_norm_var": 0.0736798631018359, |
| "learning_rate": 0.0001, |
| "loss": 2723549.25, |
| "loss/crossentropy": 2.79105544090271, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.08794891089200974, |
| "loss/reg": 2723546.25, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.015525, |
| "grad_norm": 1.5453085899353027, |
| "grad_norm_var": 0.06863866096782066, |
| "learning_rate": 0.0001, |
| "loss": 2709937.75, |
| "loss/crossentropy": 2.8297457695007324, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.09096057713031769, |
| "loss/reg": 2709934.75, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.01555, |
| "grad_norm": 1.1728241443634033, |
| "grad_norm_var": 0.07203002630450683, |
| "learning_rate": 0.0001, |
| "loss": 2696413.5, |
| "loss/crossentropy": 2.516026258468628, |
| "loss/hidden": 0.2275390625, |
| "loss/logits": 0.07980786263942719, |
| "loss/reg": 2696410.75, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.015575, |
| "grad_norm": 1.4331055879592896, |
| "grad_norm_var": 0.0720835571657668, |
| "learning_rate": 0.0001, |
| "loss": 2682994.5, |
| "loss/crossentropy": 2.8644509315490723, |
| "loss/hidden": 0.25390625, |
| "loss/logits": 0.08371005952358246, |
| "loss/reg": 2682991.5, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.0156, |
| "grad_norm": 1.4316741228103638, |
| "grad_norm_var": 0.07220357147847097, |
| "learning_rate": 0.0001, |
| "loss": 2669638.25, |
| "loss/crossentropy": 2.965606927871704, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.086640864610672, |
| "loss/reg": 2669635.0, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.015625, |
| "grad_norm": 1.2847670316696167, |
| "grad_norm_var": 0.074744028910672, |
| "learning_rate": 0.0001, |
| "loss": 2656332.5, |
| "loss/crossentropy": 2.599846839904785, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.08327412605285645, |
| "loss/reg": 2656329.75, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.01565, |
| "grad_norm": 1.4208719730377197, |
| "grad_norm_var": 0.04337837727891814, |
| "learning_rate": 0.0001, |
| "loss": 2643119.0, |
| "loss/crossentropy": 2.6468188762664795, |
| "loss/hidden": 0.265625, |
| "loss/logits": 0.08926676213741302, |
| "loss/reg": 2643116.0, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.015675, |
| "grad_norm": 1.6385719776153564, |
| "grad_norm_var": 0.0443777259339057, |
| "learning_rate": 0.0001, |
| "loss": 2629890.0, |
| "loss/crossentropy": 2.9938015937805176, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.09494272619485855, |
| "loss/reg": 2629886.75, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.0157, |
| "grad_norm": 1.3339931964874268, |
| "grad_norm_var": 0.04312681926855338, |
| "learning_rate": 0.0001, |
| "loss": 2616745.0, |
| "loss/crossentropy": 2.8771309852600098, |
| "loss/hidden": 0.25390625, |
| "loss/logits": 0.08746660500764847, |
| "loss/reg": 2616741.75, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.015725, |
| "grad_norm": 1.6058975458145142, |
| "grad_norm_var": 0.02466457936247461, |
| "learning_rate": 0.0001, |
| "loss": 2603684.0, |
| "loss/crossentropy": 2.695460796356201, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.08056588470935822, |
| "loss/reg": 2603681.0, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 1.3286347389221191, |
| "grad_norm_var": 0.0253136227637194, |
| "learning_rate": 0.0001, |
| "loss": 2590668.75, |
| "loss/crossentropy": 2.827113389968872, |
| "loss/hidden": 0.251953125, |
| "loss/logits": 0.07437729090452194, |
| "loss/reg": 2590665.75, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.015775, |
| "grad_norm": 1.291136622428894, |
| "grad_norm_var": 0.017605608309390598, |
| "learning_rate": 0.0001, |
| "loss": 2577704.5, |
| "loss/crossentropy": 2.5001721382141113, |
| "loss/hidden": 0.2421875, |
| "loss/logits": 0.08372145891189575, |
| "loss/reg": 2577701.75, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.0158, |
| "grad_norm": 1.3402228355407715, |
| "grad_norm_var": 0.017478696803058292, |
| "learning_rate": 0.0001, |
| "loss": 2564854.0, |
| "loss/crossentropy": 2.6086809635162354, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.071901336312294, |
| "loss/reg": 2564851.25, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.015825, |
| "grad_norm": 1.231552004814148, |
| "grad_norm_var": 0.018184629764281545, |
| "learning_rate": 0.0001, |
| "loss": 2552061.5, |
| "loss/crossentropy": 2.5290377140045166, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.07277534157037735, |
| "loss/reg": 2552058.75, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.01585, |
| "grad_norm": 1.3197014331817627, |
| "grad_norm_var": 0.01796631748485703, |
| "learning_rate": 0.0001, |
| "loss": 2539139.25, |
| "loss/crossentropy": 2.9119789600372314, |
| "loss/hidden": 0.259765625, |
| "loss/logits": 0.10156615078449249, |
| "loss/reg": 2539136.0, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.015875, |
| "grad_norm": 1.2754802703857422, |
| "grad_norm_var": 0.017406423088270097, |
| "learning_rate": 0.0001, |
| "loss": 2526388.0, |
| "loss/crossentropy": 2.8255531787872314, |
| "loss/hidden": 0.2421875, |
| "loss/logits": 0.07959705591201782, |
| "loss/reg": 2526385.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.0159, |
| "grad_norm": 1.3430273532867432, |
| "grad_norm_var": 0.01707554962471344, |
| "learning_rate": 0.0001, |
| "loss": 2513683.25, |
| "loss/crossentropy": 2.4952664375305176, |
| "loss/hidden": 0.259765625, |
| "loss/logits": 0.07670697569847107, |
| "loss/reg": 2513680.5, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.015925, |
| "grad_norm": 1.5899494886398315, |
| "grad_norm_var": 0.01821499917132782, |
| "learning_rate": 0.0001, |
| "loss": 2500978.0, |
| "loss/crossentropy": 3.2368414402008057, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.0830172747373581, |
| "loss/reg": 2500974.5, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.01595, |
| "grad_norm": 1.3665083646774292, |
| "grad_norm_var": 0.015271656960122944, |
| "learning_rate": 0.0001, |
| "loss": 2488377.0, |
| "loss/crossentropy": 2.7981770038604736, |
| "loss/hidden": 0.251953125, |
| "loss/logits": 0.08188901841640472, |
| "loss/reg": 2488374.0, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.015975, |
| "grad_norm": 1.446820616722107, |
| "grad_norm_var": 0.015362799906699583, |
| "learning_rate": 0.0001, |
| "loss": 2475795.0, |
| "loss/crossentropy": 2.947805404663086, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.09453658759593964, |
| "loss/reg": 2475791.75, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 1.3775579929351807, |
| "grad_norm_var": 0.01524910849525624, |
| "learning_rate": 0.0001, |
| "loss": 2463265.5, |
| "loss/crossentropy": 2.8322534561157227, |
| "loss/hidden": 0.259765625, |
| "loss/logits": 0.0860058069229126, |
| "loss/reg": 2463262.5, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.016025, |
| "grad_norm": 1.4722238779067993, |
| "grad_norm_var": 0.014885919302685539, |
| "learning_rate": 0.0001, |
| "loss": 2450749.0, |
| "loss/crossentropy": 2.6698105335235596, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.09445904940366745, |
| "loss/reg": 2450746.0, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.01605, |
| "grad_norm": 1.7905011177062988, |
| "grad_norm_var": 0.024508659212942933, |
| "learning_rate": 0.0001, |
| "loss": 2438309.0, |
| "loss/crossentropy": 2.874283790588379, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.09145442396402359, |
| "loss/reg": 2438306.0, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.016075, |
| "grad_norm": 1.4462361335754395, |
| "grad_norm_var": 0.021266432453360965, |
| "learning_rate": 0.0001, |
| "loss": 2425945.5, |
| "loss/crossentropy": 2.972928524017334, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.0968450978398323, |
| "loss/reg": 2425942.25, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.0161, |
| "grad_norm": 1.4527603387832642, |
| "grad_norm_var": 0.02094497049471992, |
| "learning_rate": 0.0001, |
| "loss": 2413649.75, |
| "loss/crossentropy": 2.5875918865203857, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.08963285386562347, |
| "loss/reg": 2413647.0, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.016125, |
| "grad_norm": 1.356290340423584, |
| "grad_norm_var": 0.01856518220642389, |
| "learning_rate": 0.0001, |
| "loss": 2401381.0, |
| "loss/crossentropy": 2.8125293254852295, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.09375730156898499, |
| "loss/reg": 2401378.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.01615, |
| "grad_norm": 1.411806344985962, |
| "grad_norm_var": 0.01818629349637977, |
| "learning_rate": 0.0001, |
| "loss": 2389297.25, |
| "loss/crossentropy": 2.768796682357788, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.08215779066085815, |
| "loss/reg": 2389294.25, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.016175, |
| "grad_norm": 1.4484037160873413, |
| "grad_norm_var": 0.017302863978746964, |
| "learning_rate": 0.0001, |
| "loss": 2377280.75, |
| "loss/crossentropy": 2.6766011714935303, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.08591602742671967, |
| "loss/reg": 2377277.75, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.0162, |
| "grad_norm": 1.2438198328018188, |
| "grad_norm_var": 0.018868207238877468, |
| "learning_rate": 0.0001, |
| "loss": 2365153.75, |
| "loss/crossentropy": 2.7085185050964355, |
| "loss/hidden": 0.2490234375, |
| "loss/logits": 0.08322304487228394, |
| "loss/reg": 2365150.75, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.016225, |
| "grad_norm": 1.3128260374069214, |
| "grad_norm_var": 0.01733873024749807, |
| "learning_rate": 0.0001, |
| "loss": 2353249.0, |
| "loss/crossentropy": 2.4449381828308105, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.08062805235385895, |
| "loss/reg": 2353246.25, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 1.6177732944488525, |
| "grad_norm_var": 0.019069654984459086, |
| "learning_rate": 0.0001, |
| "loss": 2341361.0, |
| "loss/crossentropy": 2.8387019634246826, |
| "loss/hidden": 0.259765625, |
| "loss/logits": 0.07232898473739624, |
| "loss/reg": 2341358.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.016275, |
| "grad_norm": 1.5757468938827515, |
| "grad_norm_var": 0.01833825273563807, |
| "learning_rate": 0.0001, |
| "loss": 2329516.25, |
| "loss/crossentropy": 2.938122510910034, |
| "loss/hidden": 0.275390625, |
| "loss/logits": 0.09084507077932358, |
| "loss/reg": 2329513.0, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.0163, |
| "grad_norm": 1.4291303157806396, |
| "grad_norm_var": 0.017536030090174334, |
| "learning_rate": 0.0001, |
| "loss": 2317727.0, |
| "loss/crossentropy": 2.9339261054992676, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.09039770066738129, |
| "loss/reg": 2317723.75, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.016325, |
| "grad_norm": 1.532281756401062, |
| "grad_norm_var": 0.016734290448841154, |
| "learning_rate": 0.0001, |
| "loss": 2305619.0, |
| "loss/crossentropy": 2.7189085483551025, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.08377526700496674, |
| "loss/reg": 2305616.0, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.01635, |
| "grad_norm": 1.4484657049179077, |
| "grad_norm_var": 0.01618662890306357, |
| "learning_rate": 0.0001, |
| "loss": 2293863.0, |
| "loss/crossentropy": 2.8304450511932373, |
| "loss/hidden": 0.275390625, |
| "loss/logits": 0.09900549054145813, |
| "loss/reg": 2293860.0, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.016375, |
| "grad_norm": 1.258310079574585, |
| "grad_norm_var": 0.018743057175980956, |
| "learning_rate": 0.0001, |
| "loss": 2281760.75, |
| "loss/crossentropy": 2.55017352104187, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.08573046326637268, |
| "loss/reg": 2281758.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.0164, |
| "grad_norm": 1.7164244651794434, |
| "grad_norm_var": 0.022719916864244995, |
| "learning_rate": 0.0001, |
| "loss": 2270146.5, |
| "loss/crossentropy": 2.7571001052856445, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.09479528665542603, |
| "loss/reg": 2270143.5, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.016425, |
| "grad_norm": 1.3879799842834473, |
| "grad_norm_var": 0.023133587662155185, |
| "learning_rate": 0.0001, |
| "loss": 2258389.75, |
| "loss/crossentropy": 2.764295816421509, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.08921615779399872, |
| "loss/reg": 2258386.75, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.01645, |
| "grad_norm": 1.4852516651153564, |
| "grad_norm_var": 0.01568069173080975, |
| "learning_rate": 0.0001, |
| "loss": 2246825.0, |
| "loss/crossentropy": 2.713266611099243, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.07621665298938751, |
| "loss/reg": 2246822.0, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.016475, |
| "grad_norm": 1.5711143016815186, |
| "grad_norm_var": 0.01667228421715231, |
| "learning_rate": 0.0001, |
| "loss": 2235355.5, |
| "loss/crossentropy": 3.096885919570923, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.10053817927837372, |
| "loss/reg": 2235352.0, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 1.4312915802001953, |
| "grad_norm_var": 0.016701845864124227, |
| "learning_rate": 0.0001, |
| "loss": 2224000.25, |
| "loss/crossentropy": 2.567885160446167, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.08533942699432373, |
| "loss/reg": 2223997.5, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.016525, |
| "grad_norm": 1.6274508237838745, |
| "grad_norm_var": 0.01784847679871806, |
| "learning_rate": 0.0001, |
| "loss": 2212235.5, |
| "loss/crossentropy": 3.113525867462158, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.09047147631645203, |
| "loss/reg": 2212232.25, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.01655, |
| "grad_norm": 1.7273080348968506, |
| "grad_norm_var": 0.021679422934408818, |
| "learning_rate": 0.0001, |
| "loss": 2200927.75, |
| "loss/crossentropy": 2.708819627761841, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.09549328684806824, |
| "loss/reg": 2200924.75, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.016575, |
| "grad_norm": 1.4437838792800903, |
| "grad_norm_var": 0.021705362078621646, |
| "learning_rate": 0.0001, |
| "loss": 2189703.5, |
| "loss/crossentropy": 2.662740468978882, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.08861515671014786, |
| "loss/reg": 2189700.5, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.0166, |
| "grad_norm": 1.836953043937683, |
| "grad_norm_var": 0.02437771268321184, |
| "learning_rate": 0.0001, |
| "loss": 2178561.5, |
| "loss/crossentropy": 2.8463456630706787, |
| "loss/hidden": 0.27734375, |
| "loss/logits": 0.08543583750724792, |
| "loss/reg": 2178558.5, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.016625, |
| "grad_norm": 1.5805258750915527, |
| "grad_norm_var": 0.02127880490721831, |
| "learning_rate": 0.0001, |
| "loss": 2167317.0, |
| "loss/crossentropy": 2.733736038208008, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.09479932487010956, |
| "loss/reg": 2167314.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.01665, |
| "grad_norm": 1.406731128692627, |
| "grad_norm_var": 0.021926414578393623, |
| "learning_rate": 0.0001, |
| "loss": 2156269.5, |
| "loss/crossentropy": 2.5590898990631104, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.08474697172641754, |
| "loss/reg": 2156266.75, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.016675, |
| "grad_norm": 1.2672498226165771, |
| "grad_norm_var": 0.02593823200260766, |
| "learning_rate": 0.0001, |
| "loss": 2145251.75, |
| "loss/crossentropy": 2.6033432483673096, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.09319953620433807, |
| "loss/reg": 2145249.0, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.0167, |
| "grad_norm": 1.365149974822998, |
| "grad_norm_var": 0.026878753660014295, |
| "learning_rate": 0.0001, |
| "loss": 2134018.25, |
| "loss/crossentropy": 2.7518537044525146, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.08686228096485138, |
| "loss/reg": 2134015.25, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.016725, |
| "grad_norm": 1.3523788452148438, |
| "grad_norm_var": 0.028256563870441672, |
| "learning_rate": 0.0001, |
| "loss": 2123115.0, |
| "loss/crossentropy": 2.671623468399048, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.09382728487253189, |
| "loss/reg": 2123112.0, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 1.3600950241088867, |
| "grad_norm_var": 0.029282914239964875, |
| "learning_rate": 0.0001, |
| "loss": 2112223.75, |
| "loss/crossentropy": 3.019261598587036, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.08051824569702148, |
| "loss/reg": 2112220.5, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.016775, |
| "grad_norm": 1.2829819917678833, |
| "grad_norm_var": 0.02856331722990717, |
| "learning_rate": 0.0001, |
| "loss": 2101444.25, |
| "loss/crossentropy": 2.4927473068237305, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.08024908602237701, |
| "loss/reg": 2101441.5, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.0168, |
| "grad_norm": 1.3175451755523682, |
| "grad_norm_var": 0.026474099429740505, |
| "learning_rate": 0.0001, |
| "loss": 2090783.25, |
| "loss/crossentropy": 2.7559521198272705, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.08549220114946365, |
| "loss/reg": 2090780.125, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.016825, |
| "grad_norm": 1.4911922216415405, |
| "grad_norm_var": 0.02607671543871053, |
| "learning_rate": 0.0001, |
| "loss": 2080020.375, |
| "loss/crossentropy": 2.666325330734253, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.09159214049577713, |
| "loss/reg": 2080017.375, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.01685, |
| "grad_norm": 1.2553210258483887, |
| "grad_norm_var": 0.028965135823322365, |
| "learning_rate": 0.0001, |
| "loss": 2069264.0, |
| "loss/crossentropy": 2.7345547676086426, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.08430467545986176, |
| "loss/reg": 2069260.875, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.016875, |
| "grad_norm": 1.6348696947097778, |
| "grad_norm_var": 0.030186541201639015, |
| "learning_rate": 0.0001, |
| "loss": 2058733.625, |
| "loss/crossentropy": 2.903059720993042, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.09770302474498749, |
| "loss/reg": 2058730.375, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.0169, |
| "grad_norm": 1.495425820350647, |
| "grad_norm_var": 0.030186992248218244, |
| "learning_rate": 0.0001, |
| "loss": 2047956.5, |
| "loss/crossentropy": 2.7519662380218506, |
| "loss/hidden": 0.287109375, |
| "loss/logits": 0.09977154433727264, |
| "loss/reg": 2047953.375, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.016925, |
| "grad_norm": 1.443090558052063, |
| "grad_norm_var": 0.028325646750880593, |
| "learning_rate": 0.0001, |
| "loss": 2037529.875, |
| "loss/crossentropy": 2.70141339302063, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.09986162185668945, |
| "loss/reg": 2037526.75, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.01695, |
| "grad_norm": 1.5672184228897095, |
| "grad_norm_var": 0.0240890694755952, |
| "learning_rate": 0.0001, |
| "loss": 2027123.125, |
| "loss/crossentropy": 2.952836036682129, |
| "loss/hidden": 0.2890625, |
| "loss/logits": 0.10231655836105347, |
| "loss/reg": 2027119.75, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.016975, |
| "grad_norm": 1.3892264366149902, |
| "grad_norm_var": 0.02427508819388837, |
| "learning_rate": 0.0001, |
| "loss": 2016799.625, |
| "loss/crossentropy": 2.668879747390747, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.09450025856494904, |
| "loss/reg": 2016796.625, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 1.5410244464874268, |
| "grad_norm_var": 0.01410049788804386, |
| "learning_rate": 0.0001, |
| "loss": 2006495.125, |
| "loss/crossentropy": 2.7758939266204834, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.11185262352228165, |
| "loss/reg": 2006492.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.017025, |
| "grad_norm": 1.372971773147583, |
| "grad_norm_var": 0.012402477418251578, |
| "learning_rate": 0.0001, |
| "loss": 1996098.75, |
| "loss/crossentropy": 2.7703683376312256, |
| "loss/hidden": 0.27734375, |
| "loss/logits": 0.10315593332052231, |
| "loss/reg": 1996095.625, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.01705, |
| "grad_norm": 1.5245753526687622, |
| "grad_norm_var": 0.013236281648512182, |
| "learning_rate": 0.0001, |
| "loss": 1985838.125, |
| "loss/crossentropy": 2.8839073181152344, |
| "loss/hidden": 0.279296875, |
| "loss/logits": 0.09429013729095459, |
| "loss/reg": 1985834.875, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.017075, |
| "grad_norm": 1.4698330163955688, |
| "grad_norm_var": 0.011776086515029267, |
| "learning_rate": 0.0001, |
| "loss": 1975638.0, |
| "loss/crossentropy": 2.58784818649292, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.08762283623218536, |
| "loss/reg": 1975635.0, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.0171, |
| "grad_norm": 1.3299224376678467, |
| "grad_norm_var": 0.012153228626881922, |
| "learning_rate": 0.0001, |
| "loss": 1965559.875, |
| "loss/crossentropy": 2.6097471714019775, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.08294347673654556, |
| "loss/reg": 1965556.875, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.017125, |
| "grad_norm": 1.5453976392745972, |
| "grad_norm_var": 0.012568267668038343, |
| "learning_rate": 0.0001, |
| "loss": 1955569.0, |
| "loss/crossentropy": 2.8411436080932617, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.09534774720668793, |
| "loss/reg": 1955565.75, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.01715, |
| "grad_norm": 1.3991481065750122, |
| "grad_norm_var": 0.012253801660424557, |
| "learning_rate": 0.0001, |
| "loss": 1945548.0, |
| "loss/crossentropy": 2.5438945293426514, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.09679535031318665, |
| "loss/reg": 1945545.125, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.017175, |
| "grad_norm": 1.522202730178833, |
| "grad_norm_var": 0.010782839892040326, |
| "learning_rate": 0.0001, |
| "loss": 1935343.5, |
| "loss/crossentropy": 2.807229518890381, |
| "loss/hidden": 0.279296875, |
| "loss/logits": 0.08741327375173569, |
| "loss/reg": 1935340.375, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.0172, |
| "grad_norm": 1.5719587802886963, |
| "grad_norm_var": 0.010125307901564835, |
| "learning_rate": 0.0001, |
| "loss": 1925302.0, |
| "loss/crossentropy": 2.3671627044677734, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.08417488634586334, |
| "loss/reg": 1925299.25, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.017225, |
| "grad_norm": 1.948159098625183, |
| "grad_norm_var": 0.02434059033064096, |
| "learning_rate": 0.0001, |
| "loss": 1915358.0, |
| "loss/crossentropy": 2.807757616043091, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.09410851448774338, |
| "loss/reg": 1915354.875, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 1.5537142753601074, |
| "grad_norm_var": 0.020145033152763798, |
| "learning_rate": 0.0001, |
| "loss": 1905466.75, |
| "loss/crossentropy": 2.467728853225708, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.08724828064441681, |
| "loss/reg": 1905463.875, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.017275, |
| "grad_norm": 1.6481668949127197, |
| "grad_norm_var": 0.02036099137757219, |
| "learning_rate": 0.0001, |
| "loss": 1895480.125, |
| "loss/crossentropy": 2.517580032348633, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.09558097273111343, |
| "loss/reg": 1895477.25, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.0173, |
| "grad_norm": 1.4165465831756592, |
| "grad_norm_var": 0.021009652774829548, |
| "learning_rate": 0.0001, |
| "loss": 1885636.375, |
| "loss/crossentropy": 2.7031702995300293, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.10026144236326218, |
| "loss/reg": 1885633.25, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.017325, |
| "grad_norm": 1.4102911949157715, |
| "grad_norm_var": 0.021392230791177743, |
| "learning_rate": 0.0001, |
| "loss": 1875895.625, |
| "loss/crossentropy": 2.758114814758301, |
| "loss/hidden": 0.279296875, |
| "loss/logits": 0.08785344660282135, |
| "loss/reg": 1875892.5, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.01735, |
| "grad_norm": 1.2953568696975708, |
| "grad_norm_var": 0.024051544419150635, |
| "learning_rate": 0.0001, |
| "loss": 1866247.625, |
| "loss/crossentropy": 2.647961378097534, |
| "loss/hidden": 0.259765625, |
| "loss/logits": 0.07807814329862595, |
| "loss/reg": 1866244.625, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.017375, |
| "grad_norm": 1.3350023031234741, |
| "grad_norm_var": 0.02500839868469497, |
| "learning_rate": 0.0001, |
| "loss": 1856635.375, |
| "loss/crossentropy": 2.789102554321289, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.08717681467533112, |
| "loss/reg": 1856632.25, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.0174, |
| "grad_norm": 1.416372537612915, |
| "grad_norm_var": 0.025177478274089952, |
| "learning_rate": 0.0001, |
| "loss": 1847069.625, |
| "loss/crossentropy": 2.754725217819214, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.08103245496749878, |
| "loss/reg": 1847066.5, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.017425, |
| "grad_norm": 1.778734564781189, |
| "grad_norm_var": 0.029408061613268425, |
| "learning_rate": 0.0001, |
| "loss": 1837560.375, |
| "loss/crossentropy": 3.615978956222534, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.10127097368240356, |
| "loss/reg": 1837556.375, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.01745, |
| "grad_norm": 2.715648651123047, |
| "grad_norm_var": 0.12033532174570111, |
| "learning_rate": 0.0001, |
| "loss": 1828083.0, |
| "loss/crossentropy": 2.980289936065674, |
| "loss/hidden": 0.396484375, |
| "loss/logits": 0.14694812893867493, |
| "loss/reg": 1828079.5, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.017475, |
| "grad_norm": 1.7156271934509277, |
| "grad_norm_var": 0.1203441885791977, |
| "learning_rate": 0.0001, |
| "loss": 1818683.25, |
| "loss/crossentropy": 2.4088120460510254, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.09742574393749237, |
| "loss/reg": 1818680.5, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 1.3298466205596924, |
| "grad_norm_var": 0.12034692055991224, |
| "learning_rate": 0.0001, |
| "loss": 1809326.625, |
| "loss/crossentropy": 2.6984193325042725, |
| "loss/hidden": 0.271484375, |
| "loss/logits": 0.08075416088104248, |
| "loss/reg": 1809323.5, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.017525, |
| "grad_norm": 1.4029943943023682, |
| "grad_norm_var": 0.12265365869172858, |
| "learning_rate": 0.0001, |
| "loss": 1800045.625, |
| "loss/crossentropy": 2.8971612453460693, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.0874754935503006, |
| "loss/reg": 1800042.375, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.01755, |
| "grad_norm": 1.4424177408218384, |
| "grad_norm_var": 0.12166246717631275, |
| "learning_rate": 0.0001, |
| "loss": 1790669.875, |
| "loss/crossentropy": 2.6962056159973145, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.08885666728019714, |
| "loss/reg": 1790666.75, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.017575, |
| "grad_norm": 1.360426425933838, |
| "grad_norm_var": 0.12484557643351521, |
| "learning_rate": 0.0001, |
| "loss": 1781452.0, |
| "loss/crossentropy": 2.8030874729156494, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.09631684422492981, |
| "loss/reg": 1781448.875, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.0176, |
| "grad_norm": 1.3303438425064087, |
| "grad_norm_var": 0.12887659081385602, |
| "learning_rate": 0.0001, |
| "loss": 1771971.625, |
| "loss/crossentropy": 2.7509398460388184, |
| "loss/hidden": 0.287109375, |
| "loss/logits": 0.0879264622926712, |
| "loss/reg": 1771968.5, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.017625, |
| "grad_norm": 1.3302745819091797, |
| "grad_norm_var": 0.121478646562222, |
| "learning_rate": 0.0001, |
| "loss": 1762800.25, |
| "loss/crossentropy": 2.6595773696899414, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.09676483273506165, |
| "loss/reg": 1762797.25, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.01765, |
| "grad_norm": 1.4356485605239868, |
| "grad_norm_var": 0.12197828997232325, |
| "learning_rate": 0.0001, |
| "loss": 1753640.75, |
| "loss/crossentropy": 3.1999614238739014, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.09220694750547409, |
| "loss/reg": 1753637.125, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.017675, |
| "grad_norm": 1.5354570150375366, |
| "grad_norm_var": 0.12088721394604622, |
| "learning_rate": 0.0001, |
| "loss": 1744514.25, |
| "loss/crossentropy": 3.041424036026001, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.09336571395397186, |
| "loss/reg": 1744510.875, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.0177, |
| "grad_norm": 1.3805537223815918, |
| "grad_norm_var": 0.12144396057639799, |
| "learning_rate": 0.0001, |
| "loss": 1735446.75, |
| "loss/crossentropy": 2.5854897499084473, |
| "loss/hidden": 0.287109375, |
| "loss/logits": 0.09615238010883331, |
| "loss/reg": 1735443.75, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.017725, |
| "grad_norm": 1.4598007202148438, |
| "grad_norm_var": 0.12091626509147435, |
| "learning_rate": 0.0001, |
| "loss": 1726416.875, |
| "loss/crossentropy": 2.6842403411865234, |
| "loss/hidden": 0.283203125, |
| "loss/logits": 0.09128544479608536, |
| "loss/reg": 1726413.875, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 1.6501895189285278, |
| "grad_norm_var": 0.11832140065959554, |
| "learning_rate": 0.0001, |
| "loss": 1717432.25, |
| "loss/crossentropy": 2.8198394775390625, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.09514346718788147, |
| "loss/reg": 1717429.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.017775, |
| "grad_norm": 1.7380198240280151, |
| "grad_norm_var": 0.11752654889572127, |
| "learning_rate": 0.0001, |
| "loss": 1708309.875, |
| "loss/crossentropy": 2.7238965034484863, |
| "loss/hidden": 0.30078125, |
| "loss/logits": 0.10496436059474945, |
| "loss/reg": 1708306.75, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.0178, |
| "grad_norm": 1.5756787061691284, |
| "grad_norm_var": 0.11597915624212476, |
| "learning_rate": 0.0001, |
| "loss": 1699377.5, |
| "loss/crossentropy": 2.994483709335327, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.0975675880908966, |
| "loss/reg": 1699374.125, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.017825, |
| "grad_norm": 1.560275673866272, |
| "grad_norm_var": 0.112994189896767, |
| "learning_rate": 0.0001, |
| "loss": 1690524.25, |
| "loss/crossentropy": 2.897491455078125, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.08929288387298584, |
| "loss/reg": 1690521.0, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.01785, |
| "grad_norm": 1.2879559993743896, |
| "grad_norm_var": 0.020438298954069487, |
| "learning_rate": 0.0001, |
| "loss": 1681702.125, |
| "loss/crossentropy": 2.7950549125671387, |
| "loss/hidden": 0.27734375, |
| "loss/logits": 0.08926893770694733, |
| "loss/reg": 1681699.0, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.017875, |
| "grad_norm": 1.490525245666504, |
| "grad_norm_var": 0.016262170201533383, |
| "learning_rate": 0.0001, |
| "loss": 1672951.625, |
| "loss/crossentropy": 2.6606791019439697, |
| "loss/hidden": 0.287109375, |
| "loss/logits": 0.09324029833078384, |
| "loss/reg": 1672948.625, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.0179, |
| "grad_norm": 1.3075960874557495, |
| "grad_norm_var": 0.016670048740370133, |
| "learning_rate": 0.0001, |
| "loss": 1664214.125, |
| "loss/crossentropy": 2.814445972442627, |
| "loss/hidden": 0.279296875, |
| "loss/logits": 0.08748777210712433, |
| "loss/reg": 1664210.875, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.017925, |
| "grad_norm": 1.282886028289795, |
| "grad_norm_var": 0.01841268150060949, |
| "learning_rate": 0.0001, |
| "loss": 1655466.75, |
| "loss/crossentropy": 2.548518657684326, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.08179747313261032, |
| "loss/reg": 1655463.875, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.01795, |
| "grad_norm": 1.3141812086105347, |
| "grad_norm_var": 0.01953596922449486, |
| "learning_rate": 0.0001, |
| "loss": 1646782.0, |
| "loss/crossentropy": 2.7778007984161377, |
| "loss/hidden": 0.283203125, |
| "loss/logits": 0.08982248604297638, |
| "loss/reg": 1646778.875, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.017975, |
| "grad_norm": 1.3043862581253052, |
| "grad_norm_var": 0.02032673877171168, |
| "learning_rate": 0.0001, |
| "loss": 1638170.25, |
| "loss/crossentropy": 2.8804094791412354, |
| "loss/hidden": 0.283203125, |
| "loss/logits": 0.0922764241695404, |
| "loss/reg": 1638167.0, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 1.3722143173217773, |
| "grad_norm_var": 0.019843747857923155, |
| "learning_rate": 0.0001, |
| "loss": 1629606.625, |
| "loss/crossentropy": 2.7471835613250732, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.09904342889785767, |
| "loss/reg": 1629603.5, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.018025, |
| "grad_norm": 1.3812546730041504, |
| "grad_norm_var": 0.019266441033083424, |
| "learning_rate": 0.0001, |
| "loss": 1621141.875, |
| "loss/crossentropy": 2.5772976875305176, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.08777609467506409, |
| "loss/reg": 1621138.875, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.01805, |
| "grad_norm": 1.3220105171203613, |
| "grad_norm_var": 0.0201741551288355, |
| "learning_rate": 0.0001, |
| "loss": 1612723.875, |
| "loss/crossentropy": 2.7439324855804443, |
| "loss/hidden": 0.279296875, |
| "loss/logits": 0.09025467932224274, |
| "loss/reg": 1612720.75, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.018075, |
| "grad_norm": 1.3902816772460938, |
| "grad_norm_var": 0.019550491929175885, |
| "learning_rate": 0.0001, |
| "loss": 1604333.375, |
| "loss/crossentropy": 3.090822696685791, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.0924043133854866, |
| "loss/reg": 1604329.875, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.0181, |
| "grad_norm": 1.4879413843154907, |
| "grad_norm_var": 0.019618912944197812, |
| "learning_rate": 0.0001, |
| "loss": 1595975.125, |
| "loss/crossentropy": 3.115288019180298, |
| "loss/hidden": 0.30078125, |
| "loss/logits": 0.09891839325428009, |
| "loss/reg": 1595971.625, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.018125, |
| "grad_norm": 1.580330729484558, |
| "grad_norm_var": 0.02096040065091902, |
| "learning_rate": 0.0001, |
| "loss": 1587522.75, |
| "loss/crossentropy": 2.840017318725586, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.08952584862709045, |
| "loss/reg": 1587519.5, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.01815, |
| "grad_norm": 1.6188689470291138, |
| "grad_norm_var": 0.02014543932713379, |
| "learning_rate": 0.0001, |
| "loss": 1579213.875, |
| "loss/crossentropy": 2.6023218631744385, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.09345083683729172, |
| "loss/reg": 1579210.875, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.018175, |
| "grad_norm": 1.624415636062622, |
| "grad_norm_var": 0.0164136567789825, |
| "learning_rate": 0.0001, |
| "loss": 1570745.875, |
| "loss/crossentropy": 2.5875203609466553, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.08750325441360474, |
| "loss/reg": 1570742.875, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.0182, |
| "grad_norm": 1.9229004383087158, |
| "grad_norm_var": 0.030633020970230913, |
| "learning_rate": 0.0001, |
| "loss": 1562491.375, |
| "loss/crossentropy": 2.869746685028076, |
| "loss/hidden": 0.2890625, |
| "loss/logits": 0.08150084316730499, |
| "loss/reg": 1562488.125, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.018225, |
| "grad_norm": 1.4346904754638672, |
| "grad_norm_var": 0.029822476337221578, |
| "learning_rate": 0.0001, |
| "loss": 1554275.875, |
| "loss/crossentropy": 2.5582826137542725, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.0940503478050232, |
| "loss/reg": 1554273.0, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 1.3634611368179321, |
| "grad_norm_var": 0.0285962382343986, |
| "learning_rate": 0.0001, |
| "loss": 1546131.5, |
| "loss/crossentropy": 2.7651305198669434, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.0977095291018486, |
| "loss/reg": 1546128.375, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.018275, |
| "grad_norm": 1.6234840154647827, |
| "grad_norm_var": 0.03042181748097903, |
| "learning_rate": 0.0001, |
| "loss": 1538049.25, |
| "loss/crossentropy": 2.7815349102020264, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.09932485222816467, |
| "loss/reg": 1538046.125, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.0183, |
| "grad_norm": 1.3555150032043457, |
| "grad_norm_var": 0.02960321294779735, |
| "learning_rate": 0.0001, |
| "loss": 1529881.125, |
| "loss/crossentropy": 2.652169942855835, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.08897815644741058, |
| "loss/reg": 1529878.125, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.018325, |
| "grad_norm": 1.366592288017273, |
| "grad_norm_var": 0.028051264680811035, |
| "learning_rate": 0.0001, |
| "loss": 1521844.625, |
| "loss/crossentropy": 2.373582363128662, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.09881549328565598, |
| "loss/reg": 1521841.875, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.01835, |
| "grad_norm": 1.5974054336547852, |
| "grad_norm_var": 0.027316185282404072, |
| "learning_rate": 0.0001, |
| "loss": 1513843.75, |
| "loss/crossentropy": 2.923377752304077, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.1185537725687027, |
| "loss/reg": 1513840.375, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.018375, |
| "grad_norm": 1.5125086307525635, |
| "grad_norm_var": 0.025036109290563737, |
| "learning_rate": 0.0001, |
| "loss": 1505878.625, |
| "loss/crossentropy": 2.700101613998413, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.09588044881820679, |
| "loss/reg": 1505875.5, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.0184, |
| "grad_norm": 1.7064833641052246, |
| "grad_norm_var": 0.026452773892465822, |
| "learning_rate": 0.0001, |
| "loss": 1497990.375, |
| "loss/crossentropy": 3.0229055881500244, |
| "loss/hidden": 0.3125, |
| "loss/logits": 0.09158538281917572, |
| "loss/reg": 1497987.0, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.018425, |
| "grad_norm": 1.3627216815948486, |
| "grad_norm_var": 0.02681216983450933, |
| "learning_rate": 0.0001, |
| "loss": 1490112.875, |
| "loss/crossentropy": 2.7470643520355225, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.09724022448062897, |
| "loss/reg": 1490109.75, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.01845, |
| "grad_norm": 2.0246007442474365, |
| "grad_norm_var": 0.03941185866335649, |
| "learning_rate": 0.0001, |
| "loss": 1482296.5, |
| "loss/crossentropy": 3.0359678268432617, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.09346877783536911, |
| "loss/reg": 1482293.125, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.018475, |
| "grad_norm": 1.4852879047393799, |
| "grad_norm_var": 0.03781642855113982, |
| "learning_rate": 0.0001, |
| "loss": 1474496.5, |
| "loss/crossentropy": 2.6871392726898193, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.09486332535743713, |
| "loss/reg": 1474493.5, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 1.4777488708496094, |
| "grad_norm_var": 0.037929955273074197, |
| "learning_rate": 0.0001, |
| "loss": 1466744.75, |
| "loss/crossentropy": 2.679792642593384, |
| "loss/hidden": 0.3046875, |
| "loss/logits": 0.1035013273358345, |
| "loss/reg": 1466741.75, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.018525, |
| "grad_norm": 1.5449753999710083, |
| "grad_norm_var": 0.03794082367796528, |
| "learning_rate": 0.0001, |
| "loss": 1458748.25, |
| "loss/crossentropy": 2.730419397354126, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.0843164399266243, |
| "loss/reg": 1458745.125, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.01855, |
| "grad_norm": 1.441712737083435, |
| "grad_norm_var": 0.03860283929822727, |
| "learning_rate": 0.0001, |
| "loss": 1451065.125, |
| "loss/crossentropy": 2.8322675228118896, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.09410291910171509, |
| "loss/reg": 1451061.875, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.018575, |
| "grad_norm": 1.4545038938522339, |
| "grad_norm_var": 0.0387843498530079, |
| "learning_rate": 0.0001, |
| "loss": 1443457.5, |
| "loss/crossentropy": 2.955693244934082, |
| "loss/hidden": 0.30078125, |
| "loss/logits": 0.09330207109451294, |
| "loss/reg": 1443454.125, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.0186, |
| "grad_norm": 1.4677369594573975, |
| "grad_norm_var": 0.028626281003366217, |
| "learning_rate": 0.0001, |
| "loss": 1435910.125, |
| "loss/crossentropy": 2.7071373462677, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.09569920599460602, |
| "loss/reg": 1435907.0, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.018625, |
| "grad_norm": 1.3055301904678345, |
| "grad_norm_var": 0.03102982805048094, |
| "learning_rate": 0.0001, |
| "loss": 1428393.25, |
| "loss/crossentropy": 2.5493621826171875, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.08571361750364304, |
| "loss/reg": 1428390.375, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.01865, |
| "grad_norm": 1.495094895362854, |
| "grad_norm_var": 0.029617357370123366, |
| "learning_rate": 0.0001, |
| "loss": 1420877.375, |
| "loss/crossentropy": 2.951340436935425, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.10248555988073349, |
| "loss/reg": 1420874.0, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.018675, |
| "grad_norm": 1.3942012786865234, |
| "grad_norm_var": 0.029551970311414523, |
| "learning_rate": 0.0001, |
| "loss": 1413380.25, |
| "loss/crossentropy": 2.8395426273345947, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.09555436670780182, |
| "loss/reg": 1413377.0, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.0187, |
| "grad_norm": 2.2666401863098145, |
| "grad_norm_var": 0.06393980615764552, |
| "learning_rate": 0.0001, |
| "loss": 1405928.0, |
| "loss/crossentropy": 2.5989503860473633, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.09109900891780853, |
| "loss/reg": 1405925.0, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.018725, |
| "grad_norm": 1.503852367401123, |
| "grad_norm_var": 0.06164205269133583, |
| "learning_rate": 0.0001, |
| "loss": 1398508.875, |
| "loss/crossentropy": 2.908634901046753, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.08344586193561554, |
| "loss/reg": 1398505.625, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 1.4267386198043823, |
| "grad_norm_var": 0.06272652467218881, |
| "learning_rate": 0.0001, |
| "loss": 1391127.625, |
| "loss/crossentropy": 2.618389368057251, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.10475891828536987, |
| "loss/reg": 1391124.625, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.018775, |
| "grad_norm": 1.495357632637024, |
| "grad_norm_var": 0.06284069774018229, |
| "learning_rate": 0.0001, |
| "loss": 1383656.125, |
| "loss/crossentropy": 2.6676816940307617, |
| "loss/hidden": 0.279296875, |
| "loss/logits": 0.08022835850715637, |
| "loss/reg": 1383653.125, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.0188, |
| "grad_norm": 1.5672823190689087, |
| "grad_norm_var": 0.06120909994798822, |
| "learning_rate": 0.0001, |
| "loss": 1376294.75, |
| "loss/crossentropy": 2.6863832473754883, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.10240334272384644, |
| "loss/reg": 1376291.75, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.018825, |
| "grad_norm": 1.6938062906265259, |
| "grad_norm_var": 0.060030151225192147, |
| "learning_rate": 0.0001, |
| "loss": 1369030.25, |
| "loss/crossentropy": 2.991760492324829, |
| "loss/hidden": 0.33203125, |
| "loss/logits": 0.10976439714431763, |
| "loss/reg": 1369026.75, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.01885, |
| "grad_norm": 1.4029737710952759, |
| "grad_norm_var": 0.046114309728728625, |
| "learning_rate": 0.0001, |
| "loss": 1361782.625, |
| "loss/crossentropy": 2.539726972579956, |
| "loss/hidden": 0.30078125, |
| "loss/logits": 0.09395290911197662, |
| "loss/reg": 1361779.75, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.018875, |
| "grad_norm": 1.9776279926300049, |
| "grad_norm_var": 0.058561134125315564, |
| "learning_rate": 0.0001, |
| "loss": 1354602.75, |
| "loss/crossentropy": 2.583235502243042, |
| "loss/hidden": 0.349609375, |
| "loss/logits": 0.1163034588098526, |
| "loss/reg": 1354599.625, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.0189, |
| "grad_norm": 1.5575007200241089, |
| "grad_norm_var": 0.058113418806139745, |
| "learning_rate": 0.0001, |
| "loss": 1347304.125, |
| "loss/crossentropy": 2.5159335136413574, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.08959998190402985, |
| "loss/reg": 1347301.25, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.018925, |
| "grad_norm": 1.2972793579101562, |
| "grad_norm_var": 0.06251755592825793, |
| "learning_rate": 0.0001, |
| "loss": 1340198.0, |
| "loss/crossentropy": 2.5743250846862793, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.08666753768920898, |
| "loss/reg": 1340195.0, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.01895, |
| "grad_norm": 1.4923604726791382, |
| "grad_norm_var": 0.061968628425057425, |
| "learning_rate": 0.0001, |
| "loss": 1333169.0, |
| "loss/crossentropy": 2.6868984699249268, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.09011873602867126, |
| "loss/reg": 1333166.0, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.018975, |
| "grad_norm": 1.3754783868789673, |
| "grad_norm_var": 0.06336416352655191, |
| "learning_rate": 0.0001, |
| "loss": 1326126.625, |
| "loss/crossentropy": 2.7182505130767822, |
| "loss/hidden": 0.283203125, |
| "loss/logits": 0.084558866918087, |
| "loss/reg": 1326123.5, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 1.4301767349243164, |
| "grad_norm_var": 0.06383910373089421, |
| "learning_rate": 0.0001, |
| "loss": 1318915.375, |
| "loss/crossentropy": 2.770848274230957, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.10345181077718735, |
| "loss/reg": 1318912.25, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.019025, |
| "grad_norm": 1.422809362411499, |
| "grad_norm_var": 0.0609913461441403, |
| "learning_rate": 0.0001, |
| "loss": 1311936.75, |
| "loss/crossentropy": 2.5759620666503906, |
| "loss/hidden": 0.3046875, |
| "loss/logits": 0.08820918947458267, |
| "loss/reg": 1311933.75, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.01905, |
| "grad_norm": 1.386316180229187, |
| "grad_norm_var": 0.06252648796074087, |
| "learning_rate": 0.0001, |
| "loss": 1304969.75, |
| "loss/crossentropy": 2.6134355068206787, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.10319659113883972, |
| "loss/reg": 1304966.75, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.019075, |
| "grad_norm": 1.3958642482757568, |
| "grad_norm_var": 0.0624936344867729, |
| "learning_rate": 0.0001, |
| "loss": 1297955.625, |
| "loss/crossentropy": 2.6670870780944824, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.09877242147922516, |
| "loss/reg": 1297952.625, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.0191, |
| "grad_norm": 1.4924622774124146, |
| "grad_norm_var": 0.025282489292838287, |
| "learning_rate": 0.0001, |
| "loss": 1291096.625, |
| "loss/crossentropy": 2.56126070022583, |
| "loss/hidden": 0.2890625, |
| "loss/logits": 0.08578027784824371, |
| "loss/reg": 1291093.75, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.019125, |
| "grad_norm": 1.4032528400421143, |
| "grad_norm_var": 0.025794495039517785, |
| "learning_rate": 0.0001, |
| "loss": 1284286.5, |
| "loss/crossentropy": 2.8328068256378174, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.09765258431434631, |
| "loss/reg": 1284283.25, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.01915, |
| "grad_norm": 1.4581797122955322, |
| "grad_norm_var": 0.025597028970939975, |
| "learning_rate": 0.0001, |
| "loss": 1277566.75, |
| "loss/crossentropy": 2.957227945327759, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.10244441032409668, |
| "loss/reg": 1277563.375, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.019175, |
| "grad_norm": 1.64083731174469, |
| "grad_norm_var": 0.027013142007775968, |
| "learning_rate": 0.0001, |
| "loss": 1270829.0, |
| "loss/crossentropy": 3.119053602218628, |
| "loss/hidden": 0.3203125, |
| "loss/logits": 0.09958397597074509, |
| "loss/reg": 1270825.5, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.0192, |
| "grad_norm": 1.3009679317474365, |
| "grad_norm_var": 0.02904389746358878, |
| "learning_rate": 0.0001, |
| "loss": 1263982.125, |
| "loss/crossentropy": 2.7419540882110596, |
| "loss/hidden": 0.2890625, |
| "loss/logits": 0.09389052540063858, |
| "loss/reg": 1263979.0, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.019225, |
| "grad_norm": 1.5036202669143677, |
| "grad_norm_var": 0.025958744351864262, |
| "learning_rate": 0.0001, |
| "loss": 1257289.375, |
| "loss/crossentropy": 2.9188363552093506, |
| "loss/hidden": 0.302734375, |
| "loss/logits": 0.09368471056222916, |
| "loss/reg": 1257286.125, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 1.5635441541671753, |
| "grad_norm_var": 0.026111487789617627, |
| "learning_rate": 0.0001, |
| "loss": 1250638.75, |
| "loss/crossentropy": 2.8412418365478516, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.08815594017505646, |
| "loss/reg": 1250635.5, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.019275, |
| "grad_norm": 1.5642369985580444, |
| "grad_norm_var": 0.009426553673497474, |
| "learning_rate": 0.0001, |
| "loss": 1244009.25, |
| "loss/crossentropy": 2.3495378494262695, |
| "loss/hidden": 0.318359375, |
| "loss/logits": 0.08667473495006561, |
| "loss/reg": 1244006.5, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.0193, |
| "grad_norm": 1.3931305408477783, |
| "grad_norm_var": 0.008875436597914661, |
| "learning_rate": 0.0001, |
| "loss": 1237421.75, |
| "loss/crossentropy": 2.9413013458251953, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.10597728192806244, |
| "loss/reg": 1237418.375, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.019325, |
| "grad_norm": 1.387599229812622, |
| "grad_norm_var": 0.007605954661941183, |
| "learning_rate": 0.0001, |
| "loss": 1230903.875, |
| "loss/crossentropy": 2.8374476432800293, |
| "loss/hidden": 0.3203125, |
| "loss/logits": 0.09825296700000763, |
| "loss/reg": 1230900.625, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.01935, |
| "grad_norm": 1.465582251548767, |
| "grad_norm_var": 0.007501944869941468, |
| "learning_rate": 0.0001, |
| "loss": 1224416.25, |
| "loss/crossentropy": 2.7227585315704346, |
| "loss/hidden": 0.3046875, |
| "loss/logits": 0.09277403354644775, |
| "loss/reg": 1224413.125, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.019375, |
| "grad_norm": 1.6337682008743286, |
| "grad_norm_var": 0.009139436356209056, |
| "learning_rate": 0.0001, |
| "loss": 1217947.375, |
| "loss/crossentropy": 2.9455771446228027, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.09985758364200592, |
| "loss/reg": 1217944.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.0194, |
| "grad_norm": 1.2852834463119507, |
| "grad_norm_var": 0.011127155114268265, |
| "learning_rate": 0.0001, |
| "loss": 1211541.875, |
| "loss/crossentropy": 2.722407102584839, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.09458113461732864, |
| "loss/reg": 1211538.75, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.019425, |
| "grad_norm": 1.3524373769760132, |
| "grad_norm_var": 0.011748947343355808, |
| "learning_rate": 0.0001, |
| "loss": 1205176.375, |
| "loss/crossentropy": 2.8407390117645264, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.07722481340169907, |
| "loss/reg": 1205173.125, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.01945, |
| "grad_norm": 1.4920310974121094, |
| "grad_norm_var": 0.0115259221637008, |
| "learning_rate": 0.0001, |
| "loss": 1198818.5, |
| "loss/crossentropy": 2.814127206802368, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.10287363082170486, |
| "loss/reg": 1198815.25, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.019475, |
| "grad_norm": 1.6447800397872925, |
| "grad_norm_var": 0.013326202563744498, |
| "learning_rate": 0.0001, |
| "loss": 1192490.875, |
| "loss/crossentropy": 2.394585132598877, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.10890543460845947, |
| "loss/reg": 1192488.125, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 1.3950115442276, |
| "grad_norm_var": 0.013677997872572215, |
| "learning_rate": 0.0001, |
| "loss": 1186171.25, |
| "loss/crossentropy": 2.6615471839904785, |
| "loss/hidden": 0.287109375, |
| "loss/logits": 0.08540309220552444, |
| "loss/reg": 1186168.25, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.019525, |
| "grad_norm": 1.5021816492080688, |
| "grad_norm_var": 0.013438712633327002, |
| "learning_rate": 0.0001, |
| "loss": 1179875.125, |
| "loss/crossentropy": 3.0551717281341553, |
| "loss/hidden": 0.322265625, |
| "loss/logits": 0.10148952901363373, |
| "loss/reg": 1179871.75, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.01955, |
| "grad_norm": 1.3547725677490234, |
| "grad_norm_var": 0.014324455095329765, |
| "learning_rate": 0.0001, |
| "loss": 1173610.25, |
| "loss/crossentropy": 2.6172921657562256, |
| "loss/hidden": 0.3203125, |
| "loss/logits": 0.10562514513731003, |
| "loss/reg": 1173607.25, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.019575, |
| "grad_norm": 1.3919837474822998, |
| "grad_norm_var": 0.012443100067775721, |
| "learning_rate": 0.0001, |
| "loss": 1167396.5, |
| "loss/crossentropy": 2.5991079807281494, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.08324473351240158, |
| "loss/reg": 1167393.5, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.0196, |
| "grad_norm": 1.3813422918319702, |
| "grad_norm_var": 0.0112290209550124, |
| "learning_rate": 0.0001, |
| "loss": 1161233.375, |
| "loss/crossentropy": 2.727051019668579, |
| "loss/hidden": 0.3125, |
| "loss/logits": 0.10817626863718033, |
| "loss/reg": 1161230.25, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.019625, |
| "grad_norm": 1.4223248958587646, |
| "grad_norm_var": 0.01113627405784238, |
| "learning_rate": 0.0001, |
| "loss": 1155096.25, |
| "loss/crossentropy": 2.914677858352661, |
| "loss/hidden": 0.302734375, |
| "loss/logits": 0.08892448246479034, |
| "loss/reg": 1155093.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.01965, |
| "grad_norm": 1.5389269590377808, |
| "grad_norm_var": 0.010807620661734064, |
| "learning_rate": 0.0001, |
| "loss": 1148997.125, |
| "loss/crossentropy": 2.7384488582611084, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.11149714887142181, |
| "loss/reg": 1148993.875, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.019675, |
| "grad_norm": 1.5660094022750854, |
| "grad_norm_var": 0.010834733891906012, |
| "learning_rate": 0.0001, |
| "loss": 1142947.75, |
| "loss/crossentropy": 2.450967311859131, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.09985432773828506, |
| "loss/reg": 1142944.875, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.0197, |
| "grad_norm": 1.561070442199707, |
| "grad_norm_var": 0.011314024115481658, |
| "learning_rate": 0.0001, |
| "loss": 1136928.75, |
| "loss/crossentropy": 2.735142946243286, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.09043477475643158, |
| "loss/reg": 1136925.625, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.019725, |
| "grad_norm": 1.511995553970337, |
| "grad_norm_var": 0.011064666464858182, |
| "learning_rate": 0.0001, |
| "loss": 1130955.125, |
| "loss/crossentropy": 2.7919270992279053, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.0940331518650055, |
| "loss/reg": 1130952.0, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 1.234368920326233, |
| "grad_norm_var": 0.014502587941771594, |
| "learning_rate": 0.0001, |
| "loss": 1124976.75, |
| "loss/crossentropy": 2.707932949066162, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.09382109344005585, |
| "loss/reg": 1124973.625, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.019775, |
| "grad_norm": 1.2562097311019897, |
| "grad_norm_var": 0.014375745427114594, |
| "learning_rate": 0.0001, |
| "loss": 1118810.375, |
| "loss/crossentropy": 2.620636463165283, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.09227173030376434, |
| "loss/reg": 1118807.375, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.0198, |
| "grad_norm": 1.3169291019439697, |
| "grad_norm_var": 0.013824886306606684, |
| "learning_rate": 0.0001, |
| "loss": 1112859.0, |
| "loss/crossentropy": 2.735628128051758, |
| "loss/hidden": 0.302734375, |
| "loss/logits": 0.08817663788795471, |
| "loss/reg": 1112855.875, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.019825, |
| "grad_norm": 1.271102786064148, |
| "grad_norm_var": 0.015108201593608268, |
| "learning_rate": 0.0001, |
| "loss": 1106920.75, |
| "loss/crossentropy": 2.761826515197754, |
| "loss/hidden": 0.29296875, |
| "loss/logits": 0.08847036212682724, |
| "loss/reg": 1106917.625, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.01985, |
| "grad_norm": 1.448644757270813, |
| "grad_norm_var": 0.014852923860655428, |
| "learning_rate": 0.0001, |
| "loss": 1101017.25, |
| "loss/crossentropy": 2.816882610321045, |
| "loss/hidden": 0.322265625, |
| "loss/logits": 0.10003934800624847, |
| "loss/reg": 1101014.0, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.019875, |
| "grad_norm": 1.4269227981567383, |
| "grad_norm_var": 0.011430936803937126, |
| "learning_rate": 0.0001, |
| "loss": 1095140.875, |
| "loss/crossentropy": 2.4692585468292236, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.091438889503479, |
| "loss/reg": 1095138.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.0199, |
| "grad_norm": 1.7071237564086914, |
| "grad_norm_var": 0.01684407875522401, |
| "learning_rate": 0.0001, |
| "loss": 1089289.5, |
| "loss/crossentropy": 2.5952391624450684, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.10839910805225372, |
| "loss/reg": 1089286.375, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.019925, |
| "grad_norm": 1.450836181640625, |
| "grad_norm_var": 0.016519786763948652, |
| "learning_rate": 0.0001, |
| "loss": 1083514.75, |
| "loss/crossentropy": 2.6975083351135254, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.08662910014390945, |
| "loss/reg": 1083511.625, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.01995, |
| "grad_norm": 1.597856044769287, |
| "grad_norm_var": 0.017854564756884805, |
| "learning_rate": 0.0001, |
| "loss": 1077768.25, |
| "loss/crossentropy": 2.5531835556030273, |
| "loss/hidden": 0.3203125, |
| "loss/logits": 0.10884322971105576, |
| "loss/reg": 1077765.375, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.019975, |
| "grad_norm": 1.5088669061660767, |
| "grad_norm_var": 0.01791760011660983, |
| "learning_rate": 0.0001, |
| "loss": 1072064.25, |
| "loss/crossentropy": 2.4319939613342285, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.12858393788337708, |
| "loss/reg": 1072061.375, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 1.5886433124542236, |
| "grad_norm_var": 0.018704832298241967, |
| "learning_rate": 0.0001, |
| "loss": 1066403.375, |
| "loss/crossentropy": 2.721888542175293, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.1048218160867691, |
| "loss/reg": 1066400.125, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.020025, |
| "grad_norm": 1.397333025932312, |
| "grad_norm_var": 0.01887937390174083, |
| "learning_rate": 0.0001, |
| "loss": 1060640.75, |
| "loss/crossentropy": 2.718273639678955, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.09427271783351898, |
| "loss/reg": 1060637.625, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.02005, |
| "grad_norm": 1.5306111574172974, |
| "grad_norm_var": 0.01879776656405904, |
| "learning_rate": 0.0001, |
| "loss": 1055024.625, |
| "loss/crossentropy": 2.5766334533691406, |
| "loss/hidden": 0.32421875, |
| "loss/logits": 0.09915133565664291, |
| "loss/reg": 1055021.625, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.020075, |
| "grad_norm": 1.5015225410461426, |
| "grad_norm_var": 0.018153986114304566, |
| "learning_rate": 0.0001, |
| "loss": 1049416.125, |
| "loss/crossentropy": 2.781911611557007, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.09055100381374359, |
| "loss/reg": 1049413.0, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.0201, |
| "grad_norm": 1.362334132194519, |
| "grad_norm_var": 0.017861565949474568, |
| "learning_rate": 0.0001, |
| "loss": 1043867.875, |
| "loss/crossentropy": 2.722330093383789, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.09407853335142136, |
| "loss/reg": 1043864.6875, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.020125, |
| "grad_norm": 1.5510976314544678, |
| "grad_norm_var": 0.01830925011105909, |
| "learning_rate": 0.0001, |
| "loss": 1038237.5, |
| "loss/crossentropy": 2.764539957046509, |
| "loss/hidden": 0.318359375, |
| "loss/logits": 0.0987626165151596, |
| "loss/reg": 1038234.3125, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.02015, |
| "grad_norm": 1.9383140802383423, |
| "grad_norm_var": 0.0293323780554239, |
| "learning_rate": 0.0001, |
| "loss": 1032701.375, |
| "loss/crossentropy": 2.852473258972168, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.1315414011478424, |
| "loss/reg": 1032698.0625, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.020175, |
| "grad_norm": 1.42233407497406, |
| "grad_norm_var": 0.02585891229697547, |
| "learning_rate": 0.0001, |
| "loss": 1027211.5, |
| "loss/crossentropy": 2.926844596862793, |
| "loss/hidden": 0.32421875, |
| "loss/logits": 0.10032892227172852, |
| "loss/reg": 1027208.125, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.0202, |
| "grad_norm": 1.3749264478683472, |
| "grad_norm_var": 0.024643565075129874, |
| "learning_rate": 0.0001, |
| "loss": 1021757.375, |
| "loss/crossentropy": 2.604473114013672, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.10353030264377594, |
| "loss/reg": 1021754.3125, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.020225, |
| "grad_norm": 1.7052720785140991, |
| "grad_norm_var": 0.02289040624687188, |
| "learning_rate": 0.0001, |
| "loss": 1016303.5625, |
| "loss/crossentropy": 2.7987732887268066, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.11831298470497131, |
| "loss/reg": 1016300.3125, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 1.324113368988037, |
| "grad_norm_var": 0.02524436934666762, |
| "learning_rate": 0.0001, |
| "loss": 1010888.125, |
| "loss/crossentropy": 2.2193870544433594, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.07667031139135361, |
| "loss/reg": 1010885.5, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.020275, |
| "grad_norm": 1.4208396673202515, |
| "grad_norm_var": 0.025325628125159896, |
| "learning_rate": 0.0001, |
| "loss": 1005486.5625, |
| "loss/crossentropy": 2.596322774887085, |
| "loss/hidden": 0.318359375, |
| "loss/logits": 0.1107511892914772, |
| "loss/reg": 1005483.5, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.0203, |
| "grad_norm": 1.345226526260376, |
| "grad_norm_var": 0.024669006407863906, |
| "learning_rate": 0.0001, |
| "loss": 1000114.9375, |
| "loss/crossentropy": 2.8060107231140137, |
| "loss/hidden": 0.322265625, |
| "loss/logits": 0.09114284068346024, |
| "loss/reg": 1000111.6875, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.020325, |
| "grad_norm": 1.3228507041931152, |
| "grad_norm_var": 0.026553207915559027, |
| "learning_rate": 0.0001, |
| "loss": 994774.75, |
| "loss/crossentropy": 2.67539119720459, |
| "loss/hidden": 0.3125, |
| "loss/logits": 0.10133929550647736, |
| "loss/reg": 994771.625, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.02035, |
| "grad_norm": 1.3640375137329102, |
| "grad_norm_var": 0.026709250543470318, |
| "learning_rate": 0.0001, |
| "loss": 989477.9375, |
| "loss/crossentropy": 2.671464443206787, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.10128870606422424, |
| "loss/reg": 989474.8125, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.020375, |
| "grad_norm": 1.5935741662979126, |
| "grad_norm_var": 0.027499041117244792, |
| "learning_rate": 0.0001, |
| "loss": 984168.375, |
| "loss/crossentropy": 2.7444188594818115, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.09428627789020538, |
| "loss/reg": 984165.1875, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.0204, |
| "grad_norm": 1.4187277555465698, |
| "grad_norm_var": 0.02693138737542332, |
| "learning_rate": 0.0001, |
| "loss": 978899.625, |
| "loss/crossentropy": 2.8652889728546143, |
| "loss/hidden": 0.333984375, |
| "loss/logits": 0.10547365993261337, |
| "loss/reg": 978896.3125, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.020425, |
| "grad_norm": 1.335964322090149, |
| "grad_norm_var": 0.027788530063783647, |
| "learning_rate": 0.0001, |
| "loss": 973632.9375, |
| "loss/crossentropy": 2.5848679542541504, |
| "loss/hidden": 0.302734375, |
| "loss/logits": 0.08393712341785431, |
| "loss/reg": 973630.0, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.02045, |
| "grad_norm": 1.5134330987930298, |
| "grad_norm_var": 0.027666967111131414, |
| "learning_rate": 0.0001, |
| "loss": 968422.5625, |
| "loss/crossentropy": 2.580274820327759, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.10585898905992508, |
| "loss/reg": 968419.5625, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.020475, |
| "grad_norm": 2.1845602989196777, |
| "grad_norm_var": 0.05984132255864907, |
| "learning_rate": 0.0001, |
| "loss": 963242.75, |
| "loss/crossentropy": 3.1511080265045166, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.10218004882335663, |
| "loss/reg": 963239.1875, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 1.5324558019638062, |
| "grad_norm_var": 0.05827571093695549, |
| "learning_rate": 0.0001, |
| "loss": 958125.0625, |
| "loss/crossentropy": 2.8154571056365967, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.12289270013570786, |
| "loss/reg": 958121.75, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.020525, |
| "grad_norm": 1.556467056274414, |
| "grad_norm_var": 0.058298535701974664, |
| "learning_rate": 0.0001, |
| "loss": 953012.3125, |
| "loss/crossentropy": 3.0191445350646973, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.11106862127780914, |
| "loss/reg": 953008.875, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.02055, |
| "grad_norm": 1.4595040082931519, |
| "grad_norm_var": 0.04605355552315752, |
| "learning_rate": 0.0001, |
| "loss": 947911.75, |
| "loss/crossentropy": 2.804565906524658, |
| "loss/hidden": 0.333984375, |
| "loss/logits": 0.09651564061641693, |
| "loss/reg": 947908.5, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.020575, |
| "grad_norm": 1.3791453838348389, |
| "grad_norm_var": 0.046572128210712015, |
| "learning_rate": 0.0001, |
| "loss": 942830.6875, |
| "loss/crossentropy": 2.4445128440856934, |
| "loss/hidden": 0.302734375, |
| "loss/logits": 0.08769936114549637, |
| "loss/reg": 942827.875, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.0206, |
| "grad_norm": 1.5223854780197144, |
| "grad_norm_var": 0.045679592731233204, |
| "learning_rate": 0.0001, |
| "loss": 937800.0, |
| "loss/crossentropy": 2.5925703048706055, |
| "loss/hidden": 0.322265625, |
| "loss/logits": 0.08651132881641388, |
| "loss/reg": 937797.0, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.020625, |
| "grad_norm": 1.5347764492034912, |
| "grad_norm_var": 0.04279952542354977, |
| "learning_rate": 0.0001, |
| "loss": 932779.625, |
| "loss/crossentropy": 2.716421604156494, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.1000697985291481, |
| "loss/reg": 932776.5, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.02065, |
| "grad_norm": 1.44983971118927, |
| "grad_norm_var": 0.04104008990080619, |
| "learning_rate": 0.0001, |
| "loss": 927537.625, |
| "loss/crossentropy": 2.7922215461730957, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.1129508763551712, |
| "loss/reg": 927534.375, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.020675, |
| "grad_norm": 1.3995459079742432, |
| "grad_norm_var": 0.04128142919209097, |
| "learning_rate": 0.0001, |
| "loss": 922552.8125, |
| "loss/crossentropy": 2.769749164581299, |
| "loss/hidden": 0.318359375, |
| "loss/logits": 0.08981427550315857, |
| "loss/reg": 922549.625, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.0207, |
| "grad_norm": 1.382501244544983, |
| "grad_norm_var": 0.04062622991926705, |
| "learning_rate": 0.0001, |
| "loss": 917603.6875, |
| "loss/crossentropy": 2.8601768016815186, |
| "loss/hidden": 0.328125, |
| "loss/logits": 0.1030910313129425, |
| "loss/reg": 917600.375, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.020725, |
| "grad_norm": 1.3095468282699585, |
| "grad_norm_var": 0.04094595938651437, |
| "learning_rate": 0.0001, |
| "loss": 912653.25, |
| "loss/crossentropy": 2.443389654159546, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.08721599727869034, |
| "loss/reg": 912650.4375, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 1.5734272003173828, |
| "grad_norm_var": 0.040001189358893185, |
| "learning_rate": 0.0001, |
| "loss": 907724.0, |
| "loss/crossentropy": 3.173234224319458, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.10380718857049942, |
| "loss/reg": 907720.375, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.020775, |
| "grad_norm": 1.5119266510009766, |
| "grad_norm_var": 0.03949839335605905, |
| "learning_rate": 0.0001, |
| "loss": 902822.1875, |
| "loss/crossentropy": 3.0053024291992188, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.10051050037145615, |
| "loss/reg": 902818.75, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.0208, |
| "grad_norm": 1.5602474212646484, |
| "grad_norm_var": 0.03914086088575815, |
| "learning_rate": 0.0001, |
| "loss": 897974.4375, |
| "loss/crossentropy": 2.5293502807617188, |
| "loss/hidden": 0.32421875, |
| "loss/logits": 0.10388641059398651, |
| "loss/reg": 897971.5, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.020825, |
| "grad_norm": 1.5012003183364868, |
| "grad_norm_var": 0.03695006877639813, |
| "learning_rate": 0.0001, |
| "loss": 893135.75, |
| "loss/crossentropy": 2.7674946784973145, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.09107282757759094, |
| "loss/reg": 893132.5625, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.02085, |
| "grad_norm": 2.0714211463928223, |
| "grad_norm_var": 0.05568394590641829, |
| "learning_rate": 0.0001, |
| "loss": 888323.0, |
| "loss/crossentropy": 2.7505407333374023, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.10203810036182404, |
| "loss/reg": 888319.8125, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.020875, |
| "grad_norm": 1.5379499197006226, |
| "grad_norm_var": 0.027801912194789224, |
| "learning_rate": 0.0001, |
| "loss": 883551.4375, |
| "loss/crossentropy": 2.743110179901123, |
| "loss/hidden": 0.345703125, |
| "loss/logits": 0.09928973019123077, |
| "loss/reg": 883548.25, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.0209, |
| "grad_norm": 1.4941405057907104, |
| "grad_norm_var": 0.02781800858368418, |
| "learning_rate": 0.0001, |
| "loss": 878780.25, |
| "loss/crossentropy": 2.7615342140197754, |
| "loss/hidden": 0.314453125, |
| "loss/logits": 0.08403706550598145, |
| "loss/reg": 878777.125, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.020925, |
| "grad_norm": 1.326443076133728, |
| "grad_norm_var": 0.029860874900802928, |
| "learning_rate": 0.0001, |
| "loss": 874031.4375, |
| "loss/crossentropy": 2.766791582107544, |
| "loss/hidden": 0.328125, |
| "loss/logits": 0.10534004122018814, |
| "loss/reg": 874028.25, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.02095, |
| "grad_norm": 1.5630557537078857, |
| "grad_norm_var": 0.029959853950456507, |
| "learning_rate": 0.0001, |
| "loss": 869294.0625, |
| "loss/crossentropy": 2.536632537841797, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.10914185643196106, |
| "loss/reg": 869291.0625, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.020975, |
| "grad_norm": 1.3620904684066772, |
| "grad_norm_var": 0.03026956250526016, |
| "learning_rate": 0.0001, |
| "loss": 864577.0, |
| "loss/crossentropy": 2.8264501094818115, |
| "loss/hidden": 0.33203125, |
| "loss/logits": 0.10620096325874329, |
| "loss/reg": 864573.75, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 1.4597526788711548, |
| "grad_norm_var": 0.03038025367330821, |
| "learning_rate": 0.0001, |
| "loss": 859822.0, |
| "loss/crossentropy": 2.7047665119171143, |
| "loss/hidden": 0.3125, |
| "loss/logits": 0.10196779668331146, |
| "loss/reg": 859818.875, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.021025, |
| "grad_norm": 1.4456199407577515, |
| "grad_norm_var": 0.030491785424536743, |
| "learning_rate": 0.0001, |
| "loss": 855168.5625, |
| "loss/crossentropy": 2.6905078887939453, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.11562594771385193, |
| "loss/reg": 855165.4375, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.02105, |
| "grad_norm": 1.4309810400009155, |
| "grad_norm_var": 0.030632080332832365, |
| "learning_rate": 0.0001, |
| "loss": 850570.0625, |
| "loss/crossentropy": 3.0000832080841064, |
| "loss/hidden": 0.337890625, |
| "loss/logits": 0.11315964162349701, |
| "loss/reg": 850566.625, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.021075, |
| "grad_norm": 1.5102511644363403, |
| "grad_norm_var": 0.029980002247365137, |
| "learning_rate": 0.0001, |
| "loss": 845990.3125, |
| "loss/crossentropy": 2.535738945007324, |
| "loss/hidden": 0.328125, |
| "loss/logits": 0.118813157081604, |
| "loss/reg": 845987.3125, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.0211, |
| "grad_norm": 1.5017589330673218, |
| "grad_norm_var": 0.02896024686500116, |
| "learning_rate": 0.0001, |
| "loss": 841451.9375, |
| "loss/crossentropy": 2.733320474624634, |
| "loss/hidden": 0.345703125, |
| "loss/logits": 0.09046341478824615, |
| "loss/reg": 841448.75, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.021125, |
| "grad_norm": 1.4743220806121826, |
| "grad_norm_var": 0.026253470885225737, |
| "learning_rate": 0.0001, |
| "loss": 836923.3125, |
| "loss/crossentropy": 2.793195962905884, |
| "loss/hidden": 0.337890625, |
| "loss/logits": 0.10910402238368988, |
| "loss/reg": 836920.0625, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.02115, |
| "grad_norm": 1.9411284923553467, |
| "grad_norm_var": 0.03730904327888022, |
| "learning_rate": 0.0001, |
| "loss": 832203.125, |
| "loss/crossentropy": 3.039428472518921, |
| "loss/hidden": 0.349609375, |
| "loss/logits": 0.11309809982776642, |
| "loss/reg": 832199.625, |
| "step": 846 |
| }, |
| { |
| "epoch": 0.021175, |
| "grad_norm": 1.4955973625183105, |
| "grad_norm_var": 0.03739394643981934, |
| "learning_rate": 0.0001, |
| "loss": 827708.5, |
| "loss/crossentropy": 3.060068130493164, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.10148128867149353, |
| "loss/reg": 827705.0, |
| "step": 847 |
| }, |
| { |
| "epoch": 0.0212, |
| "grad_norm": 1.5868381261825562, |
| "grad_norm_var": 0.037501955384725665, |
| "learning_rate": 0.0001, |
| "loss": 823235.0625, |
| "loss/crossentropy": 2.7904274463653564, |
| "loss/hidden": 0.359375, |
| "loss/logits": 0.10797617584466934, |
| "loss/reg": 823231.8125, |
| "step": 848 |
| }, |
| { |
| "epoch": 0.021225, |
| "grad_norm": 1.4774341583251953, |
| "grad_norm_var": 0.03767259485176832, |
| "learning_rate": 0.0001, |
| "loss": 818779.4375, |
| "loss/crossentropy": 3.0046181678771973, |
| "loss/hidden": 0.3515625, |
| "loss/logits": 0.10740286856889725, |
| "loss/reg": 818776.0, |
| "step": 849 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 1.424593210220337, |
| "grad_norm_var": 0.018199063781362903, |
| "learning_rate": 0.0001, |
| "loss": 814370.1875, |
| "loss/crossentropy": 2.6635451316833496, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.10733939707279205, |
| "loss/reg": 814367.0625, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.021275, |
| "grad_norm": 1.439581274986267, |
| "grad_norm_var": 0.018332290200674825, |
| "learning_rate": 0.0001, |
| "loss": 809995.875, |
| "loss/crossentropy": 2.7378575801849365, |
| "loss/hidden": 0.328125, |
| "loss/logits": 0.0934763103723526, |
| "loss/reg": 809992.6875, |
| "step": 851 |
| }, |
| { |
| "epoch": 0.0213, |
| "grad_norm": 1.8364348411560059, |
| "grad_norm_var": 0.02557714189718367, |
| "learning_rate": 0.0001, |
| "loss": 805656.875, |
| "loss/crossentropy": 2.7950599193573, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.11335933208465576, |
| "loss/reg": 805653.5625, |
| "step": 852 |
| }, |
| { |
| "epoch": 0.021325, |
| "grad_norm": 1.3157246112823486, |
| "grad_norm_var": 0.025856999395070326, |
| "learning_rate": 0.0001, |
| "loss": 801322.875, |
| "loss/crossentropy": 2.8259010314941406, |
| "loss/hidden": 0.328125, |
| "loss/logits": 0.10826624929904938, |
| "loss/reg": 801319.625, |
| "step": 853 |
| }, |
| { |
| "epoch": 0.02135, |
| "grad_norm": 1.3876771926879883, |
| "grad_norm_var": 0.026692402433927023, |
| "learning_rate": 0.0001, |
| "loss": 796998.0, |
| "loss/crossentropy": 2.713148593902588, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.10216069221496582, |
| "loss/reg": 796994.875, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.021375, |
| "grad_norm": 1.4406601190567017, |
| "grad_norm_var": 0.025574706135574606, |
| "learning_rate": 0.0001, |
| "loss": 792692.0, |
| "loss/crossentropy": 2.6140224933624268, |
| "loss/hidden": 0.32421875, |
| "loss/logits": 0.09719926118850708, |
| "loss/reg": 792688.9375, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.0214, |
| "grad_norm": 1.3787347078323364, |
| "grad_norm_var": 0.026533383109426486, |
| "learning_rate": 0.0001, |
| "loss": 788443.4375, |
| "loss/crossentropy": 2.7975778579711914, |
| "loss/hidden": 0.33203125, |
| "loss/logits": 0.10502509027719498, |
| "loss/reg": 788440.1875, |
| "step": 856 |
| }, |
| { |
| "epoch": 0.021425, |
| "grad_norm": 1.3740670680999756, |
| "grad_norm_var": 0.027424254981732035, |
| "learning_rate": 0.0001, |
| "loss": 784228.125, |
| "loss/crossentropy": 2.7014317512512207, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.10709477961063385, |
| "loss/reg": 784225.0, |
| "step": 857 |
| }, |
| { |
| "epoch": 0.02145, |
| "grad_norm": 1.4766355752944946, |
| "grad_norm_var": 0.027128383640571425, |
| "learning_rate": 0.0001, |
| "loss": 780042.8125, |
| "loss/crossentropy": 2.9924182891845703, |
| "loss/hidden": 0.35546875, |
| "loss/logits": 0.11039350926876068, |
| "loss/reg": 780039.375, |
| "step": 858 |
| }, |
| { |
| "epoch": 0.021475, |
| "grad_norm": 1.3134477138519287, |
| "grad_norm_var": 0.029380874846922703, |
| "learning_rate": 0.0001, |
| "loss": 775767.8125, |
| "loss/crossentropy": 2.684462070465088, |
| "loss/hidden": 0.32421875, |
| "loss/logits": 0.0945153534412384, |
| "loss/reg": 775764.6875, |
| "step": 859 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 1.3462331295013428, |
| "grad_norm_var": 0.030680728492464896, |
| "learning_rate": 0.0001, |
| "loss": 771587.0, |
| "loss/crossentropy": 2.76127552986145, |
| "loss/hidden": 0.333984375, |
| "loss/logits": 0.09334437549114227, |
| "loss/reg": 771583.8125, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.021525, |
| "grad_norm": 1.4259929656982422, |
| "grad_norm_var": 0.030875021496669732, |
| "learning_rate": 0.0001, |
| "loss": 767432.3125, |
| "loss/crossentropy": 2.8973875045776367, |
| "loss/hidden": 0.345703125, |
| "loss/logits": 0.1067136824131012, |
| "loss/reg": 767429.0, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.02155, |
| "grad_norm": 1.438728928565979, |
| "grad_norm_var": 0.01568045494984839, |
| "learning_rate": 0.0001, |
| "loss": 763304.5, |
| "loss/crossentropy": 2.5314440727233887, |
| "loss/hidden": 0.33984375, |
| "loss/logits": 0.10614284873008728, |
| "loss/reg": 763301.5, |
| "step": 862 |
| }, |
| { |
| "epoch": 0.021575, |
| "grad_norm": 1.539600133895874, |
| "grad_norm_var": 0.016084252835268133, |
| "learning_rate": 0.0001, |
| "loss": 759203.125, |
| "loss/crossentropy": 2.924391508102417, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.1083289384841919, |
| "loss/reg": 759199.6875, |
| "step": 863 |
| }, |
| { |
| "epoch": 0.0216, |
| "grad_norm": 1.5744701623916626, |
| "grad_norm_var": 0.015868404396441823, |
| "learning_rate": 0.0001, |
| "loss": 755104.8125, |
| "loss/crossentropy": 2.7282729148864746, |
| "loss/hidden": 0.33203125, |
| "loss/logits": 0.10119913518428802, |
| "loss/reg": 755101.625, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.021625, |
| "grad_norm": 1.635954737663269, |
| "grad_norm_var": 0.01803199245141318, |
| "learning_rate": 0.0001, |
| "loss": 751002.5625, |
| "loss/crossentropy": 2.7243566513061523, |
| "loss/hidden": 0.3671875, |
| "loss/logits": 0.1109534353017807, |
| "loss/reg": 750999.3125, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.02165, |
| "grad_norm": 1.4673669338226318, |
| "grad_norm_var": 0.01794849740094495, |
| "learning_rate": 0.0001, |
| "loss": 746943.625, |
| "loss/crossentropy": 2.986279010772705, |
| "loss/hidden": 0.357421875, |
| "loss/logits": 0.10028649866580963, |
| "loss/reg": 746940.1875, |
| "step": 866 |
| }, |
| { |
| "epoch": 0.021675, |
| "grad_norm": 1.4833115339279175, |
| "grad_norm_var": 0.017937552900055623, |
| "learning_rate": 0.0001, |
| "loss": 742912.8125, |
| "loss/crossentropy": 2.8367719650268555, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.10635752975940704, |
| "loss/reg": 742909.5, |
| "step": 867 |
| }, |
| { |
| "epoch": 0.0217, |
| "grad_norm": 1.3037302494049072, |
| "grad_norm_var": 0.009269416341834773, |
| "learning_rate": 0.0001, |
| "loss": 738908.25, |
| "loss/crossentropy": 2.768047332763672, |
| "loss/hidden": 0.333984375, |
| "loss/logits": 0.10067752748727798, |
| "loss/reg": 738905.0625, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.021725, |
| "grad_norm": 1.4508569240570068, |
| "grad_norm_var": 0.008326587343749203, |
| "learning_rate": 0.0001, |
| "loss": 734910.0, |
| "loss/crossentropy": 2.659773111343384, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.10860970616340637, |
| "loss/reg": 734906.875, |
| "step": 869 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 1.360392689704895, |
| "grad_norm_var": 0.008562886320024887, |
| "learning_rate": 0.0001, |
| "loss": 730938.9375, |
| "loss/crossentropy": 2.6630847454071045, |
| "loss/hidden": 0.32421875, |
| "loss/logits": 0.09844067692756653, |
| "loss/reg": 730935.8125, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.021775, |
| "grad_norm": 1.4256236553192139, |
| "grad_norm_var": 0.008571957711574818, |
| "learning_rate": 0.0001, |
| "loss": 726759.625, |
| "loss/crossentropy": 2.699617862701416, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.1213647648692131, |
| "loss/reg": 726756.4375, |
| "step": 871 |
| }, |
| { |
| "epoch": 0.0218, |
| "grad_norm": 1.3895364999771118, |
| "grad_norm_var": 0.008495050900990897, |
| "learning_rate": 0.0001, |
| "loss": 722792.625, |
| "loss/crossentropy": 2.631438970565796, |
| "loss/hidden": 0.34375, |
| "loss/logits": 0.1037633940577507, |
| "loss/reg": 722789.5625, |
| "step": 872 |
| }, |
| { |
| "epoch": 0.021825, |
| "grad_norm": 1.4019267559051514, |
| "grad_norm_var": 0.008306550360458628, |
| "learning_rate": 0.0001, |
| "loss": 718848.375, |
| "loss/crossentropy": 2.7362027168273926, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.09886644780635834, |
| "loss/reg": 718845.1875, |
| "step": 873 |
| }, |
| { |
| "epoch": 0.02185, |
| "grad_norm": 2.7452619075775146, |
| "grad_norm_var": 0.11515721750659888, |
| "learning_rate": 0.0001, |
| "loss": 714946.375, |
| "loss/crossentropy": 2.532700300216675, |
| "loss/hidden": 0.3671875, |
| "loss/logits": 0.10763721913099289, |
| "loss/reg": 714943.3125, |
| "step": 874 |
| }, |
| { |
| "epoch": 0.021875, |
| "grad_norm": 1.5478562116622925, |
| "grad_norm_var": 0.11217005671258935, |
| "learning_rate": 0.0001, |
| "loss": 711063.6875, |
| "loss/crossentropy": 2.844818115234375, |
| "loss/hidden": 0.357421875, |
| "loss/logits": 0.10760362446308136, |
| "loss/reg": 711060.375, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.0219, |
| "grad_norm": 1.5059171915054321, |
| "grad_norm_var": 0.10977548391168739, |
| "learning_rate": 0.0001, |
| "loss": 707126.75, |
| "loss/crossentropy": 2.70940899848938, |
| "loss/hidden": 0.361328125, |
| "loss/logits": 0.10504569113254547, |
| "loss/reg": 707123.625, |
| "step": 876 |
| }, |
| { |
| "epoch": 0.021925, |
| "grad_norm": 1.3768664598464966, |
| "grad_norm_var": 0.1106962328722147, |
| "learning_rate": 0.0001, |
| "loss": 703275.875, |
| "loss/crossentropy": 2.906864881515503, |
| "loss/hidden": 0.33984375, |
| "loss/logits": 0.09524580091238022, |
| "loss/reg": 703272.5, |
| "step": 877 |
| }, |
| { |
| "epoch": 0.02195, |
| "grad_norm": 1.5900100469589233, |
| "grad_norm_var": 0.1100745560630381, |
| "learning_rate": 0.0001, |
| "loss": 699467.4375, |
| "loss/crossentropy": 2.8395020961761475, |
| "loss/hidden": 0.349609375, |
| "loss/logits": 0.10507619380950928, |
| "loss/reg": 699464.1875, |
| "step": 878 |
| }, |
| { |
| "epoch": 0.021975, |
| "grad_norm": 1.2434401512145996, |
| "grad_norm_var": 0.11596389431784875, |
| "learning_rate": 0.0001, |
| "loss": 695687.8125, |
| "loss/crossentropy": 2.7607498168945312, |
| "loss/hidden": 0.328125, |
| "loss/logits": 0.09485349804162979, |
| "loss/reg": 695684.625, |
| "step": 879 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 1.3332220315933228, |
| "grad_norm_var": 0.1182162682754802, |
| "learning_rate": 0.0001, |
| "loss": 691910.9375, |
| "loss/crossentropy": 2.6985464096069336, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.10794036090373993, |
| "loss/reg": 691907.8125, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.022025, |
| "grad_norm": 1.3564249277114868, |
| "grad_norm_var": 0.11864132072718636, |
| "learning_rate": 0.0001, |
| "loss": 688147.5, |
| "loss/crossentropy": 2.460688352584839, |
| "loss/hidden": 0.34375, |
| "loss/logits": 0.08934849500656128, |
| "loss/reg": 688144.625, |
| "step": 881 |
| }, |
| { |
| "epoch": 0.02205, |
| "grad_norm": 1.3585158586502075, |
| "grad_norm_var": 0.1198389150353248, |
| "learning_rate": 0.0001, |
| "loss": 684415.375, |
| "loss/crossentropy": 2.7754790782928467, |
| "loss/hidden": 0.33984375, |
| "loss/logits": 0.09920317679643631, |
| "loss/reg": 684412.1875, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.022075, |
| "grad_norm": 2.047518491744995, |
| "grad_norm_var": 0.13907669675013062, |
| "learning_rate": 0.0001, |
| "loss": 680714.0, |
| "loss/crossentropy": 2.8323476314544678, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.1093655601143837, |
| "loss/reg": 680710.6875, |
| "step": 883 |
| }, |
| { |
| "epoch": 0.0221, |
| "grad_norm": 1.4928585290908813, |
| "grad_norm_var": 0.13567403633957345, |
| "learning_rate": 0.0001, |
| "loss": 677029.75, |
| "loss/crossentropy": 2.560920238494873, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.10798318684101105, |
| "loss/reg": 677026.6875, |
| "step": 884 |
| }, |
| { |
| "epoch": 0.022125, |
| "grad_norm": 1.6336345672607422, |
| "grad_norm_var": 0.13561054361522124, |
| "learning_rate": 0.0001, |
| "loss": 673357.25, |
| "loss/crossentropy": 2.694791078567505, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.10669650137424469, |
| "loss/reg": 673354.0625, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.02215, |
| "grad_norm": 1.349848747253418, |
| "grad_norm_var": 0.13588484449853783, |
| "learning_rate": 0.0001, |
| "loss": 669694.6875, |
| "loss/crossentropy": 2.6191961765289307, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.09477333724498749, |
| "loss/reg": 669691.625, |
| "step": 886 |
| }, |
| { |
| "epoch": 0.022175, |
| "grad_norm": 1.8617138862609863, |
| "grad_norm_var": 0.14054444384585824, |
| "learning_rate": 0.0001, |
| "loss": 666086.875, |
| "loss/crossentropy": 3.0761778354644775, |
| "loss/hidden": 0.37890625, |
| "loss/logits": 0.11799299716949463, |
| "loss/reg": 666083.3125, |
| "step": 887 |
| }, |
| { |
| "epoch": 0.0222, |
| "grad_norm": 1.4446221590042114, |
| "grad_norm_var": 0.13935605070832124, |
| "learning_rate": 0.0001, |
| "loss": 662291.4375, |
| "loss/crossentropy": 2.6097607612609863, |
| "loss/hidden": 0.3515625, |
| "loss/logits": 0.09730654954910278, |
| "loss/reg": 662288.375, |
| "step": 888 |
| }, |
| { |
| "epoch": 0.022225, |
| "grad_norm": 1.4397261142730713, |
| "grad_norm_var": 0.13854484036644402, |
| "learning_rate": 0.0001, |
| "loss": 658715.5, |
| "loss/crossentropy": 2.758624792098999, |
| "loss/hidden": 0.349609375, |
| "loss/logits": 0.10278956592082977, |
| "loss/reg": 658712.3125, |
| "step": 889 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 1.6586854457855225, |
| "grad_norm_var": 0.04394541302448823, |
| "learning_rate": 0.0001, |
| "loss": 655153.75, |
| "loss/crossentropy": 3.131718397140503, |
| "loss/hidden": 0.37890625, |
| "loss/logits": 0.11117362976074219, |
| "loss/reg": 655150.125, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.022275, |
| "grad_norm": 1.414577841758728, |
| "grad_norm_var": 0.04447269469522291, |
| "learning_rate": 0.0001, |
| "loss": 651600.4375, |
| "loss/crossentropy": 2.610245704650879, |
| "loss/hidden": 0.37109375, |
| "loss/logits": 0.11778967827558517, |
| "loss/reg": 651597.3125, |
| "step": 891 |
| }, |
| { |
| "epoch": 0.0223, |
| "grad_norm": 1.2982698678970337, |
| "grad_norm_var": 0.047189867750510865, |
| "learning_rate": 0.0001, |
| "loss": 648062.9375, |
| "loss/crossentropy": 2.7059412002563477, |
| "loss/hidden": 0.35546875, |
| "loss/logits": 0.10352754592895508, |
| "loss/reg": 648059.8125, |
| "step": 892 |
| }, |
| { |
| "epoch": 0.022325, |
| "grad_norm": 1.4710452556610107, |
| "grad_norm_var": 0.04627654470542287, |
| "learning_rate": 0.0001, |
| "loss": 644464.125, |
| "loss/crossentropy": 2.7453603744506836, |
| "loss/hidden": 0.345703125, |
| "loss/logits": 0.1072053387761116, |
| "loss/reg": 644460.9375, |
| "step": 893 |
| }, |
| { |
| "epoch": 0.02235, |
| "grad_norm": 1.3338205814361572, |
| "grad_norm_var": 0.04729142680989463, |
| "learning_rate": 0.0001, |
| "loss": 640950.1875, |
| "loss/crossentropy": 2.6042568683624268, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.09676901996135712, |
| "loss/reg": 640947.125, |
| "step": 894 |
| }, |
| { |
| "epoch": 0.022375, |
| "grad_norm": 1.467703938484192, |
| "grad_norm_var": 0.04325298987172251, |
| "learning_rate": 0.0001, |
| "loss": 637457.75, |
| "loss/crossentropy": 2.952686309814453, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.12202930450439453, |
| "loss/reg": 637454.3125, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.0224, |
| "grad_norm": 1.3985360860824585, |
| "grad_norm_var": 0.04208779784585938, |
| "learning_rate": 0.0001, |
| "loss": 633993.375, |
| "loss/crossentropy": 2.5179498195648193, |
| "loss/hidden": 0.345703125, |
| "loss/logits": 0.10835272818803787, |
| "loss/reg": 633990.4375, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.022425, |
| "grad_norm": 1.5998706817626953, |
| "grad_norm_var": 0.04107575266419913, |
| "learning_rate": 0.0001, |
| "loss": 630560.125, |
| "loss/crossentropy": 2.9355459213256836, |
| "loss/hidden": 0.359375, |
| "loss/logits": 0.10219253599643707, |
| "loss/reg": 630556.75, |
| "step": 897 |
| }, |
| { |
| "epoch": 0.02245, |
| "grad_norm": 1.2526609897613525, |
| "grad_norm_var": 0.044011995445767366, |
| "learning_rate": 0.0001, |
| "loss": 627130.1875, |
| "loss/crossentropy": 2.4691076278686523, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.0991387739777565, |
| "loss/reg": 627127.25, |
| "step": 898 |
| }, |
| { |
| "epoch": 0.022475, |
| "grad_norm": 1.3573890924453735, |
| "grad_norm_var": 0.024347723303161158, |
| "learning_rate": 0.0001, |
| "loss": 623701.1875, |
| "loss/crossentropy": 2.637401819229126, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.10063473880290985, |
| "loss/reg": 623698.125, |
| "step": 899 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 1.4095149040222168, |
| "grad_norm_var": 0.024496564257023578, |
| "learning_rate": 0.0001, |
| "loss": 620300.0625, |
| "loss/crossentropy": 2.8798859119415283, |
| "loss/hidden": 0.3515625, |
| "loss/logits": 0.09168756008148193, |
| "loss/reg": 620296.75, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.022525, |
| "grad_norm": 1.3833578824996948, |
| "grad_norm_var": 0.022683189164963775, |
| "learning_rate": 0.0001, |
| "loss": 616928.125, |
| "loss/crossentropy": 2.845262289047241, |
| "loss/hidden": 0.35546875, |
| "loss/logits": 0.11270107328891754, |
| "loss/reg": 616924.8125, |
| "step": 901 |
| }, |
| { |
| "epoch": 0.02255, |
| "grad_norm": 1.4046558141708374, |
| "grad_norm_var": 0.022165851322686336, |
| "learning_rate": 0.0001, |
| "loss": 613586.0625, |
| "loss/crossentropy": 2.707932710647583, |
| "loss/hidden": 0.373046875, |
| "loss/logits": 0.12656541168689728, |
| "loss/reg": 613582.875, |
| "step": 902 |
| }, |
| { |
| "epoch": 0.022575, |
| "grad_norm": 1.4385324716567993, |
| "grad_norm_var": 0.010114311042023339, |
| "learning_rate": 0.0001, |
| "loss": 610203.9375, |
| "loss/crossentropy": 2.6956024169921875, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.09179414808750153, |
| "loss/reg": 610200.8125, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.0226, |
| "grad_norm": 1.3584307432174683, |
| "grad_norm_var": 0.010333704252685946, |
| "learning_rate": 0.0001, |
| "loss": 606900.6875, |
| "loss/crossentropy": 2.6325361728668213, |
| "loss/hidden": 0.3515625, |
| "loss/logits": 0.11091384291648865, |
| "loss/reg": 606897.625, |
| "step": 904 |
| }, |
| { |
| "epoch": 0.022625, |
| "grad_norm": 1.3237383365631104, |
| "grad_norm_var": 0.010837350455486632, |
| "learning_rate": 0.0001, |
| "loss": 603612.875, |
| "loss/crossentropy": 2.5301826000213623, |
| "loss/hidden": 0.341796875, |
| "loss/logits": 0.1013622134923935, |
| "loss/reg": 603609.9375, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.02265, |
| "grad_norm": 1.519456148147583, |
| "grad_norm_var": 0.007444845918281295, |
| "learning_rate": 0.0001, |
| "loss": 600264.5625, |
| "loss/crossentropy": 2.815253973007202, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.1020905077457428, |
| "loss/reg": 600261.3125, |
| "step": 906 |
| }, |
| { |
| "epoch": 0.022675, |
| "grad_norm": 1.399096131324768, |
| "grad_norm_var": 0.007433805910013724, |
| "learning_rate": 0.0001, |
| "loss": 596994.4375, |
| "loss/crossentropy": 2.7425737380981445, |
| "loss/hidden": 0.34375, |
| "loss/logits": 0.10377170145511627, |
| "loss/reg": 596991.25, |
| "step": 907 |
| }, |
| { |
| "epoch": 0.0227, |
| "grad_norm": 1.3674331903457642, |
| "grad_norm_var": 0.006785378943657783, |
| "learning_rate": 0.0001, |
| "loss": 593736.9375, |
| "loss/crossentropy": 2.7962067127227783, |
| "loss/hidden": 0.33203125, |
| "loss/logits": 0.10247787088155746, |
| "loss/reg": 593733.6875, |
| "step": 908 |
| }, |
| { |
| "epoch": 0.022725, |
| "grad_norm": 1.4462602138519287, |
| "grad_norm_var": 0.006606597408582537, |
| "learning_rate": 0.0001, |
| "loss": 590496.0625, |
| "loss/crossentropy": 2.8872458934783936, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.1065157949924469, |
| "loss/reg": 590492.6875, |
| "step": 909 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 1.4053220748901367, |
| "grad_norm_var": 0.0062591795755129334, |
| "learning_rate": 0.0001, |
| "loss": 587279.8125, |
| "loss/crossentropy": 2.596808671951294, |
| "loss/hidden": 0.345703125, |
| "loss/logits": 0.10411226749420166, |
| "loss/reg": 587276.75, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.022775, |
| "grad_norm": 1.7457410097122192, |
| "grad_norm_var": 0.013294866770012407, |
| "learning_rate": 0.0001, |
| "loss": 584112.8125, |
| "loss/crossentropy": 3.089757204055786, |
| "loss/hidden": 0.37890625, |
| "loss/logits": 0.10723936557769775, |
| "loss/reg": 584109.25, |
| "step": 911 |
| }, |
| { |
| "epoch": 0.0228, |
| "grad_norm": 1.5150556564331055, |
| "grad_norm_var": 0.013722569704263065, |
| "learning_rate": 0.0001, |
| "loss": 580755.8125, |
| "loss/crossentropy": 2.924125909805298, |
| "loss/hidden": 0.361328125, |
| "loss/logits": 0.1008312851190567, |
| "loss/reg": 580752.4375, |
| "step": 912 |
| }, |
| { |
| "epoch": 0.022825, |
| "grad_norm": 2.0299394130706787, |
| "grad_norm_var": 0.034856616839544094, |
| "learning_rate": 0.0001, |
| "loss": 577586.0625, |
| "loss/crossentropy": 2.8558831214904785, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.1177232414484024, |
| "loss/reg": 577582.6875, |
| "step": 913 |
| }, |
| { |
| "epoch": 0.02285, |
| "grad_norm": 1.4693039655685425, |
| "grad_norm_var": 0.031807031307608646, |
| "learning_rate": 0.0001, |
| "loss": 574422.9375, |
| "loss/crossentropy": 2.670675277709961, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.0941586121916771, |
| "loss/reg": 574419.8125, |
| "step": 914 |
| }, |
| { |
| "epoch": 0.022875, |
| "grad_norm": 1.4337130784988403, |
| "grad_norm_var": 0.03099127312000333, |
| "learning_rate": 0.0001, |
| "loss": 571278.5, |
| "loss/crossentropy": 2.866081714630127, |
| "loss/hidden": 0.37109375, |
| "loss/logits": 0.10846216231584549, |
| "loss/reg": 571275.125, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.0229, |
| "grad_norm": 1.506596565246582, |
| "grad_norm_var": 0.030692585119625863, |
| "learning_rate": 0.0001, |
| "loss": 568169.375, |
| "loss/crossentropy": 2.7422282695770264, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.10508774220943451, |
| "loss/reg": 568166.1875, |
| "step": 916 |
| }, |
| { |
| "epoch": 0.022925, |
| "grad_norm": 1.7639127969741821, |
| "grad_norm_var": 0.03462896677127952, |
| "learning_rate": 0.0001, |
| "loss": 565059.375, |
| "loss/crossentropy": 2.5580496788024902, |
| "loss/hidden": 0.39453125, |
| "loss/logits": 0.1190136969089508, |
| "loss/reg": 565056.3125, |
| "step": 917 |
| }, |
| { |
| "epoch": 0.02295, |
| "grad_norm": 1.4424704313278198, |
| "grad_norm_var": 0.03419753824293957, |
| "learning_rate": 0.0001, |
| "loss": 561960.5625, |
| "loss/crossentropy": 2.867959499359131, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.10807552933692932, |
| "loss/reg": 561957.1875, |
| "step": 918 |
| }, |
| { |
| "epoch": 0.022975, |
| "grad_norm": 1.4522556066513062, |
| "grad_norm_var": 0.03407796866289529, |
| "learning_rate": 0.0001, |
| "loss": 558795.25, |
| "loss/crossentropy": 2.522718906402588, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.08970680087804794, |
| "loss/reg": 558792.3125, |
| "step": 919 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 1.633276104927063, |
| "grad_norm_var": 0.033201914515267994, |
| "learning_rate": 0.0001, |
| "loss": 555707.4375, |
| "loss/crossentropy": 2.574209451675415, |
| "loss/hidden": 0.3671875, |
| "loss/logits": 0.10657516866922379, |
| "loss/reg": 555704.375, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.023025, |
| "grad_norm": 1.5812586545944214, |
| "grad_norm_var": 0.03032121722791885, |
| "learning_rate": 0.0001, |
| "loss": 552654.25, |
| "loss/crossentropy": 2.8102385997772217, |
| "loss/hidden": 0.38671875, |
| "loss/logits": 0.11414695531129837, |
| "loss/reg": 552650.9375, |
| "step": 921 |
| }, |
| { |
| "epoch": 0.02305, |
| "grad_norm": 1.468946933746338, |
| "grad_norm_var": 0.030648942805348866, |
| "learning_rate": 0.0001, |
| "loss": 549605.1875, |
| "loss/crossentropy": 2.8682405948638916, |
| "loss/hidden": 0.373046875, |
| "loss/logits": 0.12543684244155884, |
| "loss/reg": 549601.8125, |
| "step": 922 |
| }, |
| { |
| "epoch": 0.023075, |
| "grad_norm": 1.4886040687561035, |
| "grad_norm_var": 0.029452718305320917, |
| "learning_rate": 0.0001, |
| "loss": 546499.8125, |
| "loss/crossentropy": 2.839935302734375, |
| "loss/hidden": 0.375, |
| "loss/logits": 0.11211074888706207, |
| "loss/reg": 546496.5, |
| "step": 923 |
| }, |
| { |
| "epoch": 0.0231, |
| "grad_norm": 1.3664133548736572, |
| "grad_norm_var": 0.02947718422190988, |
| "learning_rate": 0.0001, |
| "loss": 543490.8125, |
| "loss/crossentropy": 2.6688411235809326, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.10546208918094635, |
| "loss/reg": 543487.625, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.023125, |
| "grad_norm": 1.5685945749282837, |
| "grad_norm_var": 0.028772335635443784, |
| "learning_rate": 0.0001, |
| "loss": 540498.8125, |
| "loss/crossentropy": 3.0085909366607666, |
| "loss/hidden": 0.38671875, |
| "loss/logits": 0.11488550901412964, |
| "loss/reg": 540495.3125, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.02315, |
| "grad_norm": 1.4992501735687256, |
| "grad_norm_var": 0.02745594088438607, |
| "learning_rate": 0.0001, |
| "loss": 537511.0, |
| "loss/crossentropy": 2.724285364151001, |
| "loss/hidden": 0.384765625, |
| "loss/logits": 0.11174479126930237, |
| "loss/reg": 537507.75, |
| "step": 926 |
| }, |
| { |
| "epoch": 0.023175, |
| "grad_norm": 1.4157204627990723, |
| "grad_norm_var": 0.026104590172131025, |
| "learning_rate": 0.0001, |
| "loss": 534549.3125, |
| "loss/crossentropy": 2.7532098293304443, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.098666250705719, |
| "loss/reg": 534546.125, |
| "step": 927 |
| }, |
| { |
| "epoch": 0.0232, |
| "grad_norm": 1.6552956104278564, |
| "grad_norm_var": 0.026872846045318966, |
| "learning_rate": 0.0001, |
| "loss": 531614.6875, |
| "loss/crossentropy": 3.173334836959839, |
| "loss/hidden": 0.392578125, |
| "loss/logits": 0.09997949749231339, |
| "loss/reg": 531611.0, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.023225, |
| "grad_norm": 1.5316351652145386, |
| "grad_norm_var": 0.01040307279417707, |
| "learning_rate": 0.0001, |
| "loss": 528700.75, |
| "loss/crossentropy": 2.901214838027954, |
| "loss/hidden": 0.384765625, |
| "loss/logits": 0.10912090539932251, |
| "loss/reg": 528697.375, |
| "step": 929 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 1.4846031665802002, |
| "grad_norm_var": 0.010319738034611229, |
| "learning_rate": 0.0001, |
| "loss": 525806.375, |
| "loss/crossentropy": 2.8857808113098145, |
| "loss/hidden": 0.373046875, |
| "loss/logits": 0.10793644189834595, |
| "loss/reg": 525803.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.023275, |
| "grad_norm": 1.6378053426742554, |
| "grad_norm_var": 0.010621717180507773, |
| "learning_rate": 0.0001, |
| "loss": 522942.1562, |
| "loss/crossentropy": 2.7658679485321045, |
| "loss/hidden": 0.3671875, |
| "loss/logits": 0.0958789810538292, |
| "loss/reg": 522938.90625, |
| "step": 931 |
| }, |
| { |
| "epoch": 0.0233, |
| "grad_norm": 1.2867034673690796, |
| "grad_norm_var": 0.014360435290397256, |
| "learning_rate": 0.0001, |
| "loss": 520073.375, |
| "loss/crossentropy": 2.6105663776397705, |
| "loss/hidden": 0.353515625, |
| "loss/logits": 0.10034145414829254, |
| "loss/reg": 520070.28125, |
| "step": 932 |
| }, |
| { |
| "epoch": 0.023325, |
| "grad_norm": 1.6469675302505493, |
| "grad_norm_var": 0.01136978488637883, |
| "learning_rate": 0.0001, |
| "loss": 517234.75, |
| "loss/crossentropy": 2.589902400970459, |
| "loss/hidden": 0.38671875, |
| "loss/logits": 0.10840745270252228, |
| "loss/reg": 517231.65625, |
| "step": 933 |
| }, |
| { |
| "epoch": 0.02335, |
| "grad_norm": 1.5106781721115112, |
| "grad_norm_var": 0.011046528115795635, |
| "learning_rate": 0.0001, |
| "loss": 514345.3125, |
| "loss/crossentropy": 2.9396235942840576, |
| "loss/hidden": 0.373046875, |
| "loss/logits": 0.09719346463680267, |
| "loss/reg": 514341.90625, |
| "step": 934 |
| }, |
| { |
| "epoch": 0.023375, |
| "grad_norm": 1.486398696899414, |
| "grad_norm_var": 0.010837161119741031, |
| "learning_rate": 0.0001, |
| "loss": 511510.4688, |
| "loss/crossentropy": 2.666771173477173, |
| "loss/hidden": 0.388671875, |
| "loss/logits": 0.103165403008461, |
| "loss/reg": 511507.3125, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.0234, |
| "grad_norm": 1.5164668560028076, |
| "grad_norm_var": 0.00986939957599251, |
| "learning_rate": 0.0001, |
| "loss": 508676.8125, |
| "loss/crossentropy": 2.5369515419006348, |
| "loss/hidden": 0.380859375, |
| "loss/logits": 0.10893706232309341, |
| "loss/reg": 508673.78125, |
| "step": 936 |
| }, |
| { |
| "epoch": 0.023425, |
| "grad_norm": 1.5231095552444458, |
| "grad_norm_var": 0.00952114559030542, |
| "learning_rate": 0.0001, |
| "loss": 505868.9375, |
| "loss/crossentropy": 3.0632290840148926, |
| "loss/hidden": 0.392578125, |
| "loss/logits": 0.11770186573266983, |
| "loss/reg": 505865.375, |
| "step": 937 |
| }, |
| { |
| "epoch": 0.02345, |
| "grad_norm": 1.433435082435608, |
| "grad_norm_var": 0.009772800643141248, |
| "learning_rate": 0.0001, |
| "loss": 503076.9062, |
| "loss/crossentropy": 2.77193284034729, |
| "loss/hidden": 0.37109375, |
| "loss/logits": 0.11221066862344742, |
| "loss/reg": 503073.65625, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.023475, |
| "grad_norm": 1.6270184516906738, |
| "grad_norm_var": 0.010700282771199218, |
| "learning_rate": 0.0001, |
| "loss": 500303.1875, |
| "loss/crossentropy": 2.7399566173553467, |
| "loss/hidden": 0.39453125, |
| "loss/logits": 0.12337076663970947, |
| "loss/reg": 500299.90625, |
| "step": 939 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 1.475893497467041, |
| "grad_norm_var": 0.009325959459162429, |
| "learning_rate": 0.0001, |
| "loss": 497552.5938, |
| "loss/crossentropy": 2.9398655891418457, |
| "loss/hidden": 0.388671875, |
| "loss/logits": 0.11368358880281448, |
| "loss/reg": 497549.15625, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.023525, |
| "grad_norm": 1.5985270738601685, |
| "grad_norm_var": 0.009580992116342675, |
| "learning_rate": 0.0001, |
| "loss": 494813.6875, |
| "loss/crossentropy": 2.8192827701568604, |
| "loss/hidden": 0.3671875, |
| "loss/logits": 0.1037113294005394, |
| "loss/reg": 494810.40625, |
| "step": 941 |
| }, |
| { |
| "epoch": 0.02355, |
| "grad_norm": 1.464031457901001, |
| "grad_norm_var": 0.009758742704438344, |
| "learning_rate": 0.0001, |
| "loss": 492100.3438, |
| "loss/crossentropy": 2.826369524002075, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.10558685660362244, |
| "loss/reg": 492097.0625, |
| "step": 942 |
| }, |
| { |
| "epoch": 0.023575, |
| "grad_norm": 1.6829922199249268, |
| "grad_norm_var": 0.010564513585127836, |
| "learning_rate": 0.0001, |
| "loss": 489396.5, |
| "loss/crossentropy": 2.851315975189209, |
| "loss/hidden": 0.37890625, |
| "loss/logits": 0.11695922911167145, |
| "loss/reg": 489393.15625, |
| "step": 943 |
| }, |
| { |
| "epoch": 0.0236, |
| "grad_norm": 1.6212563514709473, |
| "grad_norm_var": 0.010091403632914837, |
| "learning_rate": 0.0001, |
| "loss": 486712.6562, |
| "loss/crossentropy": 2.6733405590057373, |
| "loss/hidden": 0.390625, |
| "loss/logits": 0.11890114843845367, |
| "loss/reg": 486709.46875, |
| "step": 944 |
| }, |
| { |
| "epoch": 0.023625, |
| "grad_norm": 1.4979190826416016, |
| "grad_norm_var": 0.01016845332830479, |
| "learning_rate": 0.0001, |
| "loss": 484035.4062, |
| "loss/crossentropy": 2.674908399581909, |
| "loss/hidden": 0.376953125, |
| "loss/logits": 0.11532513797283173, |
| "loss/reg": 484032.21875, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.02365, |
| "grad_norm": 1.4632714986801147, |
| "grad_norm_var": 0.010328466230860301, |
| "learning_rate": 0.0001, |
| "loss": 481392.9688, |
| "loss/crossentropy": 2.614588499069214, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.10456407815217972, |
| "loss/reg": 481389.875, |
| "step": 946 |
| }, |
| { |
| "epoch": 0.023675, |
| "grad_norm": 1.44475519657135, |
| "grad_norm_var": 0.009870720122113616, |
| "learning_rate": 0.0001, |
| "loss": 478764.2188, |
| "loss/crossentropy": 2.7293450832366943, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.10890185832977295, |
| "loss/reg": 478761.03125, |
| "step": 947 |
| }, |
| { |
| "epoch": 0.0237, |
| "grad_norm": 1.5215405225753784, |
| "grad_norm_var": 0.006092014213273463, |
| "learning_rate": 0.0001, |
| "loss": 476156.1875, |
| "loss/crossentropy": 2.704072952270508, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.11023689061403275, |
| "loss/reg": 476152.96875, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.023725, |
| "grad_norm": 1.602721095085144, |
| "grad_norm_var": 0.005536953386399309, |
| "learning_rate": 0.0001, |
| "loss": 473429.375, |
| "loss/crossentropy": 2.949936866760254, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.10948498547077179, |
| "loss/reg": 473425.9375, |
| "step": 949 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 1.6481072902679443, |
| "grad_norm_var": 0.006374760606168609, |
| "learning_rate": 0.0001, |
| "loss": 470841.9062, |
| "loss/crossentropy": 2.6340723037719727, |
| "loss/hidden": 0.380859375, |
| "loss/logits": 0.10163669288158417, |
| "loss/reg": 470838.8125, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.023775, |
| "grad_norm": 1.8611422777175903, |
| "grad_norm_var": 0.012575243154465692, |
| "learning_rate": 0.0001, |
| "loss": 468260.4062, |
| "loss/crossentropy": 3.1545968055725098, |
| "loss/hidden": 0.40625, |
| "loss/logits": 0.1330508589744568, |
| "loss/reg": 468256.71875, |
| "step": 951 |
| }, |
| { |
| "epoch": 0.0238, |
| "grad_norm": 1.4752041101455688, |
| "grad_norm_var": 0.012928792129038828, |
| "learning_rate": 0.0001, |
| "loss": 465685.6875, |
| "loss/crossentropy": 2.6174726486206055, |
| "loss/hidden": 0.38671875, |
| "loss/logits": 0.11934126168489456, |
| "loss/reg": 465682.5625, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.023825, |
| "grad_norm": 1.7078447341918945, |
| "grad_norm_var": 0.01418243886351244, |
| "learning_rate": 0.0001, |
| "loss": 463130.4062, |
| "loss/crossentropy": 2.8755438327789307, |
| "loss/hidden": 0.400390625, |
| "loss/logits": 0.11498165130615234, |
| "loss/reg": 463127.03125, |
| "step": 953 |
| }, |
| { |
| "epoch": 0.02385, |
| "grad_norm": 1.4131869077682495, |
| "grad_norm_var": 0.014577710219392042, |
| "learning_rate": 0.0001, |
| "loss": 460584.25, |
| "loss/crossentropy": 2.654987096786499, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.11689084768295288, |
| "loss/reg": 460581.09375, |
| "step": 954 |
| }, |
| { |
| "epoch": 0.023875, |
| "grad_norm": 1.3881771564483643, |
| "grad_norm_var": 0.01629821593972262, |
| "learning_rate": 0.0001, |
| "loss": 458048.0, |
| "loss/crossentropy": 2.862352132797241, |
| "loss/hidden": 0.3984375, |
| "loss/logits": 0.12421125173568726, |
| "loss/reg": 458044.59375, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.0239, |
| "grad_norm": 1.464780330657959, |
| "grad_norm_var": 0.01642190764080927, |
| "learning_rate": 0.0001, |
| "loss": 455517.4062, |
| "loss/crossentropy": 2.757857084274292, |
| "loss/hidden": 0.376953125, |
| "loss/logits": 0.10530483722686768, |
| "loss/reg": 455514.1875, |
| "step": 956 |
| }, |
| { |
| "epoch": 0.023925, |
| "grad_norm": 1.5110803842544556, |
| "grad_norm_var": 0.016374449080754517, |
| "learning_rate": 0.0001, |
| "loss": 453010.5625, |
| "loss/crossentropy": 2.597191572189331, |
| "loss/hidden": 0.39453125, |
| "loss/logits": 0.11306219547986984, |
| "loss/reg": 453007.46875, |
| "step": 957 |
| }, |
| { |
| "epoch": 0.02395, |
| "grad_norm": 1.8246955871582031, |
| "grad_norm_var": 0.020466405073577932, |
| "learning_rate": 0.0001, |
| "loss": 450517.25, |
| "loss/crossentropy": 3.0261099338531494, |
| "loss/hidden": 0.412109375, |
| "loss/logits": 0.1447220742702484, |
| "loss/reg": 450513.65625, |
| "step": 958 |
| }, |
| { |
| "epoch": 0.023975, |
| "grad_norm": 1.3534810543060303, |
| "grad_norm_var": 0.02231203272386111, |
| "learning_rate": 0.0001, |
| "loss": 448027.75, |
| "loss/crossentropy": 2.37969970703125, |
| "loss/hidden": 0.37109375, |
| "loss/logits": 0.10020345449447632, |
| "loss/reg": 448024.90625, |
| "step": 959 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 1.389629602432251, |
| "grad_norm_var": 0.02346295240414804, |
| "learning_rate": 0.0001, |
| "loss": 445564.8125, |
| "loss/crossentropy": 2.667879581451416, |
| "loss/hidden": 0.376953125, |
| "loss/logits": 0.09934940934181213, |
| "loss/reg": 445561.6875, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.024025, |
| "grad_norm": 1.7857766151428223, |
| "grad_norm_var": 0.027200542546711403, |
| "learning_rate": 0.0001, |
| "loss": 443126.7188, |
| "loss/crossentropy": 2.803347587585449, |
| "loss/hidden": 0.41015625, |
| "loss/logits": 0.1108841598033905, |
| "loss/reg": 443123.375, |
| "step": 961 |
| }, |
| { |
| "epoch": 0.02405, |
| "grad_norm": 1.6332484483718872, |
| "grad_norm_var": 0.02696225857243135, |
| "learning_rate": 0.0001, |
| "loss": 440698.6562, |
| "loss/crossentropy": 2.7807252407073975, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.10776016116142273, |
| "loss/reg": 440695.375, |
| "step": 962 |
| }, |
| { |
| "epoch": 0.024075, |
| "grad_norm": 1.5761369466781616, |
| "grad_norm_var": 0.025950701164052247, |
| "learning_rate": 0.0001, |
| "loss": 438264.6562, |
| "loss/crossentropy": 2.6589338779449463, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.09876887500286102, |
| "loss/reg": 438261.53125, |
| "step": 963 |
| }, |
| { |
| "epoch": 0.0241, |
| "grad_norm": 1.6731374263763428, |
| "grad_norm_var": 0.02636111450513988, |
| "learning_rate": 0.0001, |
| "loss": 435853.4062, |
| "loss/crossentropy": 2.9702107906341553, |
| "loss/hidden": 0.421875, |
| "loss/logits": 0.12137308716773987, |
| "loss/reg": 435849.90625, |
| "step": 964 |
| }, |
| { |
| "epoch": 0.024125, |
| "grad_norm": 1.6375683546066284, |
| "grad_norm_var": 0.026534346621109192, |
| "learning_rate": 0.0001, |
| "loss": 433449.5625, |
| "loss/crossentropy": 3.0992355346679688, |
| "loss/hidden": 0.396484375, |
| "loss/logits": 0.10674059391021729, |
| "loss/reg": 433445.96875, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.02415, |
| "grad_norm": 1.5692658424377441, |
| "grad_norm_var": 0.02624840934692158, |
| "learning_rate": 0.0001, |
| "loss": 431072.1562, |
| "loss/crossentropy": 2.6341285705566406, |
| "loss/hidden": 0.39453125, |
| "loss/logits": 0.11036588251590729, |
| "loss/reg": 431069.03125, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.024175, |
| "grad_norm": 1.80268132686615, |
| "grad_norm_var": 0.024262947069229726, |
| "learning_rate": 0.0001, |
| "loss": 428709.2812, |
| "loss/crossentropy": 3.0920138359069824, |
| "loss/hidden": 0.421875, |
| "loss/logits": 0.12939883768558502, |
| "loss/reg": 428705.625, |
| "step": 967 |
| }, |
| { |
| "epoch": 0.0242, |
| "grad_norm": 1.9607547521591187, |
| "grad_norm_var": 0.03251326476113447, |
| "learning_rate": 0.0001, |
| "loss": 426298.1875, |
| "loss/crossentropy": 2.9854230880737305, |
| "loss/hidden": 0.41796875, |
| "loss/logits": 0.1064004972577095, |
| "loss/reg": 426294.65625, |
| "step": 968 |
| }, |
| { |
| "epoch": 0.024225, |
| "grad_norm": 1.4067966938018799, |
| "grad_norm_var": 0.034078187801594065, |
| "learning_rate": 0.0001, |
| "loss": 423898.8438, |
| "loss/crossentropy": 2.7588040828704834, |
| "loss/hidden": 0.376953125, |
| "loss/logits": 0.11026656627655029, |
| "loss/reg": 423895.59375, |
| "step": 969 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 1.5999746322631836, |
| "grad_norm_var": 0.031932465417443726, |
| "learning_rate": 0.0001, |
| "loss": 421553.0, |
| "loss/crossentropy": 2.888660430908203, |
| "loss/hidden": 0.400390625, |
| "loss/logits": 0.11845803260803223, |
| "loss/reg": 421549.59375, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.024275, |
| "grad_norm": 2.8322978019714355, |
| "grad_norm_var": 0.12176343888679994, |
| "learning_rate": 0.0001, |
| "loss": 419213.875, |
| "loss/crossentropy": 2.8022220134735107, |
| "loss/hidden": 0.43359375, |
| "loss/logits": 0.15032915771007538, |
| "loss/reg": 419210.46875, |
| "step": 971 |
| }, |
| { |
| "epoch": 0.0243, |
| "grad_norm": 1.9177151918411255, |
| "grad_norm_var": 0.12105456166989696, |
| "learning_rate": 0.0001, |
| "loss": 416887.0, |
| "loss/crossentropy": 2.654383659362793, |
| "loss/hidden": 0.41015625, |
| "loss/logits": 0.11843335628509521, |
| "loss/reg": 416883.8125, |
| "step": 972 |
| }, |
| { |
| "epoch": 0.024325, |
| "grad_norm": 1.474488615989685, |
| "grad_norm_var": 0.1221435914764801, |
| "learning_rate": 0.0001, |
| "loss": 414587.0625, |
| "loss/crossentropy": 2.7339727878570557, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.09990103542804718, |
| "loss/reg": 414583.875, |
| "step": 973 |
| }, |
| { |
| "epoch": 0.02435, |
| "grad_norm": 1.6469473838806152, |
| "grad_norm_var": 0.12151500128243284, |
| "learning_rate": 0.0001, |
| "loss": 412298.2188, |
| "loss/crossentropy": 2.8057761192321777, |
| "loss/hidden": 0.404296875, |
| "loss/logits": 0.13082775473594666, |
| "loss/reg": 412294.875, |
| "step": 974 |
| }, |
| { |
| "epoch": 0.024375, |
| "grad_norm": 1.3397822380065918, |
| "grad_norm_var": 0.12216648786908255, |
| "learning_rate": 0.0001, |
| "loss": 410019.8438, |
| "loss/crossentropy": 2.489624500274658, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.10059110820293427, |
| "loss/reg": 410016.875, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.0244, |
| "grad_norm": 1.5557494163513184, |
| "grad_norm_var": 0.11695277649215109, |
| "learning_rate": 0.0001, |
| "loss": 407761.875, |
| "loss/crossentropy": 2.9346976280212402, |
| "loss/hidden": 0.388671875, |
| "loss/logits": 0.10529027879238129, |
| "loss/reg": 407758.4375, |
| "step": 976 |
| }, |
| { |
| "epoch": 0.024425, |
| "grad_norm": 1.6195032596588135, |
| "grad_norm_var": 0.1170732498665401, |
| "learning_rate": 0.0001, |
| "loss": 405513.0938, |
| "loss/crossentropy": 2.7136828899383545, |
| "loss/hidden": 0.384765625, |
| "loss/logits": 0.1184212863445282, |
| "loss/reg": 405509.875, |
| "step": 977 |
| }, |
| { |
| "epoch": 0.02445, |
| "grad_norm": 1.4346684217453003, |
| "grad_norm_var": 0.12138148219616761, |
| "learning_rate": 0.0001, |
| "loss": 403287.0, |
| "loss/crossentropy": 2.6480681896209717, |
| "loss/hidden": 0.39453125, |
| "loss/logits": 0.12125460803508759, |
| "loss/reg": 403283.8125, |
| "step": 978 |
| }, |
| { |
| "epoch": 0.024475, |
| "grad_norm": 1.4745851755142212, |
| "grad_norm_var": 0.12357408262968139, |
| "learning_rate": 0.0001, |
| "loss": 401075.8125, |
| "loss/crossentropy": 2.784494161605835, |
| "loss/hidden": 0.390625, |
| "loss/logits": 0.10559864342212677, |
| "loss/reg": 401072.53125, |
| "step": 979 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 2.014869213104248, |
| "grad_norm_var": 0.1303724682793377, |
| "learning_rate": 0.0001, |
| "loss": 398870.875, |
| "loss/crossentropy": 2.531445264816284, |
| "loss/hidden": 0.419921875, |
| "loss/logits": 0.11759153753519058, |
| "loss/reg": 398867.8125, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.024525, |
| "grad_norm": 1.4576054811477661, |
| "grad_norm_var": 0.13402612882125067, |
| "learning_rate": 0.0001, |
| "loss": 396677.5312, |
| "loss/crossentropy": 2.649505376815796, |
| "loss/hidden": 0.412109375, |
| "loss/logits": 0.1335502415895462, |
| "loss/reg": 396674.34375, |
| "step": 981 |
| }, |
| { |
| "epoch": 0.02455, |
| "grad_norm": 1.4466009140014648, |
| "grad_norm_var": 0.13701038143998018, |
| "learning_rate": 0.0001, |
| "loss": 394492.3438, |
| "loss/crossentropy": 2.7862565517425537, |
| "loss/hidden": 0.400390625, |
| "loss/logits": 0.10726554691791534, |
| "loss/reg": 394489.0625, |
| "step": 982 |
| }, |
| { |
| "epoch": 0.024575, |
| "grad_norm": 1.5884110927581787, |
| "grad_norm_var": 0.136562460308109, |
| "learning_rate": 0.0001, |
| "loss": 392272.125, |
| "loss/crossentropy": 2.718717575073242, |
| "loss/hidden": 0.40625, |
| "loss/logits": 0.12138716876506805, |
| "loss/reg": 392268.875, |
| "step": 983 |
| }, |
| { |
| "epoch": 0.0246, |
| "grad_norm": 1.4934638738632202, |
| "grad_norm_var": 0.1322920285233766, |
| "learning_rate": 0.0001, |
| "loss": 390101.1562, |
| "loss/crossentropy": 2.674203634262085, |
| "loss/hidden": 0.390625, |
| "loss/logits": 0.10795333981513977, |
| "loss/reg": 390097.96875, |
| "step": 984 |
| }, |
| { |
| "epoch": 0.024625, |
| "grad_norm": 1.5830122232437134, |
| "grad_norm_var": 0.12866037942509595, |
| "learning_rate": 0.0001, |
| "loss": 387958.25, |
| "loss/crossentropy": 2.8068060874938965, |
| "loss/hidden": 0.404296875, |
| "loss/logits": 0.11133777350187302, |
| "loss/reg": 387954.90625, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.02465, |
| "grad_norm": 1.4873050451278687, |
| "grad_norm_var": 0.13028010118781594, |
| "learning_rate": 0.0001, |
| "loss": 385766.5312, |
| "loss/crossentropy": 2.7559468746185303, |
| "loss/hidden": 0.39453125, |
| "loss/logits": 0.09770851582288742, |
| "loss/reg": 385763.28125, |
| "step": 986 |
| }, |
| { |
| "epoch": 0.024675, |
| "grad_norm": 1.4812231063842773, |
| "grad_norm_var": 0.03101327387547883, |
| "learning_rate": 0.0001, |
| "loss": 383635.375, |
| "loss/crossentropy": 2.8192882537841797, |
| "loss/hidden": 0.392578125, |
| "loss/logits": 0.12007346749305725, |
| "loss/reg": 383632.0625, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.0247, |
| "grad_norm": 1.591971755027771, |
| "grad_norm_var": 0.02226044826596739, |
| "learning_rate": 0.0001, |
| "loss": 381504.0, |
| "loss/crossentropy": 2.733438730239868, |
| "loss/hidden": 0.388671875, |
| "loss/logits": 0.0949164554476738, |
| "loss/reg": 381500.8125, |
| "step": 988 |
| }, |
| { |
| "epoch": 0.024725, |
| "grad_norm": 1.3618513345718384, |
| "grad_norm_var": 0.024084373411290015, |
| "learning_rate": 0.0001, |
| "loss": 379391.7812, |
| "loss/crossentropy": 2.6770806312561035, |
| "loss/hidden": 0.384765625, |
| "loss/logits": 0.0888867974281311, |
| "loss/reg": 379388.625, |
| "step": 989 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 1.5360465049743652, |
| "grad_norm_var": 0.02321393864459651, |
| "learning_rate": 0.0001, |
| "loss": 377291.3125, |
| "loss/crossentropy": 2.9608163833618164, |
| "loss/hidden": 0.396484375, |
| "loss/logits": 0.11456985771656036, |
| "loss/reg": 377287.84375, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.024775, |
| "grad_norm": 1.3709909915924072, |
| "grad_norm_var": 0.022486757106290868, |
| "learning_rate": 0.0001, |
| "loss": 375203.875, |
| "loss/crossentropy": 2.5483319759368896, |
| "loss/hidden": 0.40625, |
| "loss/logits": 0.1111944392323494, |
| "loss/reg": 375200.78125, |
| "step": 991 |
| }, |
| { |
| "epoch": 0.0248, |
| "grad_norm": 1.5652952194213867, |
| "grad_norm_var": 0.022523804875172714, |
| "learning_rate": 0.0001, |
| "loss": 373132.8125, |
| "loss/crossentropy": 2.7113425731658936, |
| "loss/hidden": 0.41015625, |
| "loss/logits": 0.0936322957277298, |
| "loss/reg": 373129.59375, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.024825, |
| "grad_norm": 1.519127607345581, |
| "grad_norm_var": 0.021978571734465647, |
| "learning_rate": 0.0001, |
| "loss": 371065.4062, |
| "loss/crossentropy": 3.0726349353790283, |
| "loss/hidden": 0.408203125, |
| "loss/logits": 0.1158657968044281, |
| "loss/reg": 371061.8125, |
| "step": 993 |
| }, |
| { |
| "epoch": 0.02485, |
| "grad_norm": 1.562768578529358, |
| "grad_norm_var": 0.02145380726874748, |
| "learning_rate": 0.0001, |
| "loss": 369010.9688, |
| "loss/crossentropy": 2.64787220954895, |
| "loss/hidden": 0.40625, |
| "loss/logits": 0.13141582906246185, |
| "loss/reg": 369007.78125, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.024875, |
| "grad_norm": 1.5851787328720093, |
| "grad_norm_var": 0.02135029757139719, |
| "learning_rate": 0.0001, |
| "loss": 366962.7812, |
| "loss/crossentropy": 3.0664515495300293, |
| "loss/hidden": 0.41015625, |
| "loss/logits": 0.12088686227798462, |
| "loss/reg": 366959.1875, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.0249, |
| "grad_norm": 1.5941710472106934, |
| "grad_norm_var": 0.005795159961090946, |
| "learning_rate": 0.0001, |
| "loss": 364927.5312, |
| "loss/crossentropy": 2.7351346015930176, |
| "loss/hidden": 0.4140625, |
| "loss/logits": 0.1389205902814865, |
| "loss/reg": 364924.21875, |
| "step": 996 |
| }, |
| { |
| "epoch": 0.024925, |
| "grad_norm": 1.509900450706482, |
| "grad_norm_var": 0.005572416712246309, |
| "learning_rate": 0.0001, |
| "loss": 362901.25, |
| "loss/crossentropy": 2.740540027618408, |
| "loss/hidden": 0.408203125, |
| "loss/logits": 0.1137368455529213, |
| "loss/reg": 362897.96875, |
| "step": 997 |
| }, |
| { |
| "epoch": 0.02495, |
| "grad_norm": 1.6150109767913818, |
| "grad_norm_var": 0.00575678589825633, |
| "learning_rate": 0.0001, |
| "loss": 360892.6875, |
| "loss/crossentropy": 2.779423475265503, |
| "loss/hidden": 0.38671875, |
| "loss/logits": 0.09358450770378113, |
| "loss/reg": 360889.4375, |
| "step": 998 |
| }, |
| { |
| "epoch": 0.024975, |
| "grad_norm": 1.5772761106491089, |
| "grad_norm_var": 0.00567463417362853, |
| "learning_rate": 0.0001, |
| "loss": 358897.0, |
| "loss/crossentropy": 2.7118468284606934, |
| "loss/hidden": 0.412109375, |
| "loss/logits": 0.12479842454195023, |
| "loss/reg": 358893.75, |
| "step": 999 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 1.7221057415008545, |
| "grad_norm_var": 0.007914643182362843, |
| "learning_rate": 0.0001, |
| "loss": 356913.7812, |
| "loss/crossentropy": 2.979400396347046, |
| "loss/hidden": 0.435546875, |
| "loss/logits": 0.12324725836515427, |
| "loss/reg": 356910.25, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 40000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.455688167424e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|