diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4418 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 9.999629974365234, + "learning_rate": 4.25531914893617e-08, + "loss": 1.0338, + "step": 5 + }, + { + "epoch": 0.0032, + "grad_norm": 7.134084224700928, + "learning_rate": 9.574468085106382e-08, + "loss": 1.1826, + "step": 10 + }, + { + "epoch": 0.0048, + "grad_norm": 5.787372589111328, + "learning_rate": 1.4893617021276595e-07, + "loss": 1.145, + "step": 15 + }, + { + "epoch": 0.0064, + "grad_norm": 13.603924751281738, + "learning_rate": 2.0212765957446807e-07, + "loss": 1.1501, + "step": 20 + }, + { + "epoch": 0.008, + "grad_norm": 13.660343170166016, + "learning_rate": 2.5531914893617016e-07, + "loss": 1.1397, + "step": 25 + }, + { + "epoch": 0.0096, + "grad_norm": 5.391168594360352, + "learning_rate": 3.085106382978723e-07, + "loss": 1.0871, + "step": 30 + }, + { + "epoch": 0.0112, + "grad_norm": 9.299227714538574, + "learning_rate": 3.617021276595745e-07, + "loss": 1.2235, + "step": 35 + }, + { + "epoch": 0.0128, + "grad_norm": 4.13670539855957, + "learning_rate": 4.148936170212766e-07, + "loss": 1.0978, + "step": 40 + }, + { + "epoch": 0.0144, + "grad_norm": 5.579084396362305, + "learning_rate": 4.6808510638297873e-07, + "loss": 1.2814, + "step": 45 + }, + { + "epoch": 0.016, + "grad_norm": 13.503495216369629, + "learning_rate": 5.212765957446809e-07, + "loss": 0.9962, + "step": 50 + }, + { + "epoch": 0.0176, + "grad_norm": 16.958608627319336, + "learning_rate": 5.74468085106383e-07, + "loss": 1.0797, + "step": 55 + }, + { + "epoch": 0.0192, + "grad_norm": 8.709441184997559, + "learning_rate": 6.276595744680851e-07, + "loss": 1.1269, + "step": 60 + }, + { + "epoch": 0.0208, + "grad_norm": 6.347010612487793, + "learning_rate": 6.808510638297872e-07, + "loss": 1.1201, + "step": 65 + }, + { + "epoch": 0.0224, + "grad_norm": 12.77128791809082, + "learning_rate": 7.340425531914893e-07, + "loss": 1.0846, + "step": 70 + }, + { + "epoch": 0.024, + "grad_norm": 8.114184379577637, + "learning_rate": 7.872340425531915e-07, + "loss": 1.151, + "step": 75 + }, + { + "epoch": 0.0256, + "grad_norm": 17.945985794067383, + "learning_rate": 8.404255319148936e-07, + "loss": 1.0711, + "step": 80 + }, + { + "epoch": 0.0272, + "grad_norm": 5.469882965087891, + "learning_rate": 8.936170212765957e-07, + "loss": 0.9901, + "step": 85 + }, + { + "epoch": 0.0288, + "grad_norm": 10.689380645751953, + "learning_rate": 9.468085106382978e-07, + "loss": 1.2635, + "step": 90 + }, + { + "epoch": 0.0304, + "grad_norm": 9.217676162719727, + "learning_rate": 1e-06, + "loss": 1.0604, + "step": 95 + }, + { + "epoch": 0.032, + "grad_norm": 4.253987789154053, + "learning_rate": 9.98350379412735e-07, + "loss": 1.1865, + "step": 100 + }, + { + "epoch": 0.0336, + "grad_norm": 10.984722137451172, + "learning_rate": 9.967007588254702e-07, + "loss": 1.1504, + "step": 105 + }, + { + "epoch": 0.0352, + "grad_norm": 6.79901647567749, + "learning_rate": 9.950511382382052e-07, + "loss": 1.0338, + "step": 110 + }, + { + "epoch": 0.0368, + "grad_norm": 10.826879501342773, + "learning_rate": 9.934015176509404e-07, + "loss": 1.1058, + "step": 115 + }, + { + "epoch": 0.0384, + "grad_norm": 6.437893390655518, + "learning_rate": 9.917518970636754e-07, + "loss": 1.0317, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 3.419424295425415, + "learning_rate": 9.901022764764103e-07, + "loss": 1.0573, + "step": 125 + }, + { + "epoch": 0.0416, + "grad_norm": 14.196756362915039, + "learning_rate": 9.884526558891456e-07, + "loss": 1.0892, + "step": 130 + }, + { + "epoch": 0.0432, + "grad_norm": 4.492501258850098, + "learning_rate": 9.868030353018806e-07, + "loss": 0.9772, + "step": 135 + }, + { + "epoch": 0.0448, + "grad_norm": 8.752493858337402, + "learning_rate": 9.851534147146155e-07, + "loss": 0.984, + "step": 140 + }, + { + "epoch": 0.0464, + "grad_norm": 3.8549039363861084, + "learning_rate": 9.835037941273505e-07, + "loss": 0.9935, + "step": 145 + }, + { + "epoch": 0.048, + "grad_norm": 5.623340129852295, + "learning_rate": 9.818541735400857e-07, + "loss": 1.1332, + "step": 150 + }, + { + "epoch": 0.0496, + "grad_norm": 5.848334789276123, + "learning_rate": 9.802045529528207e-07, + "loss": 0.912, + "step": 155 + }, + { + "epoch": 0.0512, + "grad_norm": 7.2841877937316895, + "learning_rate": 9.78554932365556e-07, + "loss": 1.1562, + "step": 160 + }, + { + "epoch": 0.0528, + "grad_norm": 5.735965728759766, + "learning_rate": 9.76905311778291e-07, + "loss": 0.9819, + "step": 165 + }, + { + "epoch": 0.0544, + "grad_norm": 3.197845220565796, + "learning_rate": 9.75255691191026e-07, + "loss": 0.9843, + "step": 170 + }, + { + "epoch": 0.056, + "grad_norm": 8.21580696105957, + "learning_rate": 9.736060706037611e-07, + "loss": 0.9533, + "step": 175 + }, + { + "epoch": 0.0576, + "grad_norm": 5.551872730255127, + "learning_rate": 9.719564500164961e-07, + "loss": 1.0205, + "step": 180 + }, + { + "epoch": 0.0592, + "grad_norm": 5.322945594787598, + "learning_rate": 9.703068294292313e-07, + "loss": 1.0036, + "step": 185 + }, + { + "epoch": 0.0608, + "grad_norm": 4.90363883972168, + "learning_rate": 9.686572088419663e-07, + "loss": 1.0135, + "step": 190 + }, + { + "epoch": 0.0624, + "grad_norm": 8.087169647216797, + "learning_rate": 9.670075882547013e-07, + "loss": 1.0293, + "step": 195 + }, + { + "epoch": 0.064, + "grad_norm": 4.1587910652160645, + "learning_rate": 9.653579676674365e-07, + "loss": 1.0719, + "step": 200 + }, + { + "epoch": 0.0656, + "grad_norm": 3.2837698459625244, + "learning_rate": 9.637083470801715e-07, + "loss": 1.0107, + "step": 205 + }, + { + "epoch": 0.0672, + "grad_norm": 5.359975814819336, + "learning_rate": 9.620587264929065e-07, + "loss": 0.9036, + "step": 210 + }, + { + "epoch": 0.0688, + "grad_norm": 5.2580037117004395, + "learning_rate": 9.604091059056415e-07, + "loss": 0.9704, + "step": 215 + }, + { + "epoch": 0.0704, + "grad_norm": 9.796117782592773, + "learning_rate": 9.587594853183767e-07, + "loss": 0.9456, + "step": 220 + }, + { + "epoch": 0.072, + "grad_norm": 10.24465560913086, + "learning_rate": 9.571098647311117e-07, + "loss": 1.0502, + "step": 225 + }, + { + "epoch": 0.0736, + "grad_norm": 7.899555683135986, + "learning_rate": 9.55460244143847e-07, + "loss": 0.9849, + "step": 230 + }, + { + "epoch": 0.0752, + "grad_norm": 6.677064418792725, + "learning_rate": 9.53810623556582e-07, + "loss": 1.103, + "step": 235 + }, + { + "epoch": 0.0768, + "grad_norm": 8.85185718536377, + "learning_rate": 9.52161002969317e-07, + "loss": 0.9351, + "step": 240 + }, + { + "epoch": 0.0784, + "grad_norm": 7.826456069946289, + "learning_rate": 9.505113823820521e-07, + "loss": 0.9369, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 7.929803371429443, + "learning_rate": 9.488617617947871e-07, + "loss": 0.8399, + "step": 250 + }, + { + "epoch": 0.0816, + "grad_norm": 4.857858180999756, + "learning_rate": 9.472121412075222e-07, + "loss": 0.9738, + "step": 255 + }, + { + "epoch": 0.0832, + "grad_norm": 7.609739303588867, + "learning_rate": 9.455625206202573e-07, + "loss": 1.0397, + "step": 260 + }, + { + "epoch": 0.0848, + "grad_norm": 4.687152862548828, + "learning_rate": 9.439129000329924e-07, + "loss": 1.0207, + "step": 265 + }, + { + "epoch": 0.0864, + "grad_norm": 3.707120180130005, + "learning_rate": 9.422632794457274e-07, + "loss": 1.0404, + "step": 270 + }, + { + "epoch": 0.088, + "grad_norm": 7.183946132659912, + "learning_rate": 9.406136588584625e-07, + "loss": 0.9329, + "step": 275 + }, + { + "epoch": 0.0896, + "grad_norm": 4.107320785522461, + "learning_rate": 9.389640382711976e-07, + "loss": 0.9674, + "step": 280 + }, + { + "epoch": 0.0912, + "grad_norm": 2.987569808959961, + "learning_rate": 9.373144176839326e-07, + "loss": 1.0431, + "step": 285 + }, + { + "epoch": 0.0928, + "grad_norm": 7.492343902587891, + "learning_rate": 9.356647970966677e-07, + "loss": 0.8819, + "step": 290 + }, + { + "epoch": 0.0944, + "grad_norm": 3.460360050201416, + "learning_rate": 9.340151765094027e-07, + "loss": 1.1553, + "step": 295 + }, + { + "epoch": 0.096, + "grad_norm": 4.670125961303711, + "learning_rate": 9.323655559221378e-07, + "loss": 0.9511, + "step": 300 + }, + { + "epoch": 0.0976, + "grad_norm": 6.766526699066162, + "learning_rate": 9.307159353348729e-07, + "loss": 0.9625, + "step": 305 + }, + { + "epoch": 0.0992, + "grad_norm": 3.6760966777801514, + "learning_rate": 9.29066314747608e-07, + "loss": 0.9503, + "step": 310 + }, + { + "epoch": 0.1008, + "grad_norm": 9.794249534606934, + "learning_rate": 9.27416694160343e-07, + "loss": 0.9533, + "step": 315 + }, + { + "epoch": 0.1024, + "grad_norm": 8.894811630249023, + "learning_rate": 9.257670735730781e-07, + "loss": 0.8668, + "step": 320 + }, + { + "epoch": 0.104, + "grad_norm": 3.3411269187927246, + "learning_rate": 9.241174529858132e-07, + "loss": 0.9171, + "step": 325 + }, + { + "epoch": 0.1056, + "grad_norm": 9.227668762207031, + "learning_rate": 9.224678323985483e-07, + "loss": 0.9065, + "step": 330 + }, + { + "epoch": 0.1072, + "grad_norm": 3.217501640319824, + "learning_rate": 9.208182118112834e-07, + "loss": 0.945, + "step": 335 + }, + { + "epoch": 0.1088, + "grad_norm": 3.9283738136291504, + "learning_rate": 9.191685912240184e-07, + "loss": 0.9166, + "step": 340 + }, + { + "epoch": 0.1104, + "grad_norm": 7.905593395233154, + "learning_rate": 9.175189706367535e-07, + "loss": 1.0471, + "step": 345 + }, + { + "epoch": 0.112, + "grad_norm": 3.8356964588165283, + "learning_rate": 9.158693500494886e-07, + "loss": 0.9593, + "step": 350 + }, + { + "epoch": 0.1136, + "grad_norm": 4.25161600112915, + "learning_rate": 9.142197294622237e-07, + "loss": 0.9251, + "step": 355 + }, + { + "epoch": 0.1152, + "grad_norm": 2.885007381439209, + "learning_rate": 9.125701088749587e-07, + "loss": 0.9124, + "step": 360 + }, + { + "epoch": 0.1168, + "grad_norm": 6.2251763343811035, + "learning_rate": 9.109204882876937e-07, + "loss": 1.0204, + "step": 365 + }, + { + "epoch": 0.1184, + "grad_norm": 5.769200801849365, + "learning_rate": 9.092708677004288e-07, + "loss": 0.8877, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 6.167758941650391, + "learning_rate": 9.076212471131639e-07, + "loss": 0.9371, + "step": 375 + }, + { + "epoch": 0.1216, + "grad_norm": 4.101468563079834, + "learning_rate": 9.05971626525899e-07, + "loss": 0.9571, + "step": 380 + }, + { + "epoch": 0.1232, + "grad_norm": 3.352560043334961, + "learning_rate": 9.04322005938634e-07, + "loss": 0.9988, + "step": 385 + }, + { + "epoch": 0.1248, + "grad_norm": 4.724895000457764, + "learning_rate": 9.026723853513691e-07, + "loss": 0.972, + "step": 390 + }, + { + "epoch": 0.1264, + "grad_norm": 5.613089561462402, + "learning_rate": 9.010227647641042e-07, + "loss": 0.8806, + "step": 395 + }, + { + "epoch": 0.128, + "grad_norm": 4.489896774291992, + "learning_rate": 8.993731441768393e-07, + "loss": 0.9119, + "step": 400 + }, + { + "epoch": 0.1296, + "grad_norm": 6.169275760650635, + "learning_rate": 8.977235235895744e-07, + "loss": 0.9942, + "step": 405 + }, + { + "epoch": 0.1312, + "grad_norm": 4.484738826751709, + "learning_rate": 8.960739030023094e-07, + "loss": 0.8567, + "step": 410 + }, + { + "epoch": 0.1328, + "grad_norm": 3.1110870838165283, + "learning_rate": 8.944242824150445e-07, + "loss": 0.912, + "step": 415 + }, + { + "epoch": 0.1344, + "grad_norm": 7.398248672485352, + "learning_rate": 8.927746618277796e-07, + "loss": 0.9913, + "step": 420 + }, + { + "epoch": 0.136, + "grad_norm": 6.651194095611572, + "learning_rate": 8.911250412405147e-07, + "loss": 0.8492, + "step": 425 + }, + { + "epoch": 0.1376, + "grad_norm": 6.037364482879639, + "learning_rate": 8.894754206532498e-07, + "loss": 0.9839, + "step": 430 + }, + { + "epoch": 0.1392, + "grad_norm": 3.9697699546813965, + "learning_rate": 8.878258000659847e-07, + "loss": 0.9343, + "step": 435 + }, + { + "epoch": 0.1408, + "grad_norm": 4.4155497550964355, + "learning_rate": 8.861761794787198e-07, + "loss": 0.9474, + "step": 440 + }, + { + "epoch": 0.1424, + "grad_norm": 4.292988300323486, + "learning_rate": 8.845265588914549e-07, + "loss": 1.0596, + "step": 445 + }, + { + "epoch": 0.144, + "grad_norm": 2.772756338119507, + "learning_rate": 8.8287693830419e-07, + "loss": 0.9765, + "step": 450 + }, + { + "epoch": 0.1456, + "grad_norm": 7.738980770111084, + "learning_rate": 8.81227317716925e-07, + "loss": 0.8599, + "step": 455 + }, + { + "epoch": 0.1472, + "grad_norm": 9.246415138244629, + "learning_rate": 8.795776971296601e-07, + "loss": 0.9711, + "step": 460 + }, + { + "epoch": 0.1488, + "grad_norm": 5.940875053405762, + "learning_rate": 8.779280765423952e-07, + "loss": 0.9433, + "step": 465 + }, + { + "epoch": 0.1504, + "grad_norm": 6.259022235870361, + "learning_rate": 8.762784559551303e-07, + "loss": 0.9859, + "step": 470 + }, + { + "epoch": 0.152, + "grad_norm": 7.941705226898193, + "learning_rate": 8.746288353678654e-07, + "loss": 0.8696, + "step": 475 + }, + { + "epoch": 0.1536, + "grad_norm": 3.571704626083374, + "learning_rate": 8.729792147806004e-07, + "loss": 0.943, + "step": 480 + }, + { + "epoch": 0.1552, + "grad_norm": 4.129303455352783, + "learning_rate": 8.713295941933355e-07, + "loss": 0.8251, + "step": 485 + }, + { + "epoch": 0.1568, + "grad_norm": 8.326216697692871, + "learning_rate": 8.696799736060706e-07, + "loss": 0.9393, + "step": 490 + }, + { + "epoch": 0.1584, + "grad_norm": 2.903012275695801, + "learning_rate": 8.680303530188057e-07, + "loss": 0.8944, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 5.4961628913879395, + "learning_rate": 8.663807324315408e-07, + "loss": 1.0191, + "step": 500 + }, + { + "epoch": 0.1616, + "grad_norm": 17.958810806274414, + "learning_rate": 8.647311118442758e-07, + "loss": 0.8689, + "step": 505 + }, + { + "epoch": 0.1632, + "grad_norm": 7.708248138427734, + "learning_rate": 8.630814912570108e-07, + "loss": 0.9344, + "step": 510 + }, + { + "epoch": 0.1648, + "grad_norm": 3.0089898109436035, + "learning_rate": 8.614318706697459e-07, + "loss": 0.9085, + "step": 515 + }, + { + "epoch": 0.1664, + "grad_norm": 3.333603858947754, + "learning_rate": 8.59782250082481e-07, + "loss": 0.9389, + "step": 520 + }, + { + "epoch": 0.168, + "grad_norm": 4.273075580596924, + "learning_rate": 8.58132629495216e-07, + "loss": 0.9545, + "step": 525 + }, + { + "epoch": 0.1696, + "grad_norm": 3.9365367889404297, + "learning_rate": 8.564830089079511e-07, + "loss": 1.044, + "step": 530 + }, + { + "epoch": 0.1712, + "grad_norm": 8.090559959411621, + "learning_rate": 8.548333883206862e-07, + "loss": 0.8949, + "step": 535 + }, + { + "epoch": 0.1728, + "grad_norm": 3.675675868988037, + "learning_rate": 8.531837677334213e-07, + "loss": 0.8984, + "step": 540 + }, + { + "epoch": 0.1744, + "grad_norm": 4.396546840667725, + "learning_rate": 8.515341471461564e-07, + "loss": 0.8276, + "step": 545 + }, + { + "epoch": 0.176, + "grad_norm": 5.8129706382751465, + "learning_rate": 8.498845265588914e-07, + "loss": 0.9485, + "step": 550 + }, + { + "epoch": 0.1776, + "grad_norm": 4.243994235992432, + "learning_rate": 8.482349059716265e-07, + "loss": 0.8613, + "step": 555 + }, + { + "epoch": 0.1792, + "grad_norm": 3.252338409423828, + "learning_rate": 8.465852853843616e-07, + "loss": 0.9014, + "step": 560 + }, + { + "epoch": 0.1808, + "grad_norm": 2.9563217163085938, + "learning_rate": 8.449356647970967e-07, + "loss": 0.9632, + "step": 565 + }, + { + "epoch": 0.1824, + "grad_norm": 4.2105302810668945, + "learning_rate": 8.432860442098317e-07, + "loss": 0.8758, + "step": 570 + }, + { + "epoch": 0.184, + "grad_norm": 4.756136417388916, + "learning_rate": 8.416364236225668e-07, + "loss": 0.9102, + "step": 575 + }, + { + "epoch": 0.1856, + "grad_norm": 3.9406795501708984, + "learning_rate": 8.399868030353019e-07, + "loss": 0.9286, + "step": 580 + }, + { + "epoch": 0.1872, + "grad_norm": 5.04837703704834, + "learning_rate": 8.383371824480369e-07, + "loss": 1.0353, + "step": 585 + }, + { + "epoch": 0.1888, + "grad_norm": 2.621293306350708, + "learning_rate": 8.36687561860772e-07, + "loss": 0.9047, + "step": 590 + }, + { + "epoch": 0.1904, + "grad_norm": 4.60697078704834, + "learning_rate": 8.35037941273507e-07, + "loss": 0.9403, + "step": 595 + }, + { + "epoch": 0.192, + "grad_norm": 3.5132675170898438, + "learning_rate": 8.333883206862421e-07, + "loss": 0.9576, + "step": 600 + }, + { + "epoch": 0.1936, + "grad_norm": 6.349253177642822, + "learning_rate": 8.317387000989772e-07, + "loss": 0.9443, + "step": 605 + }, + { + "epoch": 0.1952, + "grad_norm": 4.456959247589111, + "learning_rate": 8.300890795117123e-07, + "loss": 0.9476, + "step": 610 + }, + { + "epoch": 0.1968, + "grad_norm": 3.2254133224487305, + "learning_rate": 8.284394589244474e-07, + "loss": 0.9401, + "step": 615 + }, + { + "epoch": 0.1984, + "grad_norm": 4.775053977966309, + "learning_rate": 8.267898383371824e-07, + "loss": 1.0189, + "step": 620 + }, + { + "epoch": 0.2, + "grad_norm": 5.755006790161133, + "learning_rate": 8.251402177499175e-07, + "loss": 0.8521, + "step": 625 + }, + { + "epoch": 0.2016, + "grad_norm": 4.0426716804504395, + "learning_rate": 8.234905971626526e-07, + "loss": 0.9075, + "step": 630 + }, + { + "epoch": 0.2032, + "grad_norm": 4.348297119140625, + "learning_rate": 8.218409765753877e-07, + "loss": 0.9625, + "step": 635 + }, + { + "epoch": 0.2048, + "grad_norm": 6.654307842254639, + "learning_rate": 8.201913559881227e-07, + "loss": 0.8582, + "step": 640 + }, + { + "epoch": 0.2064, + "grad_norm": 4.090488910675049, + "learning_rate": 8.185417354008578e-07, + "loss": 0.9359, + "step": 645 + }, + { + "epoch": 0.208, + "grad_norm": 5.286700248718262, + "learning_rate": 8.168921148135929e-07, + "loss": 0.9783, + "step": 650 + }, + { + "epoch": 0.2096, + "grad_norm": 4.438581466674805, + "learning_rate": 8.15242494226328e-07, + "loss": 0.9526, + "step": 655 + }, + { + "epoch": 0.2112, + "grad_norm": 3.836512565612793, + "learning_rate": 8.13592873639063e-07, + "loss": 0.8768, + "step": 660 + }, + { + "epoch": 0.2128, + "grad_norm": 3.1037371158599854, + "learning_rate": 8.11943253051798e-07, + "loss": 0.963, + "step": 665 + }, + { + "epoch": 0.2144, + "grad_norm": 5.103884696960449, + "learning_rate": 8.102936324645331e-07, + "loss": 0.9322, + "step": 670 + }, + { + "epoch": 0.216, + "grad_norm": 3.707827091217041, + "learning_rate": 8.086440118772682e-07, + "loss": 0.8691, + "step": 675 + }, + { + "epoch": 0.2176, + "grad_norm": 3.7925477027893066, + "learning_rate": 8.069943912900033e-07, + "loss": 0.9636, + "step": 680 + }, + { + "epoch": 0.2192, + "grad_norm": 4.857919692993164, + "learning_rate": 8.053447707027383e-07, + "loss": 0.8867, + "step": 685 + }, + { + "epoch": 0.2208, + "grad_norm": 3.513091564178467, + "learning_rate": 8.036951501154734e-07, + "loss": 0.9646, + "step": 690 + }, + { + "epoch": 0.2224, + "grad_norm": 3.4893958568573, + "learning_rate": 8.020455295282085e-07, + "loss": 0.9276, + "step": 695 + }, + { + "epoch": 0.224, + "grad_norm": 3.087334156036377, + "learning_rate": 8.003959089409436e-07, + "loss": 0.89, + "step": 700 + }, + { + "epoch": 0.2256, + "grad_norm": 4.584767818450928, + "learning_rate": 7.987462883536787e-07, + "loss": 1.0421, + "step": 705 + }, + { + "epoch": 0.2272, + "grad_norm": 3.7277486324310303, + "learning_rate": 7.970966677664137e-07, + "loss": 0.8757, + "step": 710 + }, + { + "epoch": 0.2288, + "grad_norm": 4.989358425140381, + "learning_rate": 7.954470471791488e-07, + "loss": 0.8421, + "step": 715 + }, + { + "epoch": 0.2304, + "grad_norm": 4.230960845947266, + "learning_rate": 7.937974265918839e-07, + "loss": 0.883, + "step": 720 + }, + { + "epoch": 0.232, + "grad_norm": 7.118892192840576, + "learning_rate": 7.92147806004619e-07, + "loss": 0.9549, + "step": 725 + }, + { + "epoch": 0.2336, + "grad_norm": 10.041189193725586, + "learning_rate": 7.904981854173541e-07, + "loss": 0.9897, + "step": 730 + }, + { + "epoch": 0.2352, + "grad_norm": 3.822767734527588, + "learning_rate": 7.88848564830089e-07, + "loss": 0.8711, + "step": 735 + }, + { + "epoch": 0.2368, + "grad_norm": 4.475744724273682, + "learning_rate": 7.871989442428241e-07, + "loss": 0.9175, + "step": 740 + }, + { + "epoch": 0.2384, + "grad_norm": 5.1056976318359375, + "learning_rate": 7.855493236555592e-07, + "loss": 0.9022, + "step": 745 + }, + { + "epoch": 0.24, + "grad_norm": 4.522522926330566, + "learning_rate": 7.838997030682943e-07, + "loss": 1.0007, + "step": 750 + }, + { + "epoch": 0.2416, + "grad_norm": 4.390966415405273, + "learning_rate": 7.822500824810293e-07, + "loss": 0.9074, + "step": 755 + }, + { + "epoch": 0.2432, + "grad_norm": 7.068999290466309, + "learning_rate": 7.806004618937644e-07, + "loss": 0.8454, + "step": 760 + }, + { + "epoch": 0.2448, + "grad_norm": 3.558549642562866, + "learning_rate": 7.789508413064995e-07, + "loss": 0.88, + "step": 765 + }, + { + "epoch": 0.2464, + "grad_norm": 4.7729949951171875, + "learning_rate": 7.773012207192346e-07, + "loss": 0.8577, + "step": 770 + }, + { + "epoch": 0.248, + "grad_norm": 3.512878894805908, + "learning_rate": 7.756516001319697e-07, + "loss": 1.0939, + "step": 775 + }, + { + "epoch": 0.2496, + "grad_norm": 2.6263558864593506, + "learning_rate": 7.740019795447047e-07, + "loss": 0.8807, + "step": 780 + }, + { + "epoch": 0.2512, + "grad_norm": 2.518568992614746, + "learning_rate": 7.723523589574398e-07, + "loss": 0.7706, + "step": 785 + }, + { + "epoch": 0.2528, + "grad_norm": 5.156455993652344, + "learning_rate": 7.707027383701749e-07, + "loss": 0.8809, + "step": 790 + }, + { + "epoch": 0.2544, + "grad_norm": 7.734130382537842, + "learning_rate": 7.6905311778291e-07, + "loss": 0.967, + "step": 795 + }, + { + "epoch": 0.256, + "grad_norm": 2.6134588718414307, + "learning_rate": 7.674034971956451e-07, + "loss": 0.7965, + "step": 800 + }, + { + "epoch": 0.2576, + "grad_norm": 9.95977783203125, + "learning_rate": 7.657538766083801e-07, + "loss": 0.9095, + "step": 805 + }, + { + "epoch": 0.2592, + "grad_norm": 3.093651533126831, + "learning_rate": 7.64104256021115e-07, + "loss": 0.9707, + "step": 810 + }, + { + "epoch": 0.2608, + "grad_norm": 3.039573907852173, + "learning_rate": 7.624546354338501e-07, + "loss": 0.9317, + "step": 815 + }, + { + "epoch": 0.2624, + "grad_norm": 4.40300989151001, + "learning_rate": 7.608050148465853e-07, + "loss": 0.8223, + "step": 820 + }, + { + "epoch": 0.264, + "grad_norm": 5.527564525604248, + "learning_rate": 7.591553942593202e-07, + "loss": 0.9234, + "step": 825 + }, + { + "epoch": 0.2656, + "grad_norm": 5.654271602630615, + "learning_rate": 7.575057736720553e-07, + "loss": 0.9437, + "step": 830 + }, + { + "epoch": 0.2672, + "grad_norm": 5.316553115844727, + "learning_rate": 7.558561530847904e-07, + "loss": 0.8862, + "step": 835 + }, + { + "epoch": 0.2688, + "grad_norm": 2.8125505447387695, + "learning_rate": 7.542065324975255e-07, + "loss": 0.8235, + "step": 840 + }, + { + "epoch": 0.2704, + "grad_norm": 5.254530429840088, + "learning_rate": 7.525569119102606e-07, + "loss": 0.8986, + "step": 845 + }, + { + "epoch": 0.272, + "grad_norm": 5.69275426864624, + "learning_rate": 7.509072913229956e-07, + "loss": 0.9043, + "step": 850 + }, + { + "epoch": 0.2736, + "grad_norm": 4.995587348937988, + "learning_rate": 7.492576707357307e-07, + "loss": 0.8242, + "step": 855 + }, + { + "epoch": 0.2752, + "grad_norm": 8.568499565124512, + "learning_rate": 7.476080501484658e-07, + "loss": 0.9116, + "step": 860 + }, + { + "epoch": 0.2768, + "grad_norm": 5.804699420928955, + "learning_rate": 7.45958429561201e-07, + "loss": 0.863, + "step": 865 + }, + { + "epoch": 0.2784, + "grad_norm": 7.921741962432861, + "learning_rate": 7.44308808973936e-07, + "loss": 0.9719, + "step": 870 + }, + { + "epoch": 0.28, + "grad_norm": 5.888009071350098, + "learning_rate": 7.42659188386671e-07, + "loss": 0.8999, + "step": 875 + }, + { + "epoch": 0.2816, + "grad_norm": 3.9009482860565186, + "learning_rate": 7.410095677994061e-07, + "loss": 0.9543, + "step": 880 + }, + { + "epoch": 0.2832, + "grad_norm": 3.502060651779175, + "learning_rate": 7.393599472121411e-07, + "loss": 0.8367, + "step": 885 + }, + { + "epoch": 0.2848, + "grad_norm": 4.675789833068848, + "learning_rate": 7.377103266248762e-07, + "loss": 0.8843, + "step": 890 + }, + { + "epoch": 0.2864, + "grad_norm": 3.246445417404175, + "learning_rate": 7.360607060376112e-07, + "loss": 0.9663, + "step": 895 + }, + { + "epoch": 0.288, + "grad_norm": 3.168081521987915, + "learning_rate": 7.344110854503463e-07, + "loss": 0.8186, + "step": 900 + }, + { + "epoch": 0.2896, + "grad_norm": 5.229098320007324, + "learning_rate": 7.327614648630814e-07, + "loss": 0.9686, + "step": 905 + }, + { + "epoch": 0.2912, + "grad_norm": 6.258688926696777, + "learning_rate": 7.311118442758165e-07, + "loss": 0.8538, + "step": 910 + }, + { + "epoch": 0.2928, + "grad_norm": 5.021489143371582, + "learning_rate": 7.294622236885516e-07, + "loss": 0.7937, + "step": 915 + }, + { + "epoch": 0.2944, + "grad_norm": 3.9986860752105713, + "learning_rate": 7.278126031012866e-07, + "loss": 0.8503, + "step": 920 + }, + { + "epoch": 0.296, + "grad_norm": 3.62813663482666, + "learning_rate": 7.261629825140217e-07, + "loss": 0.8199, + "step": 925 + }, + { + "epoch": 0.2976, + "grad_norm": 4.517638683319092, + "learning_rate": 7.245133619267568e-07, + "loss": 0.8521, + "step": 930 + }, + { + "epoch": 0.2992, + "grad_norm": 8.5663423538208, + "learning_rate": 7.228637413394919e-07, + "loss": 0.9132, + "step": 935 + }, + { + "epoch": 0.3008, + "grad_norm": 4.776329040527344, + "learning_rate": 7.212141207522269e-07, + "loss": 0.8566, + "step": 940 + }, + { + "epoch": 0.3024, + "grad_norm": 4.764708518981934, + "learning_rate": 7.19564500164962e-07, + "loss": 0.8181, + "step": 945 + }, + { + "epoch": 0.304, + "grad_norm": 3.810011863708496, + "learning_rate": 7.179148795776971e-07, + "loss": 1.003, + "step": 950 + }, + { + "epoch": 0.3056, + "grad_norm": 4.385900020599365, + "learning_rate": 7.162652589904322e-07, + "loss": 0.8525, + "step": 955 + }, + { + "epoch": 0.3072, + "grad_norm": 8.322861671447754, + "learning_rate": 7.146156384031672e-07, + "loss": 0.9047, + "step": 960 + }, + { + "epoch": 0.3088, + "grad_norm": 6.5899224281311035, + "learning_rate": 7.129660178159022e-07, + "loss": 0.7103, + "step": 965 + }, + { + "epoch": 0.3104, + "grad_norm": 6.054207801818848, + "learning_rate": 7.113163972286373e-07, + "loss": 0.876, + "step": 970 + }, + { + "epoch": 0.312, + "grad_norm": 3.6956799030303955, + "learning_rate": 7.096667766413724e-07, + "loss": 0.9382, + "step": 975 + }, + { + "epoch": 0.3136, + "grad_norm": 4.055649757385254, + "learning_rate": 7.080171560541075e-07, + "loss": 0.8944, + "step": 980 + }, + { + "epoch": 0.3152, + "grad_norm": 2.8398051261901855, + "learning_rate": 7.063675354668426e-07, + "loss": 0.8233, + "step": 985 + }, + { + "epoch": 0.3168, + "grad_norm": 5.903645992279053, + "learning_rate": 7.047179148795776e-07, + "loss": 1.0067, + "step": 990 + }, + { + "epoch": 0.3184, + "grad_norm": 5.374630451202393, + "learning_rate": 7.030682942923127e-07, + "loss": 0.8423, + "step": 995 + }, + { + "epoch": 0.32, + "grad_norm": 6.729516506195068, + "learning_rate": 7.014186737050478e-07, + "loss": 1.0209, + "step": 1000 + }, + { + "epoch": 0.3216, + "grad_norm": 3.4207727909088135, + "learning_rate": 6.997690531177829e-07, + "loss": 0.9196, + "step": 1005 + }, + { + "epoch": 0.3232, + "grad_norm": 5.797353744506836, + "learning_rate": 6.981194325305179e-07, + "loss": 0.921, + "step": 1010 + }, + { + "epoch": 0.3248, + "grad_norm": 4.802167892456055, + "learning_rate": 6.96469811943253e-07, + "loss": 0.8844, + "step": 1015 + }, + { + "epoch": 0.3264, + "grad_norm": 6.671936511993408, + "learning_rate": 6.948201913559881e-07, + "loss": 0.8399, + "step": 1020 + }, + { + "epoch": 0.328, + "grad_norm": 4.027926921844482, + "learning_rate": 6.931705707687232e-07, + "loss": 0.8733, + "step": 1025 + }, + { + "epoch": 0.3296, + "grad_norm": 7.0996575355529785, + "learning_rate": 6.915209501814583e-07, + "loss": 0.8886, + "step": 1030 + }, + { + "epoch": 0.3312, + "grad_norm": 3.9534752368927, + "learning_rate": 6.898713295941932e-07, + "loss": 0.9144, + "step": 1035 + }, + { + "epoch": 0.3328, + "grad_norm": 5.016911506652832, + "learning_rate": 6.882217090069283e-07, + "loss": 0.9936, + "step": 1040 + }, + { + "epoch": 0.3344, + "grad_norm": 3.6231181621551514, + "learning_rate": 6.865720884196634e-07, + "loss": 0.7205, + "step": 1045 + }, + { + "epoch": 0.336, + "grad_norm": 3.6556529998779297, + "learning_rate": 6.849224678323985e-07, + "loss": 0.914, + "step": 1050 + }, + { + "epoch": 0.3376, + "grad_norm": 3.27970552444458, + "learning_rate": 6.832728472451335e-07, + "loss": 0.8306, + "step": 1055 + }, + { + "epoch": 0.3392, + "grad_norm": 3.816570997238159, + "learning_rate": 6.816232266578686e-07, + "loss": 0.9458, + "step": 1060 + }, + { + "epoch": 0.3408, + "grad_norm": 7.391907215118408, + "learning_rate": 6.799736060706037e-07, + "loss": 0.8389, + "step": 1065 + }, + { + "epoch": 0.3424, + "grad_norm": 3.756998300552368, + "learning_rate": 6.783239854833388e-07, + "loss": 0.8889, + "step": 1070 + }, + { + "epoch": 0.344, + "grad_norm": 4.19740629196167, + "learning_rate": 6.766743648960739e-07, + "loss": 0.8942, + "step": 1075 + }, + { + "epoch": 0.3456, + "grad_norm": 4.351824760437012, + "learning_rate": 6.750247443088089e-07, + "loss": 0.9602, + "step": 1080 + }, + { + "epoch": 0.3472, + "grad_norm": 3.371953010559082, + "learning_rate": 6.73375123721544e-07, + "loss": 0.9365, + "step": 1085 + }, + { + "epoch": 0.3488, + "grad_norm": 5.847254753112793, + "learning_rate": 6.717255031342791e-07, + "loss": 0.7705, + "step": 1090 + }, + { + "epoch": 0.3504, + "grad_norm": 5.160730361938477, + "learning_rate": 6.700758825470142e-07, + "loss": 0.9221, + "step": 1095 + }, + { + "epoch": 0.352, + "grad_norm": 3.4430952072143555, + "learning_rate": 6.684262619597493e-07, + "loss": 0.9484, + "step": 1100 + }, + { + "epoch": 0.3536, + "grad_norm": 4.218683242797852, + "learning_rate": 6.667766413724843e-07, + "loss": 0.8446, + "step": 1105 + }, + { + "epoch": 0.3552, + "grad_norm": 5.120244026184082, + "learning_rate": 6.651270207852193e-07, + "loss": 0.9515, + "step": 1110 + }, + { + "epoch": 0.3568, + "grad_norm": 5.609252452850342, + "learning_rate": 6.634774001979544e-07, + "loss": 0.8694, + "step": 1115 + }, + { + "epoch": 0.3584, + "grad_norm": 3.753680467605591, + "learning_rate": 6.618277796106895e-07, + "loss": 0.9242, + "step": 1120 + }, + { + "epoch": 0.36, + "grad_norm": 3.665069580078125, + "learning_rate": 6.601781590234245e-07, + "loss": 0.9034, + "step": 1125 + }, + { + "epoch": 0.3616, + "grad_norm": 4.715619087219238, + "learning_rate": 6.585285384361596e-07, + "loss": 0.8842, + "step": 1130 + }, + { + "epoch": 0.3632, + "grad_norm": 4.438577651977539, + "learning_rate": 6.568789178488947e-07, + "loss": 0.9432, + "step": 1135 + }, + { + "epoch": 0.3648, + "grad_norm": 3.8930490016937256, + "learning_rate": 6.552292972616298e-07, + "loss": 0.8113, + "step": 1140 + }, + { + "epoch": 0.3664, + "grad_norm": 4.182096004486084, + "learning_rate": 6.535796766743649e-07, + "loss": 0.7999, + "step": 1145 + }, + { + "epoch": 0.368, + "grad_norm": 5.353331089019775, + "learning_rate": 6.519300560870999e-07, + "loss": 0.8499, + "step": 1150 + }, + { + "epoch": 0.3696, + "grad_norm": 3.641796588897705, + "learning_rate": 6.50280435499835e-07, + "loss": 0.9247, + "step": 1155 + }, + { + "epoch": 0.3712, + "grad_norm": 4.2418646812438965, + "learning_rate": 6.486308149125701e-07, + "loss": 1.0406, + "step": 1160 + }, + { + "epoch": 0.3728, + "grad_norm": 2.948838233947754, + "learning_rate": 6.469811943253052e-07, + "loss": 0.8411, + "step": 1165 + }, + { + "epoch": 0.3744, + "grad_norm": 7.832685947418213, + "learning_rate": 6.453315737380403e-07, + "loss": 0.9339, + "step": 1170 + }, + { + "epoch": 0.376, + "grad_norm": 3.882305145263672, + "learning_rate": 6.436819531507753e-07, + "loss": 0.9936, + "step": 1175 + }, + { + "epoch": 0.3776, + "grad_norm": 6.916220664978027, + "learning_rate": 6.420323325635104e-07, + "loss": 1.0441, + "step": 1180 + }, + { + "epoch": 0.3792, + "grad_norm": 6.770009517669678, + "learning_rate": 6.403827119762454e-07, + "loss": 0.9299, + "step": 1185 + }, + { + "epoch": 0.3808, + "grad_norm": 4.584465980529785, + "learning_rate": 6.387330913889805e-07, + "loss": 0.9169, + "step": 1190 + }, + { + "epoch": 0.3824, + "grad_norm": 8.000226974487305, + "learning_rate": 6.370834708017155e-07, + "loss": 1.0345, + "step": 1195 + }, + { + "epoch": 0.384, + "grad_norm": 13.314818382263184, + "learning_rate": 6.354338502144506e-07, + "loss": 0.919, + "step": 1200 + }, + { + "epoch": 0.3856, + "grad_norm": 3.7661311626434326, + "learning_rate": 6.337842296271857e-07, + "loss": 0.9678, + "step": 1205 + }, + { + "epoch": 0.3872, + "grad_norm": 4.133317470550537, + "learning_rate": 6.321346090399208e-07, + "loss": 0.9533, + "step": 1210 + }, + { + "epoch": 0.3888, + "grad_norm": 2.6607346534729004, + "learning_rate": 6.304849884526559e-07, + "loss": 0.9946, + "step": 1215 + }, + { + "epoch": 0.3904, + "grad_norm": 3.8332831859588623, + "learning_rate": 6.288353678653909e-07, + "loss": 0.8293, + "step": 1220 + }, + { + "epoch": 0.392, + "grad_norm": 3.6170003414154053, + "learning_rate": 6.27185747278126e-07, + "loss": 0.8952, + "step": 1225 + }, + { + "epoch": 0.3936, + "grad_norm": 5.026386737823486, + "learning_rate": 6.255361266908611e-07, + "loss": 0.917, + "step": 1230 + }, + { + "epoch": 0.3952, + "grad_norm": 10.285544395446777, + "learning_rate": 6.238865061035962e-07, + "loss": 0.8624, + "step": 1235 + }, + { + "epoch": 0.3968, + "grad_norm": 2.899703025817871, + "learning_rate": 6.222368855163313e-07, + "loss": 0.9242, + "step": 1240 + }, + { + "epoch": 0.3984, + "grad_norm": 6.087869167327881, + "learning_rate": 6.205872649290663e-07, + "loss": 0.8574, + "step": 1245 + }, + { + "epoch": 0.4, + "grad_norm": 4.513827323913574, + "learning_rate": 6.189376443418014e-07, + "loss": 0.9205, + "step": 1250 + }, + { + "epoch": 0.4016, + "grad_norm": 3.0381855964660645, + "learning_rate": 6.172880237545365e-07, + "loss": 0.8779, + "step": 1255 + }, + { + "epoch": 0.4032, + "grad_norm": 9.265677452087402, + "learning_rate": 6.156384031672715e-07, + "loss": 0.8645, + "step": 1260 + }, + { + "epoch": 0.4048, + "grad_norm": 5.159205436706543, + "learning_rate": 6.139887825800065e-07, + "loss": 0.7365, + "step": 1265 + }, + { + "epoch": 0.4064, + "grad_norm": 4.157783031463623, + "learning_rate": 6.123391619927416e-07, + "loss": 1.003, + "step": 1270 + }, + { + "epoch": 0.408, + "grad_norm": 4.129422187805176, + "learning_rate": 6.106895414054767e-07, + "loss": 1.0333, + "step": 1275 + }, + { + "epoch": 0.4096, + "grad_norm": 5.1480536460876465, + "learning_rate": 6.090399208182118e-07, + "loss": 0.9097, + "step": 1280 + }, + { + "epoch": 0.4112, + "grad_norm": 6.21195650100708, + "learning_rate": 6.073903002309469e-07, + "loss": 0.9753, + "step": 1285 + }, + { + "epoch": 0.4128, + "grad_norm": 4.375741481781006, + "learning_rate": 6.057406796436819e-07, + "loss": 0.8613, + "step": 1290 + }, + { + "epoch": 0.4144, + "grad_norm": 6.381781578063965, + "learning_rate": 6.04091059056417e-07, + "loss": 0.8447, + "step": 1295 + }, + { + "epoch": 0.416, + "grad_norm": 6.689861297607422, + "learning_rate": 6.024414384691521e-07, + "loss": 0.9417, + "step": 1300 + }, + { + "epoch": 0.4176, + "grad_norm": 7.573152542114258, + "learning_rate": 6.007918178818872e-07, + "loss": 0.8559, + "step": 1305 + }, + { + "epoch": 0.4192, + "grad_norm": 3.2965288162231445, + "learning_rate": 5.991421972946222e-07, + "loss": 0.8483, + "step": 1310 + }, + { + "epoch": 0.4208, + "grad_norm": 3.5138070583343506, + "learning_rate": 5.974925767073573e-07, + "loss": 0.7122, + "step": 1315 + }, + { + "epoch": 0.4224, + "grad_norm": 3.6955363750457764, + "learning_rate": 5.958429561200924e-07, + "loss": 0.8294, + "step": 1320 + }, + { + "epoch": 0.424, + "grad_norm": 3.6129462718963623, + "learning_rate": 5.941933355328275e-07, + "loss": 1.0534, + "step": 1325 + }, + { + "epoch": 0.4256, + "grad_norm": 4.549454212188721, + "learning_rate": 5.925437149455626e-07, + "loss": 0.8865, + "step": 1330 + }, + { + "epoch": 0.4272, + "grad_norm": 5.264537811279297, + "learning_rate": 5.908940943582975e-07, + "loss": 0.9219, + "step": 1335 + }, + { + "epoch": 0.4288, + "grad_norm": 5.2592267990112305, + "learning_rate": 5.892444737710326e-07, + "loss": 0.8414, + "step": 1340 + }, + { + "epoch": 0.4304, + "grad_norm": 7.621036529541016, + "learning_rate": 5.875948531837677e-07, + "loss": 0.8553, + "step": 1345 + }, + { + "epoch": 0.432, + "grad_norm": 4.718931674957275, + "learning_rate": 5.859452325965028e-07, + "loss": 1.0172, + "step": 1350 + }, + { + "epoch": 0.4336, + "grad_norm": 3.382890224456787, + "learning_rate": 5.842956120092379e-07, + "loss": 0.9208, + "step": 1355 + }, + { + "epoch": 0.4352, + "grad_norm": 5.720285415649414, + "learning_rate": 5.826459914219729e-07, + "loss": 0.9376, + "step": 1360 + }, + { + "epoch": 0.4368, + "grad_norm": 6.348506450653076, + "learning_rate": 5.80996370834708e-07, + "loss": 0.8526, + "step": 1365 + }, + { + "epoch": 0.4384, + "grad_norm": 3.639775276184082, + "learning_rate": 5.793467502474431e-07, + "loss": 0.9067, + "step": 1370 + }, + { + "epoch": 0.44, + "grad_norm": 4.016887187957764, + "learning_rate": 5.776971296601782e-07, + "loss": 0.9526, + "step": 1375 + }, + { + "epoch": 0.4416, + "grad_norm": 6.822007656097412, + "learning_rate": 5.760475090729132e-07, + "loss": 0.776, + "step": 1380 + }, + { + "epoch": 0.4432, + "grad_norm": 4.3558125495910645, + "learning_rate": 5.743978884856483e-07, + "loss": 0.8932, + "step": 1385 + }, + { + "epoch": 0.4448, + "grad_norm": 6.4160475730896, + "learning_rate": 5.727482678983834e-07, + "loss": 0.9081, + "step": 1390 + }, + { + "epoch": 0.4464, + "grad_norm": 3.1769394874572754, + "learning_rate": 5.710986473111185e-07, + "loss": 0.8688, + "step": 1395 + }, + { + "epoch": 0.448, + "grad_norm": 3.9851887226104736, + "learning_rate": 5.694490267238536e-07, + "loss": 0.877, + "step": 1400 + }, + { + "epoch": 0.4496, + "grad_norm": 5.6506571769714355, + "learning_rate": 5.677994061365886e-07, + "loss": 0.8325, + "step": 1405 + }, + { + "epoch": 0.4512, + "grad_norm": 3.549743175506592, + "learning_rate": 5.661497855493236e-07, + "loss": 0.8987, + "step": 1410 + }, + { + "epoch": 0.4528, + "grad_norm": 3.143094062805176, + "learning_rate": 5.645001649620587e-07, + "loss": 0.9429, + "step": 1415 + }, + { + "epoch": 0.4544, + "grad_norm": 8.156094551086426, + "learning_rate": 5.628505443747938e-07, + "loss": 0.7903, + "step": 1420 + }, + { + "epoch": 0.456, + "grad_norm": 4.86202335357666, + "learning_rate": 5.612009237875289e-07, + "loss": 0.933, + "step": 1425 + }, + { + "epoch": 0.4576, + "grad_norm": 5.636049270629883, + "learning_rate": 5.595513032002639e-07, + "loss": 0.921, + "step": 1430 + }, + { + "epoch": 0.4592, + "grad_norm": 3.5446226596832275, + "learning_rate": 5.57901682612999e-07, + "loss": 0.8239, + "step": 1435 + }, + { + "epoch": 0.4608, + "grad_norm": 3.516528606414795, + "learning_rate": 5.562520620257341e-07, + "loss": 0.894, + "step": 1440 + }, + { + "epoch": 0.4624, + "grad_norm": 3.1388487815856934, + "learning_rate": 5.546024414384692e-07, + "loss": 0.9401, + "step": 1445 + }, + { + "epoch": 0.464, + "grad_norm": 3.378370761871338, + "learning_rate": 5.529528208512042e-07, + "loss": 0.8786, + "step": 1450 + }, + { + "epoch": 0.4656, + "grad_norm": 4.898928165435791, + "learning_rate": 5.513032002639393e-07, + "loss": 0.8457, + "step": 1455 + }, + { + "epoch": 0.4672, + "grad_norm": 8.320155143737793, + "learning_rate": 5.496535796766744e-07, + "loss": 0.9857, + "step": 1460 + }, + { + "epoch": 0.4688, + "grad_norm": 12.393474578857422, + "learning_rate": 5.480039590894095e-07, + "loss": 1.0179, + "step": 1465 + }, + { + "epoch": 0.4704, + "grad_norm": 3.834761142730713, + "learning_rate": 5.463543385021446e-07, + "loss": 0.8355, + "step": 1470 + }, + { + "epoch": 0.472, + "grad_norm": 5.7657694816589355, + "learning_rate": 5.447047179148796e-07, + "loss": 0.906, + "step": 1475 + }, + { + "epoch": 0.4736, + "grad_norm": 2.89928936958313, + "learning_rate": 5.430550973276147e-07, + "loss": 0.8453, + "step": 1480 + }, + { + "epoch": 0.4752, + "grad_norm": 3.30023455619812, + "learning_rate": 5.414054767403497e-07, + "loss": 0.863, + "step": 1485 + }, + { + "epoch": 0.4768, + "grad_norm": 6.904449462890625, + "learning_rate": 5.397558561530848e-07, + "loss": 0.8722, + "step": 1490 + }, + { + "epoch": 0.4784, + "grad_norm": 2.936325788497925, + "learning_rate": 5.381062355658197e-07, + "loss": 0.8565, + "step": 1495 + }, + { + "epoch": 0.48, + "grad_norm": 6.707699775695801, + "learning_rate": 5.364566149785548e-07, + "loss": 0.827, + "step": 1500 + }, + { + "epoch": 0.4816, + "grad_norm": 3.5800673961639404, + "learning_rate": 5.3480699439129e-07, + "loss": 0.8582, + "step": 1505 + }, + { + "epoch": 0.4832, + "grad_norm": 5.940330505371094, + "learning_rate": 5.33157373804025e-07, + "loss": 0.9983, + "step": 1510 + }, + { + "epoch": 0.4848, + "grad_norm": 4.438694000244141, + "learning_rate": 5.315077532167602e-07, + "loss": 0.8307, + "step": 1515 + }, + { + "epoch": 0.4864, + "grad_norm": 6.149857044219971, + "learning_rate": 5.298581326294951e-07, + "loss": 0.9128, + "step": 1520 + }, + { + "epoch": 0.488, + "grad_norm": 3.874925136566162, + "learning_rate": 5.282085120422302e-07, + "loss": 0.966, + "step": 1525 + }, + { + "epoch": 0.4896, + "grad_norm": 17.836402893066406, + "learning_rate": 5.265588914549653e-07, + "loss": 0.8737, + "step": 1530 + }, + { + "epoch": 0.4912, + "grad_norm": 5.488133430480957, + "learning_rate": 5.249092708677005e-07, + "loss": 0.9353, + "step": 1535 + }, + { + "epoch": 0.4928, + "grad_norm": 4.590605735778809, + "learning_rate": 5.232596502804356e-07, + "loss": 0.8757, + "step": 1540 + }, + { + "epoch": 0.4944, + "grad_norm": 3.0040788650512695, + "learning_rate": 5.216100296931705e-07, + "loss": 0.8116, + "step": 1545 + }, + { + "epoch": 0.496, + "grad_norm": 3.2542080879211426, + "learning_rate": 5.199604091059056e-07, + "loss": 0.9621, + "step": 1550 + }, + { + "epoch": 0.4976, + "grad_norm": 4.786755561828613, + "learning_rate": 5.183107885186407e-07, + "loss": 0.9551, + "step": 1555 + }, + { + "epoch": 0.4992, + "grad_norm": 5.058788299560547, + "learning_rate": 5.166611679313757e-07, + "loss": 0.7788, + "step": 1560 + }, + { + "epoch": 0.5008, + "grad_norm": 6.7137131690979, + "learning_rate": 5.150115473441107e-07, + "loss": 0.8605, + "step": 1565 + }, + { + "epoch": 0.5024, + "grad_norm": 5.942770004272461, + "learning_rate": 5.133619267568458e-07, + "loss": 0.8731, + "step": 1570 + }, + { + "epoch": 0.504, + "grad_norm": 3.7935171127319336, + "learning_rate": 5.117123061695809e-07, + "loss": 0.8126, + "step": 1575 + }, + { + "epoch": 0.5056, + "grad_norm": 3.8675737380981445, + "learning_rate": 5.10062685582316e-07, + "loss": 0.8163, + "step": 1580 + }, + { + "epoch": 0.5072, + "grad_norm": 3.6353890895843506, + "learning_rate": 5.084130649950511e-07, + "loss": 0.8407, + "step": 1585 + }, + { + "epoch": 0.5088, + "grad_norm": 6.919312477111816, + "learning_rate": 5.067634444077861e-07, + "loss": 0.8906, + "step": 1590 + }, + { + "epoch": 0.5104, + "grad_norm": 4.731250286102295, + "learning_rate": 5.051138238205212e-07, + "loss": 0.7421, + "step": 1595 + }, + { + "epoch": 0.512, + "grad_norm": 3.6495304107666016, + "learning_rate": 5.034642032332563e-07, + "loss": 0.8691, + "step": 1600 + }, + { + "epoch": 0.5136, + "grad_norm": 3.6082992553710938, + "learning_rate": 5.018145826459914e-07, + "loss": 0.9252, + "step": 1605 + }, + { + "epoch": 0.5152, + "grad_norm": 2.5912933349609375, + "learning_rate": 5.001649620587265e-07, + "loss": 0.8538, + "step": 1610 + }, + { + "epoch": 0.5168, + "grad_norm": 7.729884624481201, + "learning_rate": 4.985153414714615e-07, + "loss": 0.7578, + "step": 1615 + }, + { + "epoch": 0.5184, + "grad_norm": 4.614051342010498, + "learning_rate": 4.968657208841966e-07, + "loss": 0.8363, + "step": 1620 + }, + { + "epoch": 0.52, + "grad_norm": 3.5848758220672607, + "learning_rate": 4.952161002969316e-07, + "loss": 0.902, + "step": 1625 + }, + { + "epoch": 0.5216, + "grad_norm": 4.744536399841309, + "learning_rate": 4.935664797096667e-07, + "loss": 0.8356, + "step": 1630 + }, + { + "epoch": 0.5232, + "grad_norm": 6.719925880432129, + "learning_rate": 4.919168591224018e-07, + "loss": 0.8663, + "step": 1635 + }, + { + "epoch": 0.5248, + "grad_norm": 5.994638442993164, + "learning_rate": 4.902672385351369e-07, + "loss": 0.8854, + "step": 1640 + }, + { + "epoch": 0.5264, + "grad_norm": 3.5340418815612793, + "learning_rate": 4.88617617947872e-07, + "loss": 0.738, + "step": 1645 + }, + { + "epoch": 0.528, + "grad_norm": 4.414712905883789, + "learning_rate": 4.86967997360607e-07, + "loss": 0.8637, + "step": 1650 + }, + { + "epoch": 0.5296, + "grad_norm": 3.8119003772735596, + "learning_rate": 4.853183767733421e-07, + "loss": 0.8948, + "step": 1655 + }, + { + "epoch": 0.5312, + "grad_norm": 3.453695058822632, + "learning_rate": 4.836687561860771e-07, + "loss": 0.9324, + "step": 1660 + }, + { + "epoch": 0.5328, + "grad_norm": 8.695696830749512, + "learning_rate": 4.820191355988122e-07, + "loss": 0.8486, + "step": 1665 + }, + { + "epoch": 0.5344, + "grad_norm": 3.696438789367676, + "learning_rate": 4.803695150115473e-07, + "loss": 0.8801, + "step": 1670 + }, + { + "epoch": 0.536, + "grad_norm": 5.598580360412598, + "learning_rate": 4.787198944242824e-07, + "loss": 0.9598, + "step": 1675 + }, + { + "epoch": 0.5376, + "grad_norm": 7.447549819946289, + "learning_rate": 4.770702738370175e-07, + "loss": 0.7981, + "step": 1680 + }, + { + "epoch": 0.5392, + "grad_norm": 3.8933768272399902, + "learning_rate": 4.754206532497526e-07, + "loss": 0.7836, + "step": 1685 + }, + { + "epoch": 0.5408, + "grad_norm": 4.233343124389648, + "learning_rate": 4.737710326624876e-07, + "loss": 0.9953, + "step": 1690 + }, + { + "epoch": 0.5424, + "grad_norm": 4.121957302093506, + "learning_rate": 4.721214120752227e-07, + "loss": 0.8909, + "step": 1695 + }, + { + "epoch": 0.544, + "grad_norm": 3.326876640319824, + "learning_rate": 4.704717914879577e-07, + "loss": 0.8207, + "step": 1700 + }, + { + "epoch": 0.5456, + "grad_norm": 4.2965006828308105, + "learning_rate": 4.688221709006928e-07, + "loss": 0.7338, + "step": 1705 + }, + { + "epoch": 0.5472, + "grad_norm": 12.319995880126953, + "learning_rate": 4.6717255031342787e-07, + "loss": 0.8716, + "step": 1710 + }, + { + "epoch": 0.5488, + "grad_norm": 3.4306647777557373, + "learning_rate": 4.6552292972616297e-07, + "loss": 0.8844, + "step": 1715 + }, + { + "epoch": 0.5504, + "grad_norm": 3.8839616775512695, + "learning_rate": 4.638733091388981e-07, + "loss": 0.6966, + "step": 1720 + }, + { + "epoch": 0.552, + "grad_norm": 4.802063941955566, + "learning_rate": 4.622236885516331e-07, + "loss": 1.0128, + "step": 1725 + }, + { + "epoch": 0.5536, + "grad_norm": 3.7047386169433594, + "learning_rate": 4.6057406796436817e-07, + "loss": 0.7706, + "step": 1730 + }, + { + "epoch": 0.5552, + "grad_norm": 5.304298400878906, + "learning_rate": 4.589244473771032e-07, + "loss": 0.7627, + "step": 1735 + }, + { + "epoch": 0.5568, + "grad_norm": 3.211620330810547, + "learning_rate": 4.572748267898383e-07, + "loss": 0.8409, + "step": 1740 + }, + { + "epoch": 0.5584, + "grad_norm": 4.873741149902344, + "learning_rate": 4.5562520620257337e-07, + "loss": 0.9454, + "step": 1745 + }, + { + "epoch": 0.56, + "grad_norm": 3.801036834716797, + "learning_rate": 4.5397558561530847e-07, + "loss": 0.8148, + "step": 1750 + }, + { + "epoch": 0.5616, + "grad_norm": 4.238209247589111, + "learning_rate": 4.5232596502804357e-07, + "loss": 0.907, + "step": 1755 + }, + { + "epoch": 0.5632, + "grad_norm": 5.311016082763672, + "learning_rate": 4.506763444407786e-07, + "loss": 0.8865, + "step": 1760 + }, + { + "epoch": 0.5648, + "grad_norm": 5.096076011657715, + "learning_rate": 4.4902672385351366e-07, + "loss": 0.8967, + "step": 1765 + }, + { + "epoch": 0.5664, + "grad_norm": 3.0391855239868164, + "learning_rate": 4.473771032662487e-07, + "loss": 0.8671, + "step": 1770 + }, + { + "epoch": 0.568, + "grad_norm": 3.2100040912628174, + "learning_rate": 4.457274826789838e-07, + "loss": 0.8144, + "step": 1775 + }, + { + "epoch": 0.5696, + "grad_norm": 4.312873840332031, + "learning_rate": 4.4407786209171886e-07, + "loss": 0.9743, + "step": 1780 + }, + { + "epoch": 0.5712, + "grad_norm": 4.577536582946777, + "learning_rate": 4.4242824150445396e-07, + "loss": 0.8988, + "step": 1785 + }, + { + "epoch": 0.5728, + "grad_norm": 6.181716442108154, + "learning_rate": 4.40778620917189e-07, + "loss": 0.8709, + "step": 1790 + }, + { + "epoch": 0.5744, + "grad_norm": 3.878676176071167, + "learning_rate": 4.391290003299241e-07, + "loss": 0.9212, + "step": 1795 + }, + { + "epoch": 0.576, + "grad_norm": 3.634641170501709, + "learning_rate": 4.374793797426592e-07, + "loss": 0.9252, + "step": 1800 + }, + { + "epoch": 0.5776, + "grad_norm": 4.589493274688721, + "learning_rate": 4.358297591553942e-07, + "loss": 0.9442, + "step": 1805 + }, + { + "epoch": 0.5792, + "grad_norm": 3.5581719875335693, + "learning_rate": 4.341801385681293e-07, + "loss": 0.8068, + "step": 1810 + }, + { + "epoch": 0.5808, + "grad_norm": 10.048519134521484, + "learning_rate": 4.3253051798086436e-07, + "loss": 0.9334, + "step": 1815 + }, + { + "epoch": 0.5824, + "grad_norm": 7.136456489562988, + "learning_rate": 4.3088089739359946e-07, + "loss": 0.8711, + "step": 1820 + }, + { + "epoch": 0.584, + "grad_norm": 12.951844215393066, + "learning_rate": 4.292312768063345e-07, + "loss": 0.8709, + "step": 1825 + }, + { + "epoch": 0.5856, + "grad_norm": 4.377983093261719, + "learning_rate": 4.275816562190696e-07, + "loss": 0.8509, + "step": 1830 + }, + { + "epoch": 0.5872, + "grad_norm": 4.836514472961426, + "learning_rate": 4.259320356318047e-07, + "loss": 0.882, + "step": 1835 + }, + { + "epoch": 0.5888, + "grad_norm": 3.0401344299316406, + "learning_rate": 4.242824150445397e-07, + "loss": 0.9351, + "step": 1840 + }, + { + "epoch": 0.5904, + "grad_norm": 4.854428768157959, + "learning_rate": 4.226327944572748e-07, + "loss": 0.9465, + "step": 1845 + }, + { + "epoch": 0.592, + "grad_norm": 3.092222213745117, + "learning_rate": 4.2098317387000985e-07, + "loss": 0.8038, + "step": 1850 + }, + { + "epoch": 0.5936, + "grad_norm": 5.498143196105957, + "learning_rate": 4.1933355328274495e-07, + "loss": 0.769, + "step": 1855 + }, + { + "epoch": 0.5952, + "grad_norm": 2.4063949584960938, + "learning_rate": 4.1768393269548e-07, + "loss": 0.8057, + "step": 1860 + }, + { + "epoch": 0.5968, + "grad_norm": 5.123895168304443, + "learning_rate": 4.160343121082151e-07, + "loss": 0.9554, + "step": 1865 + }, + { + "epoch": 0.5984, + "grad_norm": 7.29245662689209, + "learning_rate": 4.143846915209502e-07, + "loss": 0.9079, + "step": 1870 + }, + { + "epoch": 0.6, + "grad_norm": 2.9312267303466797, + "learning_rate": 4.1273507093368525e-07, + "loss": 0.8765, + "step": 1875 + }, + { + "epoch": 0.6016, + "grad_norm": 3.0390522480010986, + "learning_rate": 4.110854503464203e-07, + "loss": 0.9539, + "step": 1880 + }, + { + "epoch": 0.6032, + "grad_norm": 3.8350090980529785, + "learning_rate": 4.0943582975915535e-07, + "loss": 0.8397, + "step": 1885 + }, + { + "epoch": 0.6048, + "grad_norm": 3.9119083881378174, + "learning_rate": 4.0778620917189045e-07, + "loss": 0.8769, + "step": 1890 + }, + { + "epoch": 0.6064, + "grad_norm": 3.361199378967285, + "learning_rate": 4.061365885846255e-07, + "loss": 0.9028, + "step": 1895 + }, + { + "epoch": 0.608, + "grad_norm": 4.87637186050415, + "learning_rate": 4.044869679973606e-07, + "loss": 0.8022, + "step": 1900 + }, + { + "epoch": 0.6096, + "grad_norm": 4.546545505523682, + "learning_rate": 4.028373474100957e-07, + "loss": 0.7343, + "step": 1905 + }, + { + "epoch": 0.6112, + "grad_norm": 2.975339651107788, + "learning_rate": 4.0118772682283075e-07, + "loss": 0.8335, + "step": 1910 + }, + { + "epoch": 0.6128, + "grad_norm": 3.8709919452667236, + "learning_rate": 3.995381062355658e-07, + "loss": 0.7803, + "step": 1915 + }, + { + "epoch": 0.6144, + "grad_norm": 2.690919876098633, + "learning_rate": 3.9788848564830084e-07, + "loss": 1.0247, + "step": 1920 + }, + { + "epoch": 0.616, + "grad_norm": 4.163801193237305, + "learning_rate": 3.9623886506103594e-07, + "loss": 0.8764, + "step": 1925 + }, + { + "epoch": 0.6176, + "grad_norm": 5.445613384246826, + "learning_rate": 3.94589244473771e-07, + "loss": 0.9047, + "step": 1930 + }, + { + "epoch": 0.6192, + "grad_norm": 3.3369109630584717, + "learning_rate": 3.929396238865061e-07, + "loss": 0.9644, + "step": 1935 + }, + { + "epoch": 0.6208, + "grad_norm": 2.8063957691192627, + "learning_rate": 3.912900032992412e-07, + "loss": 0.8411, + "step": 1940 + }, + { + "epoch": 0.6224, + "grad_norm": 3.369598865509033, + "learning_rate": 3.8964038271197624e-07, + "loss": 0.8904, + "step": 1945 + }, + { + "epoch": 0.624, + "grad_norm": 11.861967086791992, + "learning_rate": 3.8799076212471134e-07, + "loss": 0.8503, + "step": 1950 + }, + { + "epoch": 0.6256, + "grad_norm": 3.746105670928955, + "learning_rate": 3.8634114153744634e-07, + "loss": 0.8549, + "step": 1955 + }, + { + "epoch": 0.6272, + "grad_norm": 4.717544078826904, + "learning_rate": 3.8469152095018144e-07, + "loss": 0.7668, + "step": 1960 + }, + { + "epoch": 0.6288, + "grad_norm": 3.0035829544067383, + "learning_rate": 3.830419003629165e-07, + "loss": 0.7519, + "step": 1965 + }, + { + "epoch": 0.6304, + "grad_norm": 4.065003395080566, + "learning_rate": 3.813922797756516e-07, + "loss": 0.8116, + "step": 1970 + }, + { + "epoch": 0.632, + "grad_norm": 5.251111030578613, + "learning_rate": 3.7974265918838663e-07, + "loss": 0.8879, + "step": 1975 + }, + { + "epoch": 0.6336, + "grad_norm": 5.612459659576416, + "learning_rate": 3.7809303860112173e-07, + "loss": 0.7931, + "step": 1980 + }, + { + "epoch": 0.6352, + "grad_norm": 4.041755199432373, + "learning_rate": 3.7644341801385684e-07, + "loss": 0.7963, + "step": 1985 + }, + { + "epoch": 0.6368, + "grad_norm": 9.98974609375, + "learning_rate": 3.7479379742659183e-07, + "loss": 0.8962, + "step": 1990 + }, + { + "epoch": 0.6384, + "grad_norm": 3.949065685272217, + "learning_rate": 3.7314417683932693e-07, + "loss": 0.9132, + "step": 1995 + }, + { + "epoch": 0.64, + "grad_norm": 4.0974297523498535, + "learning_rate": 3.71494556252062e-07, + "loss": 0.9125, + "step": 2000 + }, + { + "epoch": 0.6416, + "grad_norm": 3.70499324798584, + "learning_rate": 3.698449356647971e-07, + "loss": 0.7575, + "step": 2005 + }, + { + "epoch": 0.6432, + "grad_norm": 4.345754623413086, + "learning_rate": 3.6819531507753213e-07, + "loss": 0.7473, + "step": 2010 + }, + { + "epoch": 0.6448, + "grad_norm": 2.8242263793945312, + "learning_rate": 3.6654569449026723e-07, + "loss": 0.959, + "step": 2015 + }, + { + "epoch": 0.6464, + "grad_norm": 3.6714463233947754, + "learning_rate": 3.6489607390300233e-07, + "loss": 0.7814, + "step": 2020 + }, + { + "epoch": 0.648, + "grad_norm": 4.4022908210754395, + "learning_rate": 3.632464533157374e-07, + "loss": 0.8977, + "step": 2025 + }, + { + "epoch": 0.6496, + "grad_norm": 3.5451886653900146, + "learning_rate": 3.6159683272847243e-07, + "loss": 0.8835, + "step": 2030 + }, + { + "epoch": 0.6512, + "grad_norm": 5.699954509735107, + "learning_rate": 3.599472121412075e-07, + "loss": 0.8339, + "step": 2035 + }, + { + "epoch": 0.6528, + "grad_norm": 3.2886204719543457, + "learning_rate": 3.582975915539426e-07, + "loss": 0.85, + "step": 2040 + }, + { + "epoch": 0.6544, + "grad_norm": 2.8363375663757324, + "learning_rate": 3.566479709666776e-07, + "loss": 0.8195, + "step": 2045 + }, + { + "epoch": 0.656, + "grad_norm": 3.734877824783325, + "learning_rate": 3.549983503794127e-07, + "loss": 0.8803, + "step": 2050 + }, + { + "epoch": 0.6576, + "grad_norm": 2.6836330890655518, + "learning_rate": 3.533487297921478e-07, + "loss": 0.8574, + "step": 2055 + }, + { + "epoch": 0.6592, + "grad_norm": 3.9296648502349854, + "learning_rate": 3.516991092048829e-07, + "loss": 0.7938, + "step": 2060 + }, + { + "epoch": 0.6608, + "grad_norm": 2.973696231842041, + "learning_rate": 3.500494886176179e-07, + "loss": 0.8593, + "step": 2065 + }, + { + "epoch": 0.6624, + "grad_norm": 4.675530433654785, + "learning_rate": 3.4839986803035297e-07, + "loss": 0.8456, + "step": 2070 + }, + { + "epoch": 0.664, + "grad_norm": 4.891861915588379, + "learning_rate": 3.4675024744308807e-07, + "loss": 0.9814, + "step": 2075 + }, + { + "epoch": 0.6656, + "grad_norm": 3.9921982288360596, + "learning_rate": 3.451006268558231e-07, + "loss": 0.8416, + "step": 2080 + }, + { + "epoch": 0.6672, + "grad_norm": 3.1958041191101074, + "learning_rate": 3.434510062685582e-07, + "loss": 0.905, + "step": 2085 + }, + { + "epoch": 0.6688, + "grad_norm": 4.344924449920654, + "learning_rate": 3.418013856812933e-07, + "loss": 0.8202, + "step": 2090 + }, + { + "epoch": 0.6704, + "grad_norm": 7.5191569328308105, + "learning_rate": 3.4015176509402837e-07, + "loss": 0.9426, + "step": 2095 + }, + { + "epoch": 0.672, + "grad_norm": 4.440326690673828, + "learning_rate": 3.3850214450676347e-07, + "loss": 0.8205, + "step": 2100 + }, + { + "epoch": 0.6736, + "grad_norm": 6.4901123046875, + "learning_rate": 3.3685252391949846e-07, + "loss": 0.7936, + "step": 2105 + }, + { + "epoch": 0.6752, + "grad_norm": 3.9426374435424805, + "learning_rate": 3.3520290333223357e-07, + "loss": 0.793, + "step": 2110 + }, + { + "epoch": 0.6768, + "grad_norm": 5.018584728240967, + "learning_rate": 3.335532827449686e-07, + "loss": 0.7799, + "step": 2115 + }, + { + "epoch": 0.6784, + "grad_norm": 3.7835421562194824, + "learning_rate": 3.319036621577037e-07, + "loss": 0.8696, + "step": 2120 + }, + { + "epoch": 0.68, + "grad_norm": 6.0190839767456055, + "learning_rate": 3.302540415704388e-07, + "loss": 0.9139, + "step": 2125 + }, + { + "epoch": 0.6816, + "grad_norm": 5.751317977905273, + "learning_rate": 3.2860442098317386e-07, + "loss": 0.8522, + "step": 2130 + }, + { + "epoch": 0.6832, + "grad_norm": 6.684688091278076, + "learning_rate": 3.2695480039590896e-07, + "loss": 0.8061, + "step": 2135 + }, + { + "epoch": 0.6848, + "grad_norm": 2.783705234527588, + "learning_rate": 3.2530517980864396e-07, + "loss": 0.7725, + "step": 2140 + }, + { + "epoch": 0.6864, + "grad_norm": 4.636482238769531, + "learning_rate": 3.2365555922137906e-07, + "loss": 0.8479, + "step": 2145 + }, + { + "epoch": 0.688, + "grad_norm": 5.260950565338135, + "learning_rate": 3.220059386341141e-07, + "loss": 1.0717, + "step": 2150 + }, + { + "epoch": 0.6896, + "grad_norm": 5.191953659057617, + "learning_rate": 3.203563180468492e-07, + "loss": 0.8714, + "step": 2155 + }, + { + "epoch": 0.6912, + "grad_norm": 7.279730796813965, + "learning_rate": 3.187066974595843e-07, + "loss": 0.8246, + "step": 2160 + }, + { + "epoch": 0.6928, + "grad_norm": 2.966627359390259, + "learning_rate": 3.1705707687231936e-07, + "loss": 0.9053, + "step": 2165 + }, + { + "epoch": 0.6944, + "grad_norm": 8.789515495300293, + "learning_rate": 3.1540745628505446e-07, + "loss": 0.7851, + "step": 2170 + }, + { + "epoch": 0.696, + "grad_norm": 2.929105520248413, + "learning_rate": 3.137578356977895e-07, + "loss": 0.956, + "step": 2175 + }, + { + "epoch": 0.6976, + "grad_norm": 5.6356096267700195, + "learning_rate": 3.1210821511052456e-07, + "loss": 0.7946, + "step": 2180 + }, + { + "epoch": 0.6992, + "grad_norm": 3.3033862113952637, + "learning_rate": 3.104585945232596e-07, + "loss": 0.884, + "step": 2185 + }, + { + "epoch": 0.7008, + "grad_norm": 5.996482849121094, + "learning_rate": 3.088089739359947e-07, + "loss": 0.8342, + "step": 2190 + }, + { + "epoch": 0.7024, + "grad_norm": 7.644280910491943, + "learning_rate": 3.0715935334872975e-07, + "loss": 0.8575, + "step": 2195 + }, + { + "epoch": 0.704, + "grad_norm": 5.780369281768799, + "learning_rate": 3.0550973276146485e-07, + "loss": 0.84, + "step": 2200 + }, + { + "epoch": 0.7056, + "grad_norm": 3.7677314281463623, + "learning_rate": 3.0386011217419995e-07, + "loss": 0.8764, + "step": 2205 + }, + { + "epoch": 0.7072, + "grad_norm": 4.153870105743408, + "learning_rate": 3.02210491586935e-07, + "loss": 0.8447, + "step": 2210 + }, + { + "epoch": 0.7088, + "grad_norm": 6.395594120025635, + "learning_rate": 3.0056087099967005e-07, + "loss": 0.8499, + "step": 2215 + }, + { + "epoch": 0.7104, + "grad_norm": 3.4210963249206543, + "learning_rate": 2.989112504124051e-07, + "loss": 0.7547, + "step": 2220 + }, + { + "epoch": 0.712, + "grad_norm": 2.710740327835083, + "learning_rate": 2.972616298251402e-07, + "loss": 0.8286, + "step": 2225 + }, + { + "epoch": 0.7136, + "grad_norm": 5.014111042022705, + "learning_rate": 2.9561200923787525e-07, + "loss": 0.8706, + "step": 2230 + }, + { + "epoch": 0.7152, + "grad_norm": 3.8330109119415283, + "learning_rate": 2.9396238865061035e-07, + "loss": 0.8221, + "step": 2235 + }, + { + "epoch": 0.7168, + "grad_norm": 5.695978164672852, + "learning_rate": 2.9231276806334545e-07, + "loss": 0.8412, + "step": 2240 + }, + { + "epoch": 0.7184, + "grad_norm": 5.974388599395752, + "learning_rate": 2.906631474760805e-07, + "loss": 0.8235, + "step": 2245 + }, + { + "epoch": 0.72, + "grad_norm": 2.9334166049957275, + "learning_rate": 2.890135268888156e-07, + "loss": 0.8965, + "step": 2250 + }, + { + "epoch": 0.7216, + "grad_norm": 8.407828330993652, + "learning_rate": 2.873639063015506e-07, + "loss": 0.8991, + "step": 2255 + }, + { + "epoch": 0.7232, + "grad_norm": 4.443752765655518, + "learning_rate": 2.857142857142857e-07, + "loss": 0.7565, + "step": 2260 + }, + { + "epoch": 0.7248, + "grad_norm": 6.351187229156494, + "learning_rate": 2.8406466512702074e-07, + "loss": 0.8202, + "step": 2265 + }, + { + "epoch": 0.7264, + "grad_norm": 4.715820789337158, + "learning_rate": 2.8241504453975584e-07, + "loss": 0.877, + "step": 2270 + }, + { + "epoch": 0.728, + "grad_norm": 3.1347246170043945, + "learning_rate": 2.8076542395249094e-07, + "loss": 0.8227, + "step": 2275 + }, + { + "epoch": 0.7296, + "grad_norm": 3.8322551250457764, + "learning_rate": 2.79115803365226e-07, + "loss": 0.7839, + "step": 2280 + }, + { + "epoch": 0.7312, + "grad_norm": 4.289877414703369, + "learning_rate": 2.774661827779611e-07, + "loss": 0.7952, + "step": 2285 + }, + { + "epoch": 0.7328, + "grad_norm": 3.775768995285034, + "learning_rate": 2.758165621906961e-07, + "loss": 0.7924, + "step": 2290 + }, + { + "epoch": 0.7344, + "grad_norm": 4.233770370483398, + "learning_rate": 2.741669416034312e-07, + "loss": 0.8008, + "step": 2295 + }, + { + "epoch": 0.736, + "grad_norm": 5.131399154663086, + "learning_rate": 2.7251732101616624e-07, + "loss": 0.8103, + "step": 2300 + }, + { + "epoch": 0.7376, + "grad_norm": 7.184566497802734, + "learning_rate": 2.7086770042890134e-07, + "loss": 0.9024, + "step": 2305 + }, + { + "epoch": 0.7392, + "grad_norm": 3.6044952869415283, + "learning_rate": 2.6921807984163644e-07, + "loss": 0.8925, + "step": 2310 + }, + { + "epoch": 0.7408, + "grad_norm": 8.124959945678711, + "learning_rate": 2.675684592543715e-07, + "loss": 0.8227, + "step": 2315 + }, + { + "epoch": 0.7424, + "grad_norm": 5.050447463989258, + "learning_rate": 2.659188386671066e-07, + "loss": 0.87, + "step": 2320 + }, + { + "epoch": 0.744, + "grad_norm": 3.2532646656036377, + "learning_rate": 2.6426921807984164e-07, + "loss": 0.9024, + "step": 2325 + }, + { + "epoch": 0.7456, + "grad_norm": 7.244692325592041, + "learning_rate": 2.626195974925767e-07, + "loss": 0.7402, + "step": 2330 + }, + { + "epoch": 0.7472, + "grad_norm": 5.4176435470581055, + "learning_rate": 2.6096997690531173e-07, + "loss": 0.9476, + "step": 2335 + }, + { + "epoch": 0.7488, + "grad_norm": 13.94157886505127, + "learning_rate": 2.5932035631804683e-07, + "loss": 0.7621, + "step": 2340 + }, + { + "epoch": 0.7504, + "grad_norm": 2.509117603302002, + "learning_rate": 2.5767073573078193e-07, + "loss": 0.7259, + "step": 2345 + }, + { + "epoch": 0.752, + "grad_norm": 3.073138952255249, + "learning_rate": 2.56021115143517e-07, + "loss": 0.8224, + "step": 2350 + }, + { + "epoch": 0.7536, + "grad_norm": 3.9155077934265137, + "learning_rate": 2.543714945562521e-07, + "loss": 0.8596, + "step": 2355 + }, + { + "epoch": 0.7552, + "grad_norm": 6.405920028686523, + "learning_rate": 2.5272187396898713e-07, + "loss": 0.8152, + "step": 2360 + }, + { + "epoch": 0.7568, + "grad_norm": 3.8203928470611572, + "learning_rate": 2.510722533817222e-07, + "loss": 0.8907, + "step": 2365 + }, + { + "epoch": 0.7584, + "grad_norm": 3.9368674755096436, + "learning_rate": 2.494226327944573e-07, + "loss": 0.8804, + "step": 2370 + }, + { + "epoch": 0.76, + "grad_norm": 4.452835559844971, + "learning_rate": 2.4777301220719233e-07, + "loss": 0.8304, + "step": 2375 + }, + { + "epoch": 0.7616, + "grad_norm": 4.987030982971191, + "learning_rate": 2.461233916199274e-07, + "loss": 0.9112, + "step": 2380 + }, + { + "epoch": 0.7632, + "grad_norm": 7.84393310546875, + "learning_rate": 2.444737710326625e-07, + "loss": 0.8584, + "step": 2385 + }, + { + "epoch": 0.7648, + "grad_norm": 3.063011646270752, + "learning_rate": 2.428241504453976e-07, + "loss": 0.8648, + "step": 2390 + }, + { + "epoch": 0.7664, + "grad_norm": 5.494943618774414, + "learning_rate": 2.411745298581326e-07, + "loss": 0.8633, + "step": 2395 + }, + { + "epoch": 0.768, + "grad_norm": 3.209425449371338, + "learning_rate": 2.3952490927086767e-07, + "loss": 0.7618, + "step": 2400 + }, + { + "epoch": 0.7696, + "grad_norm": 3.163612127304077, + "learning_rate": 2.3787528868360277e-07, + "loss": 0.7093, + "step": 2405 + }, + { + "epoch": 0.7712, + "grad_norm": 4.022956848144531, + "learning_rate": 2.3622566809633785e-07, + "loss": 0.7705, + "step": 2410 + }, + { + "epoch": 0.7728, + "grad_norm": 3.748598575592041, + "learning_rate": 2.345760475090729e-07, + "loss": 0.942, + "step": 2415 + }, + { + "epoch": 0.7744, + "grad_norm": 3.5295920372009277, + "learning_rate": 2.3292642692180797e-07, + "loss": 0.8094, + "step": 2420 + }, + { + "epoch": 0.776, + "grad_norm": 5.767539978027344, + "learning_rate": 2.3127680633454305e-07, + "loss": 0.8425, + "step": 2425 + }, + { + "epoch": 0.7776, + "grad_norm": 3.4246678352355957, + "learning_rate": 2.2962718574727812e-07, + "loss": 0.7777, + "step": 2430 + }, + { + "epoch": 0.7792, + "grad_norm": 3.2887349128723145, + "learning_rate": 2.2797756516001317e-07, + "loss": 0.8842, + "step": 2435 + }, + { + "epoch": 0.7808, + "grad_norm": 4.332666397094727, + "learning_rate": 2.2632794457274827e-07, + "loss": 0.8946, + "step": 2440 + }, + { + "epoch": 0.7824, + "grad_norm": 3.6099178791046143, + "learning_rate": 2.2467832398548334e-07, + "loss": 0.929, + "step": 2445 + }, + { + "epoch": 0.784, + "grad_norm": 2.9944612979888916, + "learning_rate": 2.230287033982184e-07, + "loss": 0.9407, + "step": 2450 + }, + { + "epoch": 0.7856, + "grad_norm": 4.5248613357543945, + "learning_rate": 2.2137908281095347e-07, + "loss": 0.9392, + "step": 2455 + }, + { + "epoch": 0.7872, + "grad_norm": 2.7737669944763184, + "learning_rate": 2.1972946222368854e-07, + "loss": 0.627, + "step": 2460 + }, + { + "epoch": 0.7888, + "grad_norm": 2.95540452003479, + "learning_rate": 2.1807984163642361e-07, + "loss": 0.7346, + "step": 2465 + }, + { + "epoch": 0.7904, + "grad_norm": 8.727768898010254, + "learning_rate": 2.1643022104915866e-07, + "loss": 0.9009, + "step": 2470 + }, + { + "epoch": 0.792, + "grad_norm": 4.173437118530273, + "learning_rate": 2.1478060046189376e-07, + "loss": 0.8968, + "step": 2475 + }, + { + "epoch": 0.7936, + "grad_norm": 3.4933767318725586, + "learning_rate": 2.1313097987462884e-07, + "loss": 0.868, + "step": 2480 + }, + { + "epoch": 0.7952, + "grad_norm": 3.33754301071167, + "learning_rate": 2.114813592873639e-07, + "loss": 0.8185, + "step": 2485 + }, + { + "epoch": 0.7968, + "grad_norm": 7.453437328338623, + "learning_rate": 2.0983173870009896e-07, + "loss": 0.8654, + "step": 2490 + }, + { + "epoch": 0.7984, + "grad_norm": 9.287111282348633, + "learning_rate": 2.0818211811283404e-07, + "loss": 0.7744, + "step": 2495 + }, + { + "epoch": 0.8, + "grad_norm": 4.195357322692871, + "learning_rate": 2.065324975255691e-07, + "loss": 0.8295, + "step": 2500 + }, + { + "epoch": 0.8016, + "grad_norm": 4.878857612609863, + "learning_rate": 2.0488287693830418e-07, + "loss": 0.7917, + "step": 2505 + }, + { + "epoch": 0.8032, + "grad_norm": 3.239182710647583, + "learning_rate": 2.0323325635103923e-07, + "loss": 0.8339, + "step": 2510 + }, + { + "epoch": 0.8048, + "grad_norm": 4.196946144104004, + "learning_rate": 2.0158363576377433e-07, + "loss": 0.9609, + "step": 2515 + }, + { + "epoch": 0.8064, + "grad_norm": 2.7448809146881104, + "learning_rate": 1.999340151765094e-07, + "loss": 0.8516, + "step": 2520 + }, + { + "epoch": 0.808, + "grad_norm": 4.893881320953369, + "learning_rate": 1.9828439458924446e-07, + "loss": 0.7908, + "step": 2525 + }, + { + "epoch": 0.8096, + "grad_norm": 3.318279266357422, + "learning_rate": 1.9663477400197953e-07, + "loss": 0.7966, + "step": 2530 + }, + { + "epoch": 0.8112, + "grad_norm": 2.901827573776245, + "learning_rate": 1.949851534147146e-07, + "loss": 0.7831, + "step": 2535 + }, + { + "epoch": 0.8128, + "grad_norm": 5.1762847900390625, + "learning_rate": 1.9333553282744968e-07, + "loss": 0.9897, + "step": 2540 + }, + { + "epoch": 0.8144, + "grad_norm": 6.721929550170898, + "learning_rate": 1.9168591224018473e-07, + "loss": 0.8774, + "step": 2545 + }, + { + "epoch": 0.816, + "grad_norm": 3.2159762382507324, + "learning_rate": 1.9003629165291983e-07, + "loss": 0.8543, + "step": 2550 + }, + { + "epoch": 0.8176, + "grad_norm": 2.2698450088500977, + "learning_rate": 1.883866710656549e-07, + "loss": 0.7665, + "step": 2555 + }, + { + "epoch": 0.8192, + "grad_norm": 6.745720386505127, + "learning_rate": 1.8673705047838998e-07, + "loss": 0.8497, + "step": 2560 + }, + { + "epoch": 0.8208, + "grad_norm": 7.653261661529541, + "learning_rate": 1.8508742989112503e-07, + "loss": 0.8156, + "step": 2565 + }, + { + "epoch": 0.8224, + "grad_norm": 5.070962905883789, + "learning_rate": 1.834378093038601e-07, + "loss": 0.8599, + "step": 2570 + }, + { + "epoch": 0.824, + "grad_norm": 3.101536750793457, + "learning_rate": 1.8178818871659517e-07, + "loss": 0.9103, + "step": 2575 + }, + { + "epoch": 0.8256, + "grad_norm": 5.420032024383545, + "learning_rate": 1.8013856812933025e-07, + "loss": 0.8812, + "step": 2580 + }, + { + "epoch": 0.8272, + "grad_norm": 6.0531697273254395, + "learning_rate": 1.7848894754206532e-07, + "loss": 0.9167, + "step": 2585 + }, + { + "epoch": 0.8288, + "grad_norm": 7.951639175415039, + "learning_rate": 1.768393269548004e-07, + "loss": 0.958, + "step": 2590 + }, + { + "epoch": 0.8304, + "grad_norm": 3.8448524475097656, + "learning_rate": 1.7518970636753547e-07, + "loss": 0.9107, + "step": 2595 + }, + { + "epoch": 0.832, + "grad_norm": 6.432617664337158, + "learning_rate": 1.7354008578027052e-07, + "loss": 0.9313, + "step": 2600 + }, + { + "epoch": 0.8336, + "grad_norm": 6.240530967712402, + "learning_rate": 1.718904651930056e-07, + "loss": 0.8152, + "step": 2605 + }, + { + "epoch": 0.8352, + "grad_norm": 3.8396613597869873, + "learning_rate": 1.7024084460574067e-07, + "loss": 0.8105, + "step": 2610 + }, + { + "epoch": 0.8368, + "grad_norm": 5.357729434967041, + "learning_rate": 1.6859122401847574e-07, + "loss": 0.8994, + "step": 2615 + }, + { + "epoch": 0.8384, + "grad_norm": 3.711209535598755, + "learning_rate": 1.669416034312108e-07, + "loss": 0.8206, + "step": 2620 + }, + { + "epoch": 0.84, + "grad_norm": 8.217768669128418, + "learning_rate": 1.652919828439459e-07, + "loss": 0.8569, + "step": 2625 + }, + { + "epoch": 0.8416, + "grad_norm": 3.3683290481567383, + "learning_rate": 1.6364236225668097e-07, + "loss": 0.8385, + "step": 2630 + }, + { + "epoch": 0.8432, + "grad_norm": 5.228672981262207, + "learning_rate": 1.6199274166941604e-07, + "loss": 0.9177, + "step": 2635 + }, + { + "epoch": 0.8448, + "grad_norm": 4.824789524078369, + "learning_rate": 1.603431210821511e-07, + "loss": 0.8195, + "step": 2640 + }, + { + "epoch": 0.8464, + "grad_norm": 3.6725375652313232, + "learning_rate": 1.5869350049488616e-07, + "loss": 0.8017, + "step": 2645 + }, + { + "epoch": 0.848, + "grad_norm": 3.130878210067749, + "learning_rate": 1.5704387990762124e-07, + "loss": 0.8088, + "step": 2650 + }, + { + "epoch": 0.8496, + "grad_norm": 4.449658393859863, + "learning_rate": 1.553942593203563e-07, + "loss": 0.7835, + "step": 2655 + }, + { + "epoch": 0.8512, + "grad_norm": 4.97245454788208, + "learning_rate": 1.537446387330914e-07, + "loss": 0.837, + "step": 2660 + }, + { + "epoch": 0.8528, + "grad_norm": 16.94793128967285, + "learning_rate": 1.5209501814582646e-07, + "loss": 0.8113, + "step": 2665 + }, + { + "epoch": 0.8544, + "grad_norm": 4.743756294250488, + "learning_rate": 1.5044539755856154e-07, + "loss": 0.8042, + "step": 2670 + }, + { + "epoch": 0.856, + "grad_norm": 10.094191551208496, + "learning_rate": 1.4879577697129658e-07, + "loss": 0.8571, + "step": 2675 + }, + { + "epoch": 0.8576, + "grad_norm": 5.925148010253906, + "learning_rate": 1.4714615638403166e-07, + "loss": 0.8749, + "step": 2680 + }, + { + "epoch": 0.8592, + "grad_norm": 3.691056251525879, + "learning_rate": 1.4549653579676673e-07, + "loss": 0.8824, + "step": 2685 + }, + { + "epoch": 0.8608, + "grad_norm": 3.58223295211792, + "learning_rate": 1.438469152095018e-07, + "loss": 0.8761, + "step": 2690 + }, + { + "epoch": 0.8624, + "grad_norm": 2.6448755264282227, + "learning_rate": 1.4219729462223686e-07, + "loss": 0.8868, + "step": 2695 + }, + { + "epoch": 0.864, + "grad_norm": 3.1470277309417725, + "learning_rate": 1.4054767403497196e-07, + "loss": 0.738, + "step": 2700 + }, + { + "epoch": 0.8656, + "grad_norm": 2.670072317123413, + "learning_rate": 1.3889805344770703e-07, + "loss": 0.834, + "step": 2705 + }, + { + "epoch": 0.8672, + "grad_norm": 3.6198477745056152, + "learning_rate": 1.372484328604421e-07, + "loss": 0.866, + "step": 2710 + }, + { + "epoch": 0.8688, + "grad_norm": 3.4495441913604736, + "learning_rate": 1.3559881227317715e-07, + "loss": 0.8393, + "step": 2715 + }, + { + "epoch": 0.8704, + "grad_norm": 4.266736030578613, + "learning_rate": 1.3394919168591223e-07, + "loss": 0.6669, + "step": 2720 + }, + { + "epoch": 0.872, + "grad_norm": 7.86549711227417, + "learning_rate": 1.322995710986473e-07, + "loss": 0.8842, + "step": 2725 + }, + { + "epoch": 0.8736, + "grad_norm": 9.990023612976074, + "learning_rate": 1.3064995051138238e-07, + "loss": 0.8435, + "step": 2730 + }, + { + "epoch": 0.8752, + "grad_norm": 5.904709815979004, + "learning_rate": 1.2900032992411745e-07, + "loss": 0.8569, + "step": 2735 + }, + { + "epoch": 0.8768, + "grad_norm": 3.9784185886383057, + "learning_rate": 1.2735070933685253e-07, + "loss": 0.8613, + "step": 2740 + }, + { + "epoch": 0.8784, + "grad_norm": 3.423604965209961, + "learning_rate": 1.257010887495876e-07, + "loss": 0.8349, + "step": 2745 + }, + { + "epoch": 0.88, + "grad_norm": 7.008749961853027, + "learning_rate": 1.2405146816232267e-07, + "loss": 0.739, + "step": 2750 + }, + { + "epoch": 0.8816, + "grad_norm": 4.69740104675293, + "learning_rate": 1.2240184757505772e-07, + "loss": 0.8429, + "step": 2755 + }, + { + "epoch": 0.8832, + "grad_norm": 11.424933433532715, + "learning_rate": 1.207522269877928e-07, + "loss": 0.8223, + "step": 2760 + }, + { + "epoch": 0.8848, + "grad_norm": 9.028188705444336, + "learning_rate": 1.1910260640052787e-07, + "loss": 0.8795, + "step": 2765 + }, + { + "epoch": 0.8864, + "grad_norm": 4.878159999847412, + "learning_rate": 1.1745298581326295e-07, + "loss": 0.8503, + "step": 2770 + }, + { + "epoch": 0.888, + "grad_norm": 4.5786237716674805, + "learning_rate": 1.1580336522599801e-07, + "loss": 0.7894, + "step": 2775 + }, + { + "epoch": 0.8896, + "grad_norm": 5.562559127807617, + "learning_rate": 1.141537446387331e-07, + "loss": 0.9339, + "step": 2780 + }, + { + "epoch": 0.8912, + "grad_norm": 5.035255432128906, + "learning_rate": 1.1250412405146816e-07, + "loss": 0.9041, + "step": 2785 + }, + { + "epoch": 0.8928, + "grad_norm": 3.3735504150390625, + "learning_rate": 1.1085450346420323e-07, + "loss": 0.8033, + "step": 2790 + }, + { + "epoch": 0.8944, + "grad_norm": 3.345130443572998, + "learning_rate": 1.0920488287693829e-07, + "loss": 0.7649, + "step": 2795 + }, + { + "epoch": 0.896, + "grad_norm": 2.9101784229278564, + "learning_rate": 1.0755526228967338e-07, + "loss": 0.7702, + "step": 2800 + }, + { + "epoch": 0.8976, + "grad_norm": 8.602867126464844, + "learning_rate": 1.0590564170240844e-07, + "loss": 0.8769, + "step": 2805 + }, + { + "epoch": 0.8992, + "grad_norm": 3.7691822052001953, + "learning_rate": 1.042560211151435e-07, + "loss": 0.7437, + "step": 2810 + }, + { + "epoch": 0.9008, + "grad_norm": 3.548344612121582, + "learning_rate": 1.0260640052787859e-07, + "loss": 0.7927, + "step": 2815 + }, + { + "epoch": 0.9024, + "grad_norm": 6.130397319793701, + "learning_rate": 1.0095677994061365e-07, + "loss": 0.8964, + "step": 2820 + }, + { + "epoch": 0.904, + "grad_norm": 6.0678935050964355, + "learning_rate": 9.930715935334873e-08, + "loss": 0.8846, + "step": 2825 + }, + { + "epoch": 0.9056, + "grad_norm": 3.202853202819824, + "learning_rate": 9.765753876608379e-08, + "loss": 0.7872, + "step": 2830 + }, + { + "epoch": 0.9072, + "grad_norm": 7.076948165893555, + "learning_rate": 9.600791817881887e-08, + "loss": 0.841, + "step": 2835 + }, + { + "epoch": 0.9088, + "grad_norm": 4.527687072753906, + "learning_rate": 9.435829759155394e-08, + "loss": 0.8376, + "step": 2840 + }, + { + "epoch": 0.9104, + "grad_norm": 3.8655998706817627, + "learning_rate": 9.270867700428901e-08, + "loss": 0.9163, + "step": 2845 + }, + { + "epoch": 0.912, + "grad_norm": 4.189991474151611, + "learning_rate": 9.105905641702407e-08, + "loss": 0.9171, + "step": 2850 + }, + { + "epoch": 0.9136, + "grad_norm": 3.6782002449035645, + "learning_rate": 8.940943582975916e-08, + "loss": 0.8107, + "step": 2855 + }, + { + "epoch": 0.9152, + "grad_norm": 3.8793959617614746, + "learning_rate": 8.775981524249422e-08, + "loss": 0.8324, + "step": 2860 + }, + { + "epoch": 0.9168, + "grad_norm": 6.360919952392578, + "learning_rate": 8.61101946552293e-08, + "loss": 0.901, + "step": 2865 + }, + { + "epoch": 0.9184, + "grad_norm": 4.218044757843018, + "learning_rate": 8.446057406796437e-08, + "loss": 0.7871, + "step": 2870 + }, + { + "epoch": 0.92, + "grad_norm": 6.177008628845215, + "learning_rate": 8.281095348069944e-08, + "loss": 0.841, + "step": 2875 + }, + { + "epoch": 0.9216, + "grad_norm": 2.739051103591919, + "learning_rate": 8.11613328934345e-08, + "loss": 0.7872, + "step": 2880 + }, + { + "epoch": 0.9232, + "grad_norm": 8.707544326782227, + "learning_rate": 7.951171230616957e-08, + "loss": 0.9274, + "step": 2885 + }, + { + "epoch": 0.9248, + "grad_norm": 4.484316825866699, + "learning_rate": 7.786209171890465e-08, + "loss": 0.9121, + "step": 2890 + }, + { + "epoch": 0.9264, + "grad_norm": 3.211519479751587, + "learning_rate": 7.621247113163972e-08, + "loss": 0.8737, + "step": 2895 + }, + { + "epoch": 0.928, + "grad_norm": 5.020310878753662, + "learning_rate": 7.456285054437479e-08, + "loss": 0.8358, + "step": 2900 + }, + { + "epoch": 0.9296, + "grad_norm": 5.312314510345459, + "learning_rate": 7.291322995710985e-08, + "loss": 0.831, + "step": 2905 + }, + { + "epoch": 0.9312, + "grad_norm": 2.917203903198242, + "learning_rate": 7.126360936984494e-08, + "loss": 0.8756, + "step": 2910 + }, + { + "epoch": 0.9328, + "grad_norm": 3.924370288848877, + "learning_rate": 6.961398878258e-08, + "loss": 0.7825, + "step": 2915 + }, + { + "epoch": 0.9344, + "grad_norm": 3.571991205215454, + "learning_rate": 6.796436819531507e-08, + "loss": 0.807, + "step": 2920 + }, + { + "epoch": 0.936, + "grad_norm": 5.816591739654541, + "learning_rate": 6.631474760805014e-08, + "loss": 0.8491, + "step": 2925 + }, + { + "epoch": 0.9376, + "grad_norm": 2.8010520935058594, + "learning_rate": 6.466512702078522e-08, + "loss": 0.9383, + "step": 2930 + }, + { + "epoch": 0.9392, + "grad_norm": 4.16404914855957, + "learning_rate": 6.301550643352028e-08, + "loss": 0.9048, + "step": 2935 + }, + { + "epoch": 0.9408, + "grad_norm": 3.1094634532928467, + "learning_rate": 6.136588584625536e-08, + "loss": 0.8829, + "step": 2940 + }, + { + "epoch": 0.9424, + "grad_norm": 6.206966400146484, + "learning_rate": 5.971626525899043e-08, + "loss": 0.8568, + "step": 2945 + }, + { + "epoch": 0.944, + "grad_norm": 3.327371120452881, + "learning_rate": 5.80666446717255e-08, + "loss": 0.8895, + "step": 2950 + }, + { + "epoch": 0.9456, + "grad_norm": 3.068650722503662, + "learning_rate": 5.641702408446057e-08, + "loss": 0.8451, + "step": 2955 + }, + { + "epoch": 0.9472, + "grad_norm": 18.251916885375977, + "learning_rate": 5.4767403497195644e-08, + "loss": 0.8896, + "step": 2960 + }, + { + "epoch": 0.9488, + "grad_norm": 6.762292385101318, + "learning_rate": 5.311778290993071e-08, + "loss": 0.775, + "step": 2965 + }, + { + "epoch": 0.9504, + "grad_norm": 3.4393362998962402, + "learning_rate": 5.1468162322665786e-08, + "loss": 0.8701, + "step": 2970 + }, + { + "epoch": 0.952, + "grad_norm": 3.25495982170105, + "learning_rate": 4.9818541735400854e-08, + "loss": 0.9626, + "step": 2975 + }, + { + "epoch": 0.9536, + "grad_norm": 3.750603437423706, + "learning_rate": 4.816892114813593e-08, + "loss": 0.8311, + "step": 2980 + }, + { + "epoch": 0.9552, + "grad_norm": 8.03615951538086, + "learning_rate": 4.6519300560871e-08, + "loss": 0.8256, + "step": 2985 + }, + { + "epoch": 0.9568, + "grad_norm": 3.2003724575042725, + "learning_rate": 4.486967997360607e-08, + "loss": 0.8626, + "step": 2990 + }, + { + "epoch": 0.9584, + "grad_norm": 4.359886169433594, + "learning_rate": 4.3220059386341145e-08, + "loss": 0.8756, + "step": 2995 + }, + { + "epoch": 0.96, + "grad_norm": 6.897675514221191, + "learning_rate": 4.1570438799076207e-08, + "loss": 0.8767, + "step": 3000 + }, + { + "epoch": 0.9616, + "grad_norm": 3.0032472610473633, + "learning_rate": 3.992081821181128e-08, + "loss": 0.9283, + "step": 3005 + }, + { + "epoch": 0.9632, + "grad_norm": 4.568953037261963, + "learning_rate": 3.827119762454635e-08, + "loss": 0.6707, + "step": 3010 + }, + { + "epoch": 0.9648, + "grad_norm": 6.175788879394531, + "learning_rate": 3.6621577037281423e-08, + "loss": 0.7988, + "step": 3015 + }, + { + "epoch": 0.9664, + "grad_norm": 6.2108354568481445, + "learning_rate": 3.497195645001649e-08, + "loss": 0.8649, + "step": 3020 + }, + { + "epoch": 0.968, + "grad_norm": 5.617148399353027, + "learning_rate": 3.3322335862751566e-08, + "loss": 0.8005, + "step": 3025 + }, + { + "epoch": 0.9696, + "grad_norm": 2.560255765914917, + "learning_rate": 3.1672715275486634e-08, + "loss": 0.8828, + "step": 3030 + }, + { + "epoch": 0.9712, + "grad_norm": 3.895402669906616, + "learning_rate": 3.002309468822171e-08, + "loss": 0.9119, + "step": 3035 + }, + { + "epoch": 0.9728, + "grad_norm": 3.2274169921875, + "learning_rate": 2.837347410095678e-08, + "loss": 0.9227, + "step": 3040 + }, + { + "epoch": 0.9744, + "grad_norm": 3.9759535789489746, + "learning_rate": 2.672385351369185e-08, + "loss": 0.7694, + "step": 3045 + }, + { + "epoch": 0.976, + "grad_norm": 5.453105926513672, + "learning_rate": 2.507423292642692e-08, + "loss": 0.8129, + "step": 3050 + }, + { + "epoch": 0.9776, + "grad_norm": 3.8954710960388184, + "learning_rate": 2.342461233916199e-08, + "loss": 0.8646, + "step": 3055 + }, + { + "epoch": 0.9792, + "grad_norm": 5.158627033233643, + "learning_rate": 2.177499175189706e-08, + "loss": 0.873, + "step": 3060 + }, + { + "epoch": 0.9808, + "grad_norm": 3.405482769012451, + "learning_rate": 2.0125371164632132e-08, + "loss": 0.6943, + "step": 3065 + }, + { + "epoch": 0.9824, + "grad_norm": 4.411757469177246, + "learning_rate": 1.8475750577367203e-08, + "loss": 0.8178, + "step": 3070 + }, + { + "epoch": 0.984, + "grad_norm": 3.8500680923461914, + "learning_rate": 1.6826129990102277e-08, + "loss": 0.8075, + "step": 3075 + }, + { + "epoch": 0.9856, + "grad_norm": 9.081463813781738, + "learning_rate": 1.517650940283735e-08, + "loss": 0.8583, + "step": 3080 + }, + { + "epoch": 0.9872, + "grad_norm": 3.0819270610809326, + "learning_rate": 1.3526888815572416e-08, + "loss": 0.7818, + "step": 3085 + }, + { + "epoch": 0.9888, + "grad_norm": 12.406457901000977, + "learning_rate": 1.187726822830749e-08, + "loss": 0.9238, + "step": 3090 + }, + { + "epoch": 0.9904, + "grad_norm": 4.571506977081299, + "learning_rate": 1.022764764104256e-08, + "loss": 0.788, + "step": 3095 + }, + { + "epoch": 0.992, + "grad_norm": 2.9352900981903076, + "learning_rate": 8.578027053777632e-09, + "loss": 0.9279, + "step": 3100 + }, + { + "epoch": 0.9936, + "grad_norm": 4.189651966094971, + "learning_rate": 6.928406466512702e-09, + "loss": 0.8379, + "step": 3105 + }, + { + "epoch": 0.9952, + "grad_norm": 3.8705317974090576, + "learning_rate": 5.278785879247773e-09, + "loss": 0.778, + "step": 3110 + }, + { + "epoch": 0.9968, + "grad_norm": 2.701988458633423, + "learning_rate": 3.629165291982844e-09, + "loss": 0.9363, + "step": 3115 + }, + { + "epoch": 0.9984, + "grad_norm": 3.2893664836883545, + "learning_rate": 1.9795447047179146e-09, + "loss": 0.7902, + "step": 3120 + }, + { + "epoch": 1.0, + "grad_norm": 4.953529357910156, + "learning_rate": 3.299241174529858e-10, + "loss": 0.7962, + "step": 3125 + }, + { + "epoch": 1.0, + "step": 3125, + "total_flos": 1.0275244834155397e+18, + "train_loss": 0.8923269732666016, + "train_runtime": 7058.0642, + "train_samples_per_second": 7.084, + "train_steps_per_second": 0.443 + } + ], + "logging_steps": 5, + "max_steps": 3125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0275244834155397e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}