{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 3125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0016,
      "grad_norm": 9.999629974365234,
      "learning_rate": 4.25531914893617e-08,
      "loss": 1.0338,
      "step": 5
    },
    {
      "epoch": 0.0032,
      "grad_norm": 7.134084224700928,
      "learning_rate": 9.574468085106382e-08,
      "loss": 1.1826,
      "step": 10
    },
    {
      "epoch": 0.0048,
      "grad_norm": 5.787372589111328,
      "learning_rate": 1.4893617021276595e-07,
      "loss": 1.145,
      "step": 15
    },
    {
      "epoch": 0.0064,
      "grad_norm": 13.603924751281738,
      "learning_rate": 2.0212765957446807e-07,
      "loss": 1.1501,
      "step": 20
    },
    {
      "epoch": 0.008,
      "grad_norm": 13.660343170166016,
      "learning_rate": 2.5531914893617016e-07,
      "loss": 1.1397,
      "step": 25
    },
    {
      "epoch": 0.0096,
      "grad_norm": 5.391168594360352,
      "learning_rate": 3.085106382978723e-07,
      "loss": 1.0871,
      "step": 30
    },
    {
      "epoch": 0.0112,
      "grad_norm": 9.299227714538574,
      "learning_rate": 3.617021276595745e-07,
      "loss": 1.2235,
      "step": 35
    },
    {
      "epoch": 0.0128,
      "grad_norm": 4.13670539855957,
      "learning_rate": 4.148936170212766e-07,
      "loss": 1.0978,
      "step": 40
    },
    {
      "epoch": 0.0144,
      "grad_norm": 5.579084396362305,
      "learning_rate": 4.6808510638297873e-07,
      "loss": 1.2814,
      "step": 45
    },
    {
      "epoch": 0.016,
      "grad_norm": 13.503495216369629,
      "learning_rate": 5.212765957446809e-07,
      "loss": 0.9962,
      "step": 50
    },
    {
      "epoch": 0.0176,
      "grad_norm": 16.958608627319336,
      "learning_rate": 5.74468085106383e-07,
      "loss": 1.0797,
      "step": 55
    },
    {
      "epoch": 0.0192,
      "grad_norm": 8.709441184997559,
      "learning_rate": 6.276595744680851e-07,
      "loss": 1.1269,
      "step": 60
    },
    {
      "epoch": 0.0208,
      "grad_norm": 6.347010612487793,
      "learning_rate": 6.808510638297872e-07,
      "loss": 1.1201,
      "step": 65
    },
    {
      "epoch": 0.0224,
      "grad_norm": 12.77128791809082,
      "learning_rate": 7.340425531914893e-07,
      "loss": 1.0846,
      "step": 70
    },
    {
      "epoch": 0.024,
      "grad_norm": 8.114184379577637,
      "learning_rate": 7.872340425531915e-07,
      "loss": 1.151,
      "step": 75
    },
    {
      "epoch": 0.0256,
      "grad_norm": 17.945985794067383,
      "learning_rate": 8.404255319148936e-07,
      "loss": 1.0711,
      "step": 80
    },
    {
      "epoch": 0.0272,
      "grad_norm": 5.469882965087891,
      "learning_rate": 8.936170212765957e-07,
      "loss": 0.9901,
      "step": 85
    },
    {
      "epoch": 0.0288,
      "grad_norm": 10.689380645751953,
      "learning_rate": 9.468085106382978e-07,
      "loss": 1.2635,
      "step": 90
    },
    {
      "epoch": 0.0304,
      "grad_norm": 9.217676162719727,
      "learning_rate": 1e-06,
      "loss": 1.0604,
      "step": 95
    },
    {
      "epoch": 0.032,
      "grad_norm": 4.253987789154053,
      "learning_rate": 9.98350379412735e-07,
      "loss": 1.1865,
      "step": 100
    },
    {
      "epoch": 0.0336,
      "grad_norm": 10.984722137451172,
      "learning_rate": 9.967007588254702e-07,
      "loss": 1.1504,
      "step": 105
    },
    {
      "epoch": 0.0352,
      "grad_norm": 6.79901647567749,
      "learning_rate": 9.950511382382052e-07,
      "loss": 1.0338,
      "step": 110
    },
    {
      "epoch": 0.0368,
      "grad_norm": 10.826879501342773,
      "learning_rate": 9.934015176509404e-07,
      "loss": 1.1058,
      "step": 115
    },
    {
      "epoch": 0.0384,
      "grad_norm": 6.437893390655518,
      "learning_rate": 9.917518970636754e-07,
      "loss": 1.0317,
      "step": 120
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.419424295425415,
      "learning_rate": 9.901022764764103e-07,
      "loss": 1.0573,
      "step": 125
    },
    {
      "epoch": 0.0416,
      "grad_norm": 14.196756362915039,
      "learning_rate": 9.884526558891456e-07,
      "loss": 1.0892,
      "step": 130
    },
    {
      "epoch": 0.0432,
      "grad_norm": 4.492501258850098,
      "learning_rate": 9.868030353018806e-07,
      "loss": 0.9772,
      "step": 135
    },
    {
      "epoch": 0.0448,
      "grad_norm": 8.752493858337402,
      "learning_rate": 9.851534147146155e-07,
      "loss": 0.984,
      "step": 140
    },
    {
      "epoch": 0.0464,
      "grad_norm": 3.8549039363861084,
      "learning_rate": 9.835037941273505e-07,
      "loss": 0.9935,
      "step": 145
    },
    {
      "epoch": 0.048,
      "grad_norm": 5.623340129852295,
      "learning_rate": 9.818541735400857e-07,
      "loss": 1.1332,
      "step": 150
    },
    {
      "epoch": 0.0496,
      "grad_norm": 5.848334789276123,
      "learning_rate": 9.802045529528207e-07,
      "loss": 0.912,
      "step": 155
    },
    {
      "epoch": 0.0512,
      "grad_norm": 7.2841877937316895,
      "learning_rate": 9.78554932365556e-07,
      "loss": 1.1562,
      "step": 160
    },
    {
      "epoch": 0.0528,
      "grad_norm": 5.735965728759766,
      "learning_rate": 9.76905311778291e-07,
      "loss": 0.9819,
      "step": 165
    },
    {
      "epoch": 0.0544,
      "grad_norm": 3.197845220565796,
      "learning_rate": 9.75255691191026e-07,
      "loss": 0.9843,
      "step": 170
    },
    {
      "epoch": 0.056,
      "grad_norm": 8.21580696105957,
      "learning_rate": 9.736060706037611e-07,
      "loss": 0.9533,
      "step": 175
    },
    {
      "epoch": 0.0576,
      "grad_norm": 5.551872730255127,
      "learning_rate": 9.719564500164961e-07,
      "loss": 1.0205,
      "step": 180
    },
    {
      "epoch": 0.0592,
      "grad_norm": 5.322945594787598,
      "learning_rate": 9.703068294292313e-07,
      "loss": 1.0036,
      "step": 185
    },
    {
      "epoch": 0.0608,
      "grad_norm": 4.90363883972168,
      "learning_rate": 9.686572088419663e-07,
      "loss": 1.0135,
      "step": 190
    },
    {
      "epoch": 0.0624,
      "grad_norm": 8.087169647216797,
      "learning_rate": 9.670075882547013e-07,
      "loss": 1.0293,
      "step": 195
    },
    {
      "epoch": 0.064,
      "grad_norm": 4.1587910652160645,
      "learning_rate": 9.653579676674365e-07,
      "loss": 1.0719,
      "step": 200
    },
    {
      "epoch": 0.0656,
      "grad_norm": 3.2837698459625244,
      "learning_rate": 9.637083470801715e-07,
      "loss": 1.0107,
      "step": 205
    },
    {
      "epoch": 0.0672,
      "grad_norm": 5.359975814819336,
      "learning_rate": 9.620587264929065e-07,
      "loss": 0.9036,
      "step": 210
    },
    {
      "epoch": 0.0688,
      "grad_norm": 5.2580037117004395,
      "learning_rate": 9.604091059056415e-07,
      "loss": 0.9704,
      "step": 215
    },
    {
      "epoch": 0.0704,
      "grad_norm": 9.796117782592773,
      "learning_rate": 9.587594853183767e-07,
      "loss": 0.9456,
      "step": 220
    },
    {
      "epoch": 0.072,
      "grad_norm": 10.24465560913086,
      "learning_rate": 9.571098647311117e-07,
      "loss": 1.0502,
      "step": 225
    },
    {
      "epoch": 0.0736,
      "grad_norm": 7.899555683135986,
      "learning_rate": 9.55460244143847e-07,
      "loss": 0.9849,
      "step": 230
    },
    {
      "epoch": 0.0752,
      "grad_norm": 6.677064418792725,
      "learning_rate": 9.53810623556582e-07,
      "loss": 1.103,
      "step": 235
    },
    {
      "epoch": 0.0768,
      "grad_norm": 8.85185718536377,
      "learning_rate": 9.52161002969317e-07,
      "loss": 0.9351,
      "step": 240
    },
    {
      "epoch": 0.0784,
      "grad_norm": 7.826456069946289,
      "learning_rate": 9.505113823820521e-07,
      "loss": 0.9369,
      "step": 245
    },
    {
      "epoch": 0.08,
      "grad_norm": 7.929803371429443,
      "learning_rate": 9.488617617947871e-07,
      "loss": 0.8399,
      "step": 250
    },
    {
      "epoch": 0.0816,
      "grad_norm": 4.857858180999756,
      "learning_rate": 9.472121412075222e-07,
      "loss": 0.9738,
      "step": 255
    },
    {
      "epoch": 0.0832,
      "grad_norm": 7.609739303588867,
      "learning_rate": 9.455625206202573e-07,
      "loss": 1.0397,
      "step": 260
    },
    {
      "epoch": 0.0848,
      "grad_norm": 4.687152862548828,
      "learning_rate": 9.439129000329924e-07,
      "loss": 1.0207,
      "step": 265
    },
    {
      "epoch": 0.0864,
      "grad_norm": 3.707120180130005,
      "learning_rate": 9.422632794457274e-07,
      "loss": 1.0404,
      "step": 270
    },
    {
      "epoch": 0.088,
      "grad_norm": 7.183946132659912,
      "learning_rate": 9.406136588584625e-07,
      "loss": 0.9329,
      "step": 275
    },
    {
      "epoch": 0.0896,
      "grad_norm": 4.107320785522461,
      "learning_rate": 9.389640382711976e-07,
      "loss": 0.9674,
      "step": 280
    },
    {
      "epoch": 0.0912,
      "grad_norm": 2.987569808959961,
      "learning_rate": 9.373144176839326e-07,
      "loss": 1.0431,
      "step": 285
    },
    {
      "epoch": 0.0928,
      "grad_norm": 7.492343902587891,
      "learning_rate": 9.356647970966677e-07,
      "loss": 0.8819,
      "step": 290
    },
    {
      "epoch": 0.0944,
      "grad_norm": 3.460360050201416,
      "learning_rate": 9.340151765094027e-07,
      "loss": 1.1553,
      "step": 295
    },
    {
      "epoch": 0.096,
      "grad_norm": 4.670125961303711,
      "learning_rate": 9.323655559221378e-07,
      "loss": 0.9511,
      "step": 300
    },
    {
      "epoch": 0.0976,
      "grad_norm": 6.766526699066162,
      "learning_rate": 9.307159353348729e-07,
      "loss": 0.9625,
      "step": 305
    },
    {
      "epoch": 0.0992,
      "grad_norm": 3.6760966777801514,
      "learning_rate": 9.29066314747608e-07,
      "loss": 0.9503,
      "step": 310
    },
    {
      "epoch": 0.1008,
      "grad_norm": 9.794249534606934,
      "learning_rate": 9.27416694160343e-07,
      "loss": 0.9533,
      "step": 315
    },
    {
      "epoch": 0.1024,
      "grad_norm": 8.894811630249023,
      "learning_rate": 9.257670735730781e-07,
      "loss": 0.8668,
      "step": 320
    },
    {
      "epoch": 0.104,
      "grad_norm": 3.3411269187927246,
      "learning_rate": 9.241174529858132e-07,
      "loss": 0.9171,
      "step": 325
    },
    {
      "epoch": 0.1056,
      "grad_norm": 9.227668762207031,
      "learning_rate": 9.224678323985483e-07,
      "loss": 0.9065,
      "step": 330
    },
    {
      "epoch": 0.1072,
      "grad_norm": 3.217501640319824,
      "learning_rate": 9.208182118112834e-07,
      "loss": 0.945,
      "step": 335
    },
    {
      "epoch": 0.1088,
      "grad_norm": 3.9283738136291504,
      "learning_rate": 9.191685912240184e-07,
      "loss": 0.9166,
      "step": 340
    },
    {
      "epoch": 0.1104,
      "grad_norm": 7.905593395233154,
      "learning_rate": 9.175189706367535e-07,
      "loss": 1.0471,
      "step": 345
    },
    {
      "epoch": 0.112,
      "grad_norm": 3.8356964588165283,
      "learning_rate": 9.158693500494886e-07,
      "loss": 0.9593,
      "step": 350
    },
    {
      "epoch": 0.1136,
      "grad_norm": 4.25161600112915,
      "learning_rate": 9.142197294622237e-07,
      "loss": 0.9251,
      "step": 355
    },
    {
      "epoch": 0.1152,
      "grad_norm": 2.885007381439209,
      "learning_rate": 9.125701088749587e-07,
      "loss": 0.9124,
      "step": 360
    },
    {
      "epoch": 0.1168,
      "grad_norm": 6.2251763343811035,
      "learning_rate": 9.109204882876937e-07,
      "loss": 1.0204,
      "step": 365
    },
    {
      "epoch": 0.1184,
      "grad_norm": 5.769200801849365,
      "learning_rate": 9.092708677004288e-07,
      "loss": 0.8877,
      "step": 370
    },
    {
      "epoch": 0.12,
      "grad_norm": 6.167758941650391,
      "learning_rate": 9.076212471131639e-07,
      "loss": 0.9371,
      "step": 375
    },
    {
      "epoch": 0.1216,
      "grad_norm": 4.101468563079834,
      "learning_rate": 9.05971626525899e-07,
      "loss": 0.9571,
      "step": 380
    },
    {
      "epoch": 0.1232,
      "grad_norm": 3.352560043334961,
      "learning_rate": 9.04322005938634e-07,
      "loss": 0.9988,
      "step": 385
    },
    {
      "epoch": 0.1248,
      "grad_norm": 4.724895000457764,
      "learning_rate": 9.026723853513691e-07,
      "loss": 0.972,
      "step": 390
    },
    {
      "epoch": 0.1264,
      "grad_norm": 5.613089561462402,
      "learning_rate": 9.010227647641042e-07,
      "loss": 0.8806,
      "step": 395
    },
    {
      "epoch": 0.128,
      "grad_norm": 4.489896774291992,
      "learning_rate": 8.993731441768393e-07,
      "loss": 0.9119,
      "step": 400
    },
    {
      "epoch": 0.1296,
      "grad_norm": 6.169275760650635,
      "learning_rate": 8.977235235895744e-07,
      "loss": 0.9942,
      "step": 405
    },
    {
      "epoch": 0.1312,
      "grad_norm": 4.484738826751709,
      "learning_rate": 8.960739030023094e-07,
      "loss": 0.8567,
      "step": 410
    },
    {
      "epoch": 0.1328,
      "grad_norm": 3.1110870838165283,
      "learning_rate": 8.944242824150445e-07,
      "loss": 0.912,
      "step": 415
    },
    {
      "epoch": 0.1344,
      "grad_norm": 7.398248672485352,
      "learning_rate": 8.927746618277796e-07,
      "loss": 0.9913,
      "step": 420
    },
    {
      "epoch": 0.136,
      "grad_norm": 6.651194095611572,
      "learning_rate": 8.911250412405147e-07,
      "loss": 0.8492,
      "step": 425
    },
    {
      "epoch": 0.1376,
      "grad_norm": 6.037364482879639,
      "learning_rate": 8.894754206532498e-07,
      "loss": 0.9839,
      "step": 430
    },
    {
      "epoch": 0.1392,
      "grad_norm": 3.9697699546813965,
      "learning_rate": 8.878258000659847e-07,
      "loss": 0.9343,
      "step": 435
    },
    {
      "epoch": 0.1408,
      "grad_norm": 4.4155497550964355,
      "learning_rate": 8.861761794787198e-07,
      "loss": 0.9474,
      "step": 440
    },
    {
      "epoch": 0.1424,
      "grad_norm": 4.292988300323486,
      "learning_rate": 8.845265588914549e-07,
      "loss": 1.0596,
      "step": 445
    },
    {
      "epoch": 0.144,
      "grad_norm": 2.772756338119507,
      "learning_rate": 8.8287693830419e-07,
      "loss": 0.9765,
      "step": 450
    },
    {
      "epoch": 0.1456,
      "grad_norm": 7.738980770111084,
      "learning_rate": 8.81227317716925e-07,
      "loss": 0.8599,
      "step": 455
    },
    {
      "epoch": 0.1472,
      "grad_norm": 9.246415138244629,
      "learning_rate": 8.795776971296601e-07,
      "loss": 0.9711,
      "step": 460
    },
    {
      "epoch": 0.1488,
      "grad_norm": 5.940875053405762,
      "learning_rate": 8.779280765423952e-07,
      "loss": 0.9433,
      "step": 465
    },
    {
      "epoch": 0.1504,
      "grad_norm": 6.259022235870361,
      "learning_rate": 8.762784559551303e-07,
      "loss": 0.9859,
      "step": 470
    },
    {
      "epoch": 0.152,
      "grad_norm": 7.941705226898193,
      "learning_rate": 8.746288353678654e-07,
      "loss": 0.8696,
      "step": 475
    },
    {
      "epoch": 0.1536,
      "grad_norm": 3.571704626083374,
      "learning_rate": 8.729792147806004e-07,
      "loss": 0.943,
      "step": 480
    },
    {
      "epoch": 0.1552,
      "grad_norm": 4.129303455352783,
      "learning_rate": 8.713295941933355e-07,
      "loss": 0.8251,
      "step": 485
    },
    {
      "epoch": 0.1568,
      "grad_norm": 8.326216697692871,
      "learning_rate": 8.696799736060706e-07,
      "loss": 0.9393,
      "step": 490
    },
    {
      "epoch": 0.1584,
      "grad_norm": 2.903012275695801,
      "learning_rate": 8.680303530188057e-07,
      "loss": 0.8944,
      "step": 495
    },
    {
      "epoch": 0.16,
      "grad_norm": 5.4961628913879395,
      "learning_rate": 8.663807324315408e-07,
      "loss": 1.0191,
      "step": 500
    },
    {
      "epoch": 0.1616,
      "grad_norm": 17.958810806274414,
      "learning_rate": 8.647311118442758e-07,
      "loss": 0.8689,
      "step": 505
    },
    {
      "epoch": 0.1632,
      "grad_norm": 7.708248138427734,
      "learning_rate": 8.630814912570108e-07,
      "loss": 0.9344,
      "step": 510
    },
    {
      "epoch": 0.1648,
      "grad_norm": 3.0089898109436035,
      "learning_rate": 8.614318706697459e-07,
      "loss": 0.9085,
      "step": 515
    },
    {
      "epoch": 0.1664,
      "grad_norm": 3.333603858947754,
      "learning_rate": 8.59782250082481e-07,
      "loss": 0.9389,
      "step": 520
    },
    {
      "epoch": 0.168,
      "grad_norm": 4.273075580596924,
      "learning_rate": 8.58132629495216e-07,
      "loss": 0.9545,
      "step": 525
    },
    {
      "epoch": 0.1696,
      "grad_norm": 3.9365367889404297,
      "learning_rate": 8.564830089079511e-07,
      "loss": 1.044,
      "step": 530
    },
    {
      "epoch": 0.1712,
      "grad_norm": 8.090559959411621,
      "learning_rate": 8.548333883206862e-07,
      "loss": 0.8949,
      "step": 535
    },
    {
      "epoch": 0.1728,
      "grad_norm": 3.675675868988037,
      "learning_rate": 8.531837677334213e-07,
      "loss": 0.8984,
      "step": 540
    },
    {
      "epoch": 0.1744,
      "grad_norm": 4.396546840667725,
      "learning_rate": 8.515341471461564e-07,
      "loss": 0.8276,
      "step": 545
    },
    {
      "epoch": 0.176,
      "grad_norm": 5.8129706382751465,
      "learning_rate": 8.498845265588914e-07,
      "loss": 0.9485,
      "step": 550
    },
    {
      "epoch": 0.1776,
      "grad_norm": 4.243994235992432,
      "learning_rate": 8.482349059716265e-07,
      "loss": 0.8613,
      "step": 555
    },
    {
      "epoch": 0.1792,
      "grad_norm": 3.252338409423828,
      "learning_rate": 8.465852853843616e-07,
      "loss": 0.9014,
      "step": 560
    },
    {
      "epoch": 0.1808,
      "grad_norm": 2.9563217163085938,
      "learning_rate": 8.449356647970967e-07,
      "loss": 0.9632,
      "step": 565
    },
    {
      "epoch": 0.1824,
      "grad_norm": 4.2105302810668945,
      "learning_rate": 8.432860442098317e-07,
      "loss": 0.8758,
      "step": 570
    },
    {
      "epoch": 0.184,
      "grad_norm": 4.756136417388916,
      "learning_rate": 8.416364236225668e-07,
      "loss": 0.9102,
      "step": 575
    },
    {
      "epoch": 0.1856,
      "grad_norm": 3.9406795501708984,
      "learning_rate": 8.399868030353019e-07,
      "loss": 0.9286,
      "step": 580
    },
    {
      "epoch": 0.1872,
      "grad_norm": 5.04837703704834,
      "learning_rate": 8.383371824480369e-07,
      "loss": 1.0353,
      "step": 585
    },
    {
      "epoch": 0.1888,
      "grad_norm": 2.621293306350708,
      "learning_rate": 8.36687561860772e-07,
      "loss": 0.9047,
      "step": 590
    },
    {
      "epoch": 0.1904,
      "grad_norm": 4.60697078704834,
      "learning_rate": 8.35037941273507e-07,
      "loss": 0.9403,
      "step": 595
    },
    {
      "epoch": 0.192,
      "grad_norm": 3.5132675170898438,
      "learning_rate": 8.333883206862421e-07,
      "loss": 0.9576,
      "step": 600
    },
    {
      "epoch": 0.1936,
      "grad_norm": 6.349253177642822,
      "learning_rate": 8.317387000989772e-07,
      "loss": 0.9443,
      "step": 605
    },
    {
      "epoch": 0.1952,
      "grad_norm": 4.456959247589111,
      "learning_rate": 8.300890795117123e-07,
      "loss": 0.9476,
      "step": 610
    },
    {
      "epoch": 0.1968,
      "grad_norm": 3.2254133224487305,
      "learning_rate": 8.284394589244474e-07,
      "loss": 0.9401,
      "step": 615
    },
    {
      "epoch": 0.1984,
      "grad_norm": 4.775053977966309,
      "learning_rate": 8.267898383371824e-07,
      "loss": 1.0189,
      "step": 620
    },
    {
      "epoch": 0.2,
      "grad_norm": 5.755006790161133,
      "learning_rate": 8.251402177499175e-07,
      "loss": 0.8521,
      "step": 625
    },
    {
      "epoch": 0.2016,
      "grad_norm": 4.0426716804504395,
      "learning_rate": 8.234905971626526e-07,
      "loss": 0.9075,
      "step": 630
    },
    {
      "epoch": 0.2032,
      "grad_norm": 4.348297119140625,
      "learning_rate": 8.218409765753877e-07,
      "loss": 0.9625,
      "step": 635
    },
    {
      "epoch": 0.2048,
      "grad_norm": 6.654307842254639,
      "learning_rate": 8.201913559881227e-07,
      "loss": 0.8582,
      "step": 640
    },
    {
      "epoch": 0.2064,
      "grad_norm": 4.090488910675049,
      "learning_rate": 8.185417354008578e-07,
      "loss": 0.9359,
      "step": 645
    },
    {
      "epoch": 0.208,
      "grad_norm": 5.286700248718262,
      "learning_rate": 8.168921148135929e-07,
      "loss": 0.9783,
      "step": 650
    },
    {
      "epoch": 0.2096,
      "grad_norm": 4.438581466674805,
      "learning_rate": 8.15242494226328e-07,
      "loss": 0.9526,
      "step": 655
    },
    {
      "epoch": 0.2112,
      "grad_norm": 3.836512565612793,
      "learning_rate": 8.13592873639063e-07,
      "loss": 0.8768,
      "step": 660
    },
    {
      "epoch": 0.2128,
      "grad_norm": 3.1037371158599854,
      "learning_rate": 8.11943253051798e-07,
      "loss": 0.963,
      "step": 665
    },
    {
      "epoch": 0.2144,
      "grad_norm": 5.103884696960449,
      "learning_rate": 8.102936324645331e-07,
      "loss": 0.9322,
      "step": 670
    },
    {
      "epoch": 0.216,
      "grad_norm": 3.707827091217041,
      "learning_rate": 8.086440118772682e-07,
      "loss": 0.8691,
      "step": 675
    },
    {
      "epoch": 0.2176,
      "grad_norm": 3.7925477027893066,
      "learning_rate": 8.069943912900033e-07,
      "loss": 0.9636,
      "step": 680
    },
    {
      "epoch": 0.2192,
      "grad_norm": 4.857919692993164,
      "learning_rate": 8.053447707027383e-07,
      "loss": 0.8867,
      "step": 685
    },
    {
      "epoch": 0.2208,
      "grad_norm": 3.513091564178467,
      "learning_rate": 8.036951501154734e-07,
      "loss": 0.9646,
      "step": 690
    },
    {
      "epoch": 0.2224,
      "grad_norm": 3.4893958568573,
      "learning_rate": 8.020455295282085e-07,
      "loss": 0.9276,
      "step": 695
    },
    {
      "epoch": 0.224,
      "grad_norm": 3.087334156036377,
      "learning_rate": 8.003959089409436e-07,
      "loss": 0.89,
      "step": 700
    },
    {
      "epoch": 0.2256,
      "grad_norm": 4.584767818450928,
      "learning_rate": 7.987462883536787e-07,
      "loss": 1.0421,
      "step": 705
    },
    {
      "epoch": 0.2272,
      "grad_norm": 3.7277486324310303,
      "learning_rate": 7.970966677664137e-07,
      "loss": 0.8757,
      "step": 710
    },
    {
      "epoch": 0.2288,
      "grad_norm": 4.989358425140381,
      "learning_rate": 7.954470471791488e-07,
      "loss": 0.8421,
      "step": 715
    },
    {
      "epoch": 0.2304,
      "grad_norm": 4.230960845947266,
      "learning_rate": 7.937974265918839e-07,
      "loss": 0.883,
      "step": 720
    },
    {
      "epoch": 0.232,
      "grad_norm": 7.118892192840576,
      "learning_rate": 7.92147806004619e-07,
      "loss": 0.9549,
      "step": 725
    },
    {
      "epoch": 0.2336,
      "grad_norm": 10.041189193725586,
      "learning_rate": 7.904981854173541e-07,
      "loss": 0.9897,
      "step": 730
    },
    {
      "epoch": 0.2352,
      "grad_norm": 3.822767734527588,
      "learning_rate": 7.88848564830089e-07,
      "loss": 0.8711,
      "step": 735
    },
    {
      "epoch": 0.2368,
      "grad_norm": 4.475744724273682,
      "learning_rate": 7.871989442428241e-07,
      "loss": 0.9175,
      "step": 740
    },
    {
      "epoch": 0.2384,
      "grad_norm": 5.1056976318359375,
      "learning_rate": 7.855493236555592e-07,
      "loss": 0.9022,
      "step": 745
    },
    {
      "epoch": 0.24,
      "grad_norm": 4.522522926330566,
      "learning_rate": 7.838997030682943e-07,
      "loss": 1.0007,
      "step": 750
    },
    {
      "epoch": 0.2416,
      "grad_norm": 4.390966415405273,
      "learning_rate": 7.822500824810293e-07,
      "loss": 0.9074,
      "step": 755
    },
    {
      "epoch": 0.2432,
      "grad_norm": 7.068999290466309,
      "learning_rate": 7.806004618937644e-07,
      "loss": 0.8454,
      "step": 760
    },
    {
      "epoch": 0.2448,
      "grad_norm": 3.558549642562866,
      "learning_rate": 7.789508413064995e-07,
      "loss": 0.88,
      "step": 765
    },
    {
      "epoch": 0.2464,
      "grad_norm": 4.7729949951171875,
      "learning_rate": 7.773012207192346e-07,
      "loss": 0.8577,
      "step": 770
    },
    {
      "epoch": 0.248,
      "grad_norm": 3.512878894805908,
      "learning_rate": 7.756516001319697e-07,
      "loss": 1.0939,
      "step": 775
    },
    {
      "epoch": 0.2496,
      "grad_norm": 2.6263558864593506,
      "learning_rate": 7.740019795447047e-07,
      "loss": 0.8807,
      "step": 780
    },
    {
      "epoch": 0.2512,
      "grad_norm": 2.518568992614746,
      "learning_rate": 7.723523589574398e-07,
      "loss": 0.7706,
      "step": 785
    },
    {
      "epoch": 0.2528,
      "grad_norm": 5.156455993652344,
      "learning_rate": 7.707027383701749e-07,
      "loss": 0.8809,
      "step": 790
    },
    {
      "epoch": 0.2544,
      "grad_norm": 7.734130382537842,
      "learning_rate": 7.6905311778291e-07,
      "loss": 0.967,
      "step": 795
    },
    {
      "epoch": 0.256,
      "grad_norm": 2.6134588718414307,
      "learning_rate": 7.674034971956451e-07,
      "loss": 0.7965,
      "step": 800
    },
    {
      "epoch": 0.2576,
      "grad_norm": 9.95977783203125,
      "learning_rate": 7.657538766083801e-07,
      "loss": 0.9095,
      "step": 805
    },
    {
      "epoch": 0.2592,
      "grad_norm": 3.093651533126831,
      "learning_rate": 7.64104256021115e-07,
      "loss": 0.9707,
      "step": 810
    },
    {
      "epoch": 0.2608,
      "grad_norm": 3.039573907852173,
      "learning_rate": 7.624546354338501e-07,
      "loss": 0.9317,
      "step": 815
    },
    {
      "epoch": 0.2624,
      "grad_norm": 4.40300989151001,
      "learning_rate": 7.608050148465853e-07,
      "loss": 0.8223,
      "step": 820
    },
    {
      "epoch": 0.264,
      "grad_norm": 5.527564525604248,
      "learning_rate": 7.591553942593202e-07,
      "loss": 0.9234,
      "step": 825
    },
    {
      "epoch": 0.2656,
      "grad_norm": 5.654271602630615,
      "learning_rate": 7.575057736720553e-07,
      "loss": 0.9437,
      "step": 830
    },
    {
      "epoch": 0.2672,
      "grad_norm": 5.316553115844727,
      "learning_rate": 7.558561530847904e-07,
      "loss": 0.8862,
      "step": 835
    },
    {
      "epoch": 0.2688,
      "grad_norm": 2.8125505447387695,
      "learning_rate": 7.542065324975255e-07,
      "loss": 0.8235,
      "step": 840
    },
    {
      "epoch": 0.2704,
      "grad_norm": 5.254530429840088,
      "learning_rate": 7.525569119102606e-07,
      "loss": 0.8986,
      "step": 845
    },
    {
      "epoch": 0.272,
      "grad_norm": 5.69275426864624,
      "learning_rate": 7.509072913229956e-07,
      "loss": 0.9043,
      "step": 850
    },
    {
      "epoch": 0.2736,
      "grad_norm": 4.995587348937988,
      "learning_rate": 7.492576707357307e-07,
      "loss": 0.8242,
      "step": 855
    },
    {
      "epoch": 0.2752,
      "grad_norm": 8.568499565124512,
      "learning_rate": 7.476080501484658e-07,
      "loss": 0.9116,
      "step": 860
    },
    {
      "epoch": 0.2768,
      "grad_norm": 5.804699420928955,
      "learning_rate": 7.45958429561201e-07,
      "loss": 0.863,
      "step": 865
    },
    {
      "epoch": 0.2784,
      "grad_norm": 7.921741962432861,
      "learning_rate": 7.44308808973936e-07,
      "loss": 0.9719,
      "step": 870
    },
    {
      "epoch": 0.28,
      "grad_norm": 5.888009071350098,
      "learning_rate": 7.42659188386671e-07,
      "loss": 0.8999,
      "step": 875
    },
    {
      "epoch": 0.2816,
      "grad_norm": 3.9009482860565186,
      "learning_rate": 7.410095677994061e-07,
      "loss": 0.9543,
      "step": 880
    },
    {
      "epoch": 0.2832,
      "grad_norm": 3.502060651779175,
      "learning_rate": 7.393599472121411e-07,
      "loss": 0.8367,
      "step": 885
    },
    {
      "epoch": 0.2848,
      "grad_norm": 4.675789833068848,
      "learning_rate": 7.377103266248762e-07,
      "loss": 0.8843,
      "step": 890
    },
    {
      "epoch": 0.2864,
      "grad_norm": 3.246445417404175,
      "learning_rate": 7.360607060376112e-07,
      "loss": 0.9663,
      "step": 895
    },
    {
      "epoch": 0.288,
      "grad_norm": 3.168081521987915,
      "learning_rate": 7.344110854503463e-07,
      "loss": 0.8186,
      "step": 900
    },
    {
      "epoch": 0.2896,
      "grad_norm": 5.229098320007324,
      "learning_rate": 7.327614648630814e-07,
      "loss": 0.9686,
      "step": 905
    },
    {
      "epoch": 0.2912,
      "grad_norm": 6.258688926696777,
      "learning_rate": 7.311118442758165e-07,
      "loss": 0.8538,
      "step": 910
    },
    {
      "epoch": 0.2928,
      "grad_norm": 5.021489143371582,
      "learning_rate": 7.294622236885516e-07,
      "loss": 0.7937,
      "step": 915
    },
    {
      "epoch": 0.2944,
      "grad_norm": 3.9986860752105713,
      "learning_rate": 7.278126031012866e-07,
      "loss": 0.8503,
      "step": 920
    },
    {
      "epoch": 0.296,
      "grad_norm": 3.62813663482666,
      "learning_rate": 7.261629825140217e-07,
      "loss": 0.8199,
      "step": 925
    },
    {
      "epoch": 0.2976,
      "grad_norm": 4.517638683319092,
      "learning_rate": 7.245133619267568e-07,
      "loss": 0.8521,
      "step": 930
    },
    {
      "epoch": 0.2992,
      "grad_norm": 8.5663423538208,
      "learning_rate": 7.228637413394919e-07,
      "loss": 0.9132,
      "step": 935
    },
    {
      "epoch": 0.3008,
      "grad_norm": 4.776329040527344,
      "learning_rate": 7.212141207522269e-07,
      "loss": 0.8566,
      "step": 940
    },
    {
      "epoch": 0.3024,
      "grad_norm": 4.764708518981934,
      "learning_rate": 7.19564500164962e-07,
      "loss": 0.8181,
      "step": 945
    },
    {
      "epoch": 0.304,
      "grad_norm": 3.810011863708496,
      "learning_rate": 7.179148795776971e-07,
      "loss": 1.003,
      "step": 950
    },
    {
      "epoch": 0.3056,
      "grad_norm": 4.385900020599365,
      "learning_rate": 7.162652589904322e-07,
      "loss": 0.8525,
      "step": 955
    },
    {
      "epoch": 0.3072,
      "grad_norm": 8.322861671447754,
      "learning_rate": 7.146156384031672e-07,
      "loss": 0.9047,
      "step": 960
    },
    {
      "epoch": 0.3088,
      "grad_norm": 6.5899224281311035,
      "learning_rate": 7.129660178159022e-07,
      "loss": 0.7103,
      "step": 965
    },
    {
      "epoch": 0.3104,
      "grad_norm": 6.054207801818848,
      "learning_rate": 7.113163972286373e-07,
      "loss": 0.876,
      "step": 970
    },
    {
      "epoch": 0.312,
      "grad_norm": 3.6956799030303955,
      "learning_rate": 7.096667766413724e-07,
      "loss": 0.9382,
      "step": 975
    },
    {
      "epoch": 0.3136,
      "grad_norm": 4.055649757385254,
      "learning_rate": 7.080171560541075e-07,
      "loss": 0.8944,
      "step": 980
    },
    {
      "epoch": 0.3152,
      "grad_norm": 2.8398051261901855,
      "learning_rate": 7.063675354668426e-07,
      "loss": 0.8233,
      "step": 985
    },
    {
      "epoch": 0.3168,
      "grad_norm": 5.903645992279053,
      "learning_rate": 7.047179148795776e-07,
      "loss": 1.0067,
      "step": 990
    },
    {
      "epoch": 0.3184,
      "grad_norm": 5.374630451202393,
      "learning_rate": 7.030682942923127e-07,
      "loss": 0.8423,
      "step": 995
    },
    {
      "epoch": 0.32,
      "grad_norm": 6.729516506195068,
      "learning_rate": 7.014186737050478e-07,
      "loss": 1.0209,
      "step": 1000
    },
    {
      "epoch": 0.3216,
      "grad_norm": 3.4207727909088135,
      "learning_rate": 6.997690531177829e-07,
      "loss": 0.9196,
      "step": 1005
    },
    {
      "epoch": 0.3232,
      "grad_norm": 5.797353744506836,
      "learning_rate": 6.981194325305179e-07,
      "loss": 0.921,
      "step": 1010
    },
    {
      "epoch": 0.3248,
      "grad_norm": 4.802167892456055,
      "learning_rate": 6.96469811943253e-07,
      "loss": 0.8844,
      "step": 1015
    },
    {
      "epoch": 0.3264,
      "grad_norm": 6.671936511993408,
      "learning_rate": 6.948201913559881e-07,
      "loss": 0.8399,
      "step": 1020
    },
    {
      "epoch": 0.328,
      "grad_norm": 4.027926921844482,
      "learning_rate": 6.931705707687232e-07,
      "loss": 0.8733,
      "step": 1025
    },
    {
      "epoch": 0.3296,
      "grad_norm": 7.0996575355529785,
      "learning_rate": 6.915209501814583e-07,
      "loss": 0.8886,
      "step": 1030
    },
    {
      "epoch": 0.3312,
      "grad_norm": 3.9534752368927,
      "learning_rate": 6.898713295941932e-07,
      "loss": 0.9144,
      "step": 1035
    },
    {
      "epoch": 0.3328,
      "grad_norm": 5.016911506652832,
      "learning_rate": 6.882217090069283e-07,
      "loss": 0.9936,
      "step": 1040
    },
    {
      "epoch": 0.3344,
      "grad_norm": 3.6231181621551514,
      "learning_rate": 6.865720884196634e-07,
      "loss": 0.7205,
      "step": 1045
    },
    {
      "epoch": 0.336,
      "grad_norm": 3.6556529998779297,
      "learning_rate": 6.849224678323985e-07,
      "loss": 0.914,
      "step": 1050
    },
    {
      "epoch": 0.3376,
      "grad_norm": 3.27970552444458,
      "learning_rate": 6.832728472451335e-07,
      "loss": 0.8306,
      "step": 1055
    },
    {
      "epoch": 0.3392,
      "grad_norm": 3.816570997238159,
      "learning_rate": 6.816232266578686e-07,
      "loss": 0.9458,
      "step": 1060
    },
    {
      "epoch": 0.3408,
      "grad_norm": 7.391907215118408,
      "learning_rate": 6.799736060706037e-07,
      "loss": 0.8389,
      "step": 1065
    },
    {
      "epoch": 0.3424,
      "grad_norm": 3.756998300552368,
      "learning_rate": 6.783239854833388e-07,
      "loss": 0.8889,
      "step": 1070
    },
    {
      "epoch": 0.344,
      "grad_norm": 4.19740629196167,
      "learning_rate": 6.766743648960739e-07,
      "loss": 0.8942,
      "step": 1075
    },
    {
      "epoch": 0.3456,
      "grad_norm": 4.351824760437012,
      "learning_rate": 6.750247443088089e-07,
      "loss": 0.9602,
      "step": 1080
    },
    {
      "epoch": 0.3472,
      "grad_norm": 3.371953010559082,
      "learning_rate": 6.73375123721544e-07,
      "loss": 0.9365,
      "step": 1085
    },
    {
      "epoch": 0.3488,
      "grad_norm": 5.847254753112793,
      "learning_rate": 6.717255031342791e-07,
      "loss": 0.7705,
      "step": 1090
    },
    {
      "epoch": 0.3504,
      "grad_norm": 5.160730361938477,
      "learning_rate": 6.700758825470142e-07,
      "loss": 0.9221,
      "step": 1095
    },
    {
      "epoch": 0.352,
      "grad_norm": 3.4430952072143555,
      "learning_rate": 6.684262619597493e-07,
      "loss": 0.9484,
      "step": 1100
    },
    {
      "epoch": 0.3536,
      "grad_norm": 4.218683242797852,
      "learning_rate": 6.667766413724843e-07,
      "loss": 0.8446,
      "step": 1105
    },
    {
      "epoch": 0.3552,
      "grad_norm": 5.120244026184082,
      "learning_rate": 6.651270207852193e-07,
      "loss": 0.9515,
      "step": 1110
    },
    {
      "epoch": 0.3568,
      "grad_norm": 5.609252452850342,
      "learning_rate": 6.634774001979544e-07,
      "loss": 0.8694,
      "step": 1115
    },
    {
      "epoch": 0.3584,
      "grad_norm": 3.753680467605591,
      "learning_rate": 6.618277796106895e-07,
      "loss": 0.9242,
      "step": 1120
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.665069580078125,
      "learning_rate": 6.601781590234245e-07,
      "loss": 0.9034,
      "step": 1125
    },
    {
      "epoch": 0.3616,
      "grad_norm": 4.715619087219238,
      "learning_rate": 6.585285384361596e-07,
      "loss": 0.8842,
      "step": 1130
    },
    {
      "epoch": 0.3632,
      "grad_norm": 4.438577651977539,
      "learning_rate": 6.568789178488947e-07,
      "loss": 0.9432,
      "step": 1135
    },
    {
      "epoch": 0.3648,
      "grad_norm": 3.8930490016937256,
      "learning_rate": 6.552292972616298e-07,
      "loss": 0.8113,
      "step": 1140
    },
    {
      "epoch": 0.3664,
      "grad_norm": 4.182096004486084,
      "learning_rate": 6.535796766743649e-07,
      "loss": 0.7999,
      "step": 1145
    },
    {
      "epoch": 0.368,
      "grad_norm": 5.353331089019775,
      "learning_rate": 6.519300560870999e-07,
      "loss": 0.8499,
      "step": 1150
    },
    {
      "epoch": 0.3696,
      "grad_norm": 3.641796588897705,
      "learning_rate": 6.50280435499835e-07,
      "loss": 0.9247,
      "step": 1155
    },
    {
      "epoch": 0.3712,
      "grad_norm": 4.2418646812438965,
      "learning_rate": 6.486308149125701e-07,
      "loss": 1.0406,
      "step": 1160
    },
    {
      "epoch": 0.3728,
      "grad_norm": 2.948838233947754,
      "learning_rate": 6.469811943253052e-07,
      "loss": 0.8411,
      "step": 1165
    },
    {
      "epoch": 0.3744,
      "grad_norm": 7.832685947418213,
      "learning_rate": 6.453315737380403e-07,
      "loss": 0.9339,
      "step": 1170
    },
    {
      "epoch": 0.376,
      "grad_norm": 3.882305145263672,
      "learning_rate": 6.436819531507753e-07,
      "loss": 0.9936,
      "step": 1175
    },
    {
      "epoch": 0.3776,
      "grad_norm": 6.916220664978027,
      "learning_rate": 6.420323325635104e-07,
      "loss": 1.0441,
      "step": 1180
    },
    {
      "epoch": 0.3792,
      "grad_norm": 6.770009517669678,
      "learning_rate": 6.403827119762454e-07,
      "loss": 0.9299,
      "step": 1185
    },
    {
      "epoch": 0.3808,
      "grad_norm": 4.584465980529785,
      "learning_rate": 6.387330913889805e-07,
      "loss": 0.9169,
      "step": 1190
    },
    {
      "epoch": 0.3824,
      "grad_norm": 8.000226974487305,
      "learning_rate": 6.370834708017155e-07,
      "loss": 1.0345,
      "step": 1195
    },
    {
      "epoch": 0.384,
      "grad_norm": 13.314818382263184,
      "learning_rate": 6.354338502144506e-07,
      "loss": 0.919,
      "step": 1200
    },
    {
      "epoch": 0.3856,
      "grad_norm": 3.7661311626434326,
      "learning_rate": 6.337842296271857e-07,
      "loss": 0.9678,
      "step": 1205
    },
    {
      "epoch": 0.3872,
      "grad_norm": 4.133317470550537,
      "learning_rate": 6.321346090399208e-07,
      "loss": 0.9533,
      "step": 1210
    },
    {
      "epoch": 0.3888,
      "grad_norm": 2.6607346534729004,
      "learning_rate": 6.304849884526559e-07,
      "loss": 0.9946,
      "step": 1215
    },
    {
      "epoch": 0.3904,
      "grad_norm": 3.8332831859588623,
      "learning_rate": 6.288353678653909e-07,
      "loss": 0.8293,
      "step": 1220
    },
    {
      "epoch": 0.392,
      "grad_norm": 3.6170003414154053,
      "learning_rate": 6.27185747278126e-07,
      "loss": 0.8952,
      "step": 1225
    },
    {
      "epoch": 0.3936,
      "grad_norm": 5.026386737823486,
      "learning_rate": 6.255361266908611e-07,
      "loss": 0.917,
      "step": 1230
    },
    {
      "epoch": 0.3952,
      "grad_norm": 10.285544395446777,
      "learning_rate": 6.238865061035962e-07,
      "loss": 0.8624,
      "step": 1235
    },
    {
      "epoch": 0.3968,
      "grad_norm": 2.899703025817871,
      "learning_rate": 6.222368855163313e-07,
      "loss": 0.9242,
      "step": 1240
    },
    {
      "epoch": 0.3984,
      "grad_norm": 6.087869167327881,
      "learning_rate": 6.205872649290663e-07,
      "loss": 0.8574,
      "step": 1245
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.513827323913574,
      "learning_rate": 6.189376443418014e-07,
      "loss": 0.9205,
      "step": 1250
    },
    {
      "epoch": 0.4016,
      "grad_norm": 3.0381855964660645,
      "learning_rate": 6.172880237545365e-07,
      "loss": 0.8779,
      "step": 1255
    },
    {
      "epoch": 0.4032,
      "grad_norm": 9.265677452087402,
      "learning_rate": 6.156384031672715e-07,
      "loss": 0.8645,
      "step": 1260
    },
    {
      "epoch": 0.4048,
      "grad_norm": 5.159205436706543,
      "learning_rate": 6.139887825800065e-07,
      "loss": 0.7365,
      "step": 1265
    },
    {
      "epoch": 0.4064,
      "grad_norm": 4.157783031463623,
      "learning_rate": 6.123391619927416e-07,
      "loss": 1.003,
      "step": 1270
    },
    {
      "epoch": 0.408,
      "grad_norm": 4.129422187805176,
      "learning_rate": 6.106895414054767e-07,
      "loss": 1.0333,
      "step": 1275
    },
    {
      "epoch": 0.4096,
      "grad_norm": 5.1480536460876465,
      "learning_rate": 6.090399208182118e-07,
      "loss": 0.9097,
      "step": 1280
    },
    {
      "epoch": 0.4112,
      "grad_norm": 6.21195650100708,
      "learning_rate": 6.073903002309469e-07,
      "loss": 0.9753,
      "step": 1285
    },
    {
      "epoch": 0.4128,
      "grad_norm": 4.375741481781006,
      "learning_rate": 6.057406796436819e-07,
      "loss": 0.8613,
      "step": 1290
    },
    {
      "epoch": 0.4144,
      "grad_norm": 6.381781578063965,
      "learning_rate": 6.04091059056417e-07,
      "loss": 0.8447,
      "step": 1295
    },
    {
      "epoch": 0.416,
      "grad_norm": 6.689861297607422,
      "learning_rate": 6.024414384691521e-07,
      "loss": 0.9417,
      "step": 1300
    },
    {
      "epoch": 0.4176,
      "grad_norm": 7.573152542114258,
      "learning_rate": 6.007918178818872e-07,
      "loss": 0.8559,
      "step": 1305
    },
    {
      "epoch": 0.4192,
      "grad_norm": 3.2965288162231445,
      "learning_rate": 5.991421972946222e-07,
      "loss": 0.8483,
      "step": 1310
    },
    {
      "epoch": 0.4208,
      "grad_norm": 3.5138070583343506,
      "learning_rate": 5.974925767073573e-07,
      "loss": 0.7122,
      "step": 1315
    },
    {
      "epoch": 0.4224,
      "grad_norm": 3.6955363750457764,
      "learning_rate": 5.958429561200924e-07,
      "loss": 0.8294,
      "step": 1320
    },
    {
      "epoch": 0.424,
      "grad_norm": 3.6129462718963623,
      "learning_rate": 5.941933355328275e-07,
      "loss": 1.0534,
      "step": 1325
    },
    {
      "epoch": 0.4256,
      "grad_norm": 4.549454212188721,
      "learning_rate": 5.925437149455626e-07,
      "loss": 0.8865,
      "step": 1330
    },
    {
      "epoch": 0.4272,
      "grad_norm": 5.264537811279297,
      "learning_rate": 5.908940943582975e-07,
      "loss": 0.9219,
      "step": 1335
    },
    {
      "epoch": 0.4288,
      "grad_norm": 5.2592267990112305,
      "learning_rate": 5.892444737710326e-07,
      "loss": 0.8414,
      "step": 1340
    },
    {
      "epoch": 0.4304,
      "grad_norm": 7.621036529541016,
      "learning_rate": 5.875948531837677e-07,
      "loss": 0.8553,
      "step": 1345
    },
    {
      "epoch": 0.432,
      "grad_norm": 4.718931674957275,
      "learning_rate": 5.859452325965028e-07,
      "loss": 1.0172,
      "step": 1350
    },
    {
      "epoch": 0.4336,
      "grad_norm": 3.382890224456787,
      "learning_rate": 5.842956120092379e-07,
      "loss": 0.9208,
      "step": 1355
    },
    {
      "epoch": 0.4352,
      "grad_norm": 5.720285415649414,
      "learning_rate": 5.826459914219729e-07,
      "loss": 0.9376,
      "step": 1360
    },
    {
      "epoch": 0.4368,
      "grad_norm": 6.348506450653076,
      "learning_rate": 5.80996370834708e-07,
      "loss": 0.8526,
      "step": 1365
    },
    {
      "epoch": 0.4384,
      "grad_norm": 3.639775276184082,
      "learning_rate": 5.793467502474431e-07,
      "loss": 0.9067,
      "step": 1370
    },
    {
      "epoch": 0.44,
      "grad_norm": 4.016887187957764,
      "learning_rate": 5.776971296601782e-07,
      "loss": 0.9526,
      "step": 1375
    },
    {
      "epoch": 0.4416,
      "grad_norm": 6.822007656097412,
      "learning_rate": 5.760475090729132e-07,
      "loss": 0.776,
      "step": 1380
    },
    {
      "epoch": 0.4432,
      "grad_norm": 4.3558125495910645,
      "learning_rate": 5.743978884856483e-07,
      "loss": 0.8932,
      "step": 1385
    },
    {
      "epoch": 0.4448,
      "grad_norm": 6.4160475730896,
      "learning_rate": 5.727482678983834e-07,
      "loss": 0.9081,
      "step": 1390
    },
    {
      "epoch": 0.4464,
      "grad_norm": 3.1769394874572754,
      "learning_rate": 5.710986473111185e-07,
      "loss": 0.8688,
      "step": 1395
    },
    {
      "epoch": 0.448,
      "grad_norm": 3.9851887226104736,
      "learning_rate": 5.694490267238536e-07,
      "loss": 0.877,
      "step": 1400
    },
    {
      "epoch": 0.4496,
      "grad_norm": 5.6506571769714355,
      "learning_rate": 5.677994061365886e-07,
      "loss": 0.8325,
      "step": 1405
    },
    {
      "epoch": 0.4512,
      "grad_norm": 3.549743175506592,
      "learning_rate": 5.661497855493236e-07,
      "loss": 0.8987,
      "step": 1410
    },
    {
      "epoch": 0.4528,
      "grad_norm": 3.143094062805176,
      "learning_rate": 5.645001649620587e-07,
      "loss": 0.9429,
      "step": 1415
    },
    {
      "epoch": 0.4544,
      "grad_norm": 8.156094551086426,
      "learning_rate": 5.628505443747938e-07,
      "loss": 0.7903,
      "step": 1420
    },
    {
      "epoch": 0.456,
      "grad_norm": 4.86202335357666,
      "learning_rate": 5.612009237875289e-07,
      "loss": 0.933,
      "step": 1425
    },
    {
      "epoch": 0.4576,
      "grad_norm": 5.636049270629883,
      "learning_rate": 5.595513032002639e-07,
      "loss": 0.921,
      "step": 1430
    },
    {
      "epoch": 0.4592,
      "grad_norm": 3.5446226596832275,
      "learning_rate": 5.57901682612999e-07,
      "loss": 0.8239,
      "step": 1435
    },
    {
      "epoch": 0.4608,
      "grad_norm": 3.516528606414795,
      "learning_rate": 5.562520620257341e-07,
      "loss": 0.894,
      "step": 1440
    },
    {
      "epoch": 0.4624,
      "grad_norm": 3.1388487815856934,
      "learning_rate": 5.546024414384692e-07,
      "loss": 0.9401,
      "step": 1445
    },
    {
      "epoch": 0.464,
      "grad_norm": 3.378370761871338,
      "learning_rate": 5.529528208512042e-07,
      "loss": 0.8786,
      "step": 1450
    },
    {
      "epoch": 0.4656,
      "grad_norm": 4.898928165435791,
      "learning_rate": 5.513032002639393e-07,
      "loss": 0.8457,
      "step": 1455
    },
    {
      "epoch": 0.4672,
      "grad_norm": 8.320155143737793,
      "learning_rate": 5.496535796766744e-07,
      "loss": 0.9857,
      "step": 1460
    },
    {
      "epoch": 0.4688,
      "grad_norm": 12.393474578857422,
      "learning_rate": 5.480039590894095e-07,
      "loss": 1.0179,
      "step": 1465
    },
    {
      "epoch": 0.4704,
      "grad_norm": 3.834761142730713,
      "learning_rate": 5.463543385021446e-07,
      "loss": 0.8355,
      "step": 1470
    },
    {
      "epoch": 0.472,
      "grad_norm": 5.7657694816589355,
      "learning_rate": 5.447047179148796e-07,
      "loss": 0.906,
      "step": 1475
    },
    {
      "epoch": 0.4736,
      "grad_norm": 2.89928936958313,
      "learning_rate": 5.430550973276147e-07,
      "loss": 0.8453,
      "step": 1480
    },
    {
      "epoch": 0.4752,
      "grad_norm": 3.30023455619812,
      "learning_rate": 5.414054767403497e-07,
      "loss": 0.863,
      "step": 1485
    },
    {
      "epoch": 0.4768,
      "grad_norm": 6.904449462890625,
      "learning_rate": 5.397558561530848e-07,
      "loss": 0.8722,
      "step": 1490
    },
    {
      "epoch": 0.4784,
      "grad_norm": 2.936325788497925,
      "learning_rate": 5.381062355658197e-07,
      "loss": 0.8565,
      "step": 1495
    },
    {
      "epoch": 0.48,
      "grad_norm": 6.707699775695801,
      "learning_rate": 5.364566149785548e-07,
      "loss": 0.827,
      "step": 1500
    },
    {
      "epoch": 0.4816,
      "grad_norm": 3.5800673961639404,
      "learning_rate": 5.3480699439129e-07,
      "loss": 0.8582,
      "step": 1505
    },
    {
      "epoch": 0.4832,
      "grad_norm": 5.940330505371094,
      "learning_rate": 5.33157373804025e-07,
      "loss": 0.9983,
      "step": 1510
    },
    {
      "epoch": 0.4848,
      "grad_norm": 4.438694000244141,
      "learning_rate": 5.315077532167602e-07,
      "loss": 0.8307,
      "step": 1515
    },
    {
      "epoch": 0.4864,
      "grad_norm": 6.149857044219971,
      "learning_rate": 5.298581326294951e-07,
      "loss": 0.9128,
      "step": 1520
    },
    {
      "epoch": 0.488,
      "grad_norm": 3.874925136566162,
      "learning_rate": 5.282085120422302e-07,
      "loss": 0.966,
      "step": 1525
    },
    {
      "epoch": 0.4896,
      "grad_norm": 17.836402893066406,
      "learning_rate": 5.265588914549653e-07,
      "loss": 0.8737,
      "step": 1530
    },
    {
      "epoch": 0.4912,
      "grad_norm": 5.488133430480957,
      "learning_rate": 5.249092708677005e-07,
      "loss": 0.9353,
      "step": 1535
    },
    {
      "epoch": 0.4928,
      "grad_norm": 4.590605735778809,
      "learning_rate": 5.232596502804356e-07,
      "loss": 0.8757,
      "step": 1540
    },
    {
      "epoch": 0.4944,
      "grad_norm": 3.0040788650512695,
      "learning_rate": 5.216100296931705e-07,
      "loss": 0.8116,
      "step": 1545
    },
    {
      "epoch": 0.496,
      "grad_norm": 3.2542080879211426,
      "learning_rate": 5.199604091059056e-07,
      "loss": 0.9621,
      "step": 1550
    },
    {
      "epoch": 0.4976,
      "grad_norm": 4.786755561828613,
      "learning_rate": 5.183107885186407e-07,
      "loss": 0.9551,
      "step": 1555
    },
    {
      "epoch": 0.4992,
      "grad_norm": 5.058788299560547,
      "learning_rate": 5.166611679313757e-07,
      "loss": 0.7788,
      "step": 1560
    },
    {
      "epoch": 0.5008,
      "grad_norm": 6.7137131690979,
      "learning_rate": 5.150115473441107e-07,
      "loss": 0.8605,
      "step": 1565
    },
    {
      "epoch": 0.5024,
      "grad_norm": 5.942770004272461,
      "learning_rate": 5.133619267568458e-07,
      "loss": 0.8731,
      "step": 1570
    },
    {
      "epoch": 0.504,
      "grad_norm": 3.7935171127319336,
      "learning_rate": 5.117123061695809e-07,
      "loss": 0.8126,
      "step": 1575
    },
    {
      "epoch": 0.5056,
      "grad_norm": 3.8675737380981445,
      "learning_rate": 5.10062685582316e-07,
      "loss": 0.8163,
      "step": 1580
    },
    {
      "epoch": 0.5072,
      "grad_norm": 3.6353890895843506,
      "learning_rate": 5.084130649950511e-07,
      "loss": 0.8407,
      "step": 1585
    },
    {
      "epoch": 0.5088,
      "grad_norm": 6.919312477111816,
      "learning_rate": 5.067634444077861e-07,
      "loss": 0.8906,
      "step": 1590
    },
    {
      "epoch": 0.5104,
      "grad_norm": 4.731250286102295,
      "learning_rate": 5.051138238205212e-07,
      "loss": 0.7421,
      "step": 1595
    },
    {
      "epoch": 0.512,
      "grad_norm": 3.6495304107666016,
      "learning_rate": 5.034642032332563e-07,
      "loss": 0.8691,
      "step": 1600
    },
    {
      "epoch": 0.5136,
      "grad_norm": 3.6082992553710938,
      "learning_rate": 5.018145826459914e-07,
      "loss": 0.9252,
      "step": 1605
    },
    {
      "epoch": 0.5152,
      "grad_norm": 2.5912933349609375,
      "learning_rate": 5.001649620587265e-07,
      "loss": 0.8538,
      "step": 1610
    },
    {
      "epoch": 0.5168,
      "grad_norm": 7.729884624481201,
      "learning_rate": 4.985153414714615e-07,
      "loss": 0.7578,
      "step": 1615
    },
    {
      "epoch": 0.5184,
      "grad_norm": 4.614051342010498,
      "learning_rate": 4.968657208841966e-07,
      "loss": 0.8363,
      "step": 1620
    },
    {
      "epoch": 0.52,
      "grad_norm": 3.5848758220672607,
      "learning_rate": 4.952161002969316e-07,
      "loss": 0.902,
      "step": 1625
    },
    {
      "epoch": 0.5216,
      "grad_norm": 4.744536399841309,
      "learning_rate": 4.935664797096667e-07,
      "loss": 0.8356,
      "step": 1630
    },
    {
      "epoch": 0.5232,
      "grad_norm": 6.719925880432129,
      "learning_rate": 4.919168591224018e-07,
      "loss": 0.8663,
      "step": 1635
    },
    {
      "epoch": 0.5248,
      "grad_norm": 5.994638442993164,
      "learning_rate": 4.902672385351369e-07,
      "loss": 0.8854,
      "step": 1640
    },
    {
      "epoch": 0.5264,
      "grad_norm": 3.5340418815612793,
      "learning_rate": 4.88617617947872e-07,
      "loss": 0.738,
      "step": 1645
    },
    {
      "epoch": 0.528,
      "grad_norm": 4.414712905883789,
      "learning_rate": 4.86967997360607e-07,
      "loss": 0.8637,
      "step": 1650
    },
    {
      "epoch": 0.5296,
      "grad_norm": 3.8119003772735596,
      "learning_rate": 4.853183767733421e-07,
      "loss": 0.8948,
      "step": 1655
    },
    {
      "epoch": 0.5312,
      "grad_norm": 3.453695058822632,
      "learning_rate": 4.836687561860771e-07,
      "loss": 0.9324,
      "step": 1660
    },
    {
      "epoch": 0.5328,
      "grad_norm": 8.695696830749512,
      "learning_rate": 4.820191355988122e-07,
      "loss": 0.8486,
      "step": 1665
    },
    {
      "epoch": 0.5344,
      "grad_norm": 3.696438789367676,
      "learning_rate": 4.803695150115473e-07,
      "loss": 0.8801,
      "step": 1670
    },
    {
      "epoch": 0.536,
      "grad_norm": 5.598580360412598,
      "learning_rate": 4.787198944242824e-07,
      "loss": 0.9598,
      "step": 1675
    },
    {
      "epoch": 0.5376,
      "grad_norm": 7.447549819946289,
      "learning_rate": 4.770702738370175e-07,
      "loss": 0.7981,
      "step": 1680
    },
    {
      "epoch": 0.5392,
      "grad_norm": 3.8933768272399902,
      "learning_rate": 4.754206532497526e-07,
      "loss": 0.7836,
      "step": 1685
    },
    {
      "epoch": 0.5408,
      "grad_norm": 4.233343124389648,
      "learning_rate": 4.737710326624876e-07,
      "loss": 0.9953,
      "step": 1690
    },
    {
      "epoch": 0.5424,
      "grad_norm": 4.121957302093506,
      "learning_rate": 4.721214120752227e-07,
      "loss": 0.8909,
      "step": 1695
    },
    {
      "epoch": 0.544,
      "grad_norm": 3.326876640319824,
      "learning_rate": 4.704717914879577e-07,
      "loss": 0.8207,
      "step": 1700
    },
    {
      "epoch": 0.5456,
      "grad_norm": 4.2965006828308105,
      "learning_rate": 4.688221709006928e-07,
      "loss": 0.7338,
      "step": 1705
    },
    {
      "epoch": 0.5472,
      "grad_norm": 12.319995880126953,
      "learning_rate": 4.6717255031342787e-07,
      "loss": 0.8716,
      "step": 1710
    },
    {
      "epoch": 0.5488,
      "grad_norm": 3.4306647777557373,
      "learning_rate": 4.6552292972616297e-07,
      "loss": 0.8844,
      "step": 1715
    },
    {
      "epoch": 0.5504,
      "grad_norm": 3.8839616775512695,
      "learning_rate": 4.638733091388981e-07,
      "loss": 0.6966,
      "step": 1720
    },
    {
      "epoch": 0.552,
      "grad_norm": 4.802063941955566,
      "learning_rate": 4.622236885516331e-07,
      "loss": 1.0128,
      "step": 1725
    },
    {
      "epoch": 0.5536,
      "grad_norm": 3.7047386169433594,
      "learning_rate": 4.6057406796436817e-07,
      "loss": 0.7706,
      "step": 1730
    },
    {
      "epoch": 0.5552,
      "grad_norm": 5.304298400878906,
      "learning_rate": 4.589244473771032e-07,
      "loss": 0.7627,
      "step": 1735
    },
    {
      "epoch": 0.5568,
      "grad_norm": 3.211620330810547,
      "learning_rate": 4.572748267898383e-07,
      "loss": 0.8409,
      "step": 1740
    },
    {
      "epoch": 0.5584,
      "grad_norm": 4.873741149902344,
      "learning_rate": 4.5562520620257337e-07,
      "loss": 0.9454,
      "step": 1745
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.801036834716797,
      "learning_rate": 4.5397558561530847e-07,
      "loss": 0.8148,
      "step": 1750
    },
    {
      "epoch": 0.5616,
      "grad_norm": 4.238209247589111,
      "learning_rate": 4.5232596502804357e-07,
      "loss": 0.907,
      "step": 1755
    },
    {
      "epoch": 0.5632,
      "grad_norm": 5.311016082763672,
      "learning_rate": 4.506763444407786e-07,
      "loss": 0.8865,
      "step": 1760
    },
    {
      "epoch": 0.5648,
      "grad_norm": 5.096076011657715,
      "learning_rate": 4.4902672385351366e-07,
      "loss": 0.8967,
      "step": 1765
    },
    {
      "epoch": 0.5664,
      "grad_norm": 3.0391855239868164,
      "learning_rate": 4.473771032662487e-07,
      "loss": 0.8671,
      "step": 1770
    },
    {
      "epoch": 0.568,
      "grad_norm": 3.2100040912628174,
      "learning_rate": 4.457274826789838e-07,
      "loss": 0.8144,
      "step": 1775
    },
    {
      "epoch": 0.5696,
      "grad_norm": 4.312873840332031,
      "learning_rate": 4.4407786209171886e-07,
      "loss": 0.9743,
      "step": 1780
    },
    {
      "epoch": 0.5712,
      "grad_norm": 4.577536582946777,
      "learning_rate": 4.4242824150445396e-07,
      "loss": 0.8988,
      "step": 1785
    },
    {
      "epoch": 0.5728,
      "grad_norm": 6.181716442108154,
      "learning_rate": 4.40778620917189e-07,
      "loss": 0.8709,
      "step": 1790
    },
    {
      "epoch": 0.5744,
      "grad_norm": 3.878676176071167,
      "learning_rate": 4.391290003299241e-07,
      "loss": 0.9212,
      "step": 1795
    },
    {
      "epoch": 0.576,
      "grad_norm": 3.634641170501709,
      "learning_rate": 4.374793797426592e-07,
      "loss": 0.9252,
      "step": 1800
    },
    {
      "epoch": 0.5776,
      "grad_norm": 4.589493274688721,
      "learning_rate": 4.358297591553942e-07,
      "loss": 0.9442,
      "step": 1805
    },
    {
      "epoch": 0.5792,
      "grad_norm": 3.5581719875335693,
      "learning_rate": 4.341801385681293e-07,
      "loss": 0.8068,
      "step": 1810
    },
    {
      "epoch": 0.5808,
      "grad_norm": 10.048519134521484,
      "learning_rate": 4.3253051798086436e-07,
      "loss": 0.9334,
      "step": 1815
    },
    {
      "epoch": 0.5824,
      "grad_norm": 7.136456489562988,
      "learning_rate": 4.3088089739359946e-07,
      "loss": 0.8711,
      "step": 1820
    },
    {
      "epoch": 0.584,
      "grad_norm": 12.951844215393066,
      "learning_rate": 4.292312768063345e-07,
      "loss": 0.8709,
      "step": 1825
    },
    {
      "epoch": 0.5856,
      "grad_norm": 4.377983093261719,
      "learning_rate": 4.275816562190696e-07,
      "loss": 0.8509,
      "step": 1830
    },
    {
      "epoch": 0.5872,
      "grad_norm": 4.836514472961426,
      "learning_rate": 4.259320356318047e-07,
| "loss": 0.882, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 3.0401344299316406, | |
| "learning_rate": 4.242824150445397e-07, | |
| "loss": 0.9351, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.5904, | |
| "grad_norm": 4.854428768157959, | |
| "learning_rate": 4.226327944572748e-07, | |
| "loss": 0.9465, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 3.092222213745117, | |
| "learning_rate": 4.2098317387000985e-07, | |
| "loss": 0.8038, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5936, | |
| "grad_norm": 5.498143196105957, | |
| "learning_rate": 4.1933355328274495e-07, | |
| "loss": 0.769, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 2.4063949584960938, | |
| "learning_rate": 4.1768393269548e-07, | |
| "loss": 0.8057, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5968, | |
| "grad_norm": 5.123895168304443, | |
| "learning_rate": 4.160343121082151e-07, | |
| "loss": 0.9554, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.5984, | |
| "grad_norm": 7.29245662689209, | |
| "learning_rate": 4.143846915209502e-07, | |
| "loss": 0.9079, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 2.9312267303466797, | |
| "learning_rate": 4.1273507093368525e-07, | |
| "loss": 0.8765, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 3.0390522480010986, | |
| "learning_rate": 4.110854503464203e-07, | |
| "loss": 0.9539, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6032, | |
| "grad_norm": 3.8350090980529785, | |
| "learning_rate": 4.0943582975915535e-07, | |
| "loss": 0.8397, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.6048, | |
| "grad_norm": 3.9119083881378174, | |
| "learning_rate": 4.0778620917189045e-07, | |
| "loss": 0.8769, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.6064, | |
| "grad_norm": 3.361199378967285, | |
| "learning_rate": 4.061365885846255e-07, | |
| "loss": 0.9028, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 4.87637186050415, | |
| "learning_rate": 4.044869679973606e-07, | |
| "loss": 0.8022, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6096, | |
| "grad_norm": 4.546545505523682, | |
| "learning_rate": 4.028373474100957e-07, | |
| "loss": 0.7343, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.6112, | |
| "grad_norm": 2.975339651107788, | |
| "learning_rate": 4.0118772682283075e-07, | |
| "loss": 0.8335, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.6128, | |
| "grad_norm": 3.8709919452667236, | |
| "learning_rate": 3.995381062355658e-07, | |
| "loss": 0.7803, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 2.690919876098633, | |
| "learning_rate": 3.9788848564830084e-07, | |
| "loss": 1.0247, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 4.163801193237305, | |
| "learning_rate": 3.9623886506103594e-07, | |
| "loss": 0.8764, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.6176, | |
| "grad_norm": 5.445613384246826, | |
| "learning_rate": 3.94589244473771e-07, | |
| "loss": 0.9047, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.6192, | |
| "grad_norm": 3.3369109630584717, | |
| "learning_rate": 3.929396238865061e-07, | |
| "loss": 0.9644, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 2.8063957691192627, | |
| "learning_rate": 3.912900032992412e-07, | |
| "loss": 0.8411, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.6224, | |
| "grad_norm": 3.369598865509033, | |
| "learning_rate": 3.8964038271197624e-07, | |
| "loss": 0.8904, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 11.861967086791992, | |
| "learning_rate": 3.8799076212471134e-07, | |
| "loss": 0.8503, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6256, | |
| "grad_norm": 3.746105670928955, | |
| "learning_rate": 3.8634114153744634e-07, | |
| "loss": 0.8549, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 4.717544078826904, | |
| "learning_rate": 3.8469152095018144e-07, | |
| "loss": 0.7668, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6288, | |
| "grad_norm": 3.0035829544067383, | |
| "learning_rate": 3.830419003629165e-07, | |
| "loss": 0.7519, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.6304, | |
| "grad_norm": 4.065003395080566, | |
| "learning_rate": 3.813922797756516e-07, | |
| "loss": 0.8116, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 5.251111030578613, | |
| "learning_rate": 3.7974265918838663e-07, | |
| "loss": 0.8879, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 5.612459659576416, | |
| "learning_rate": 3.7809303860112173e-07, | |
| "loss": 0.7931, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.6352, | |
| "grad_norm": 4.041755199432373, | |
| "learning_rate": 3.7644341801385684e-07, | |
| "loss": 0.7963, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.6368, | |
| "grad_norm": 9.98974609375, | |
| "learning_rate": 3.7479379742659183e-07, | |
| "loss": 0.8962, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.6384, | |
| "grad_norm": 3.949065685272217, | |
| "learning_rate": 3.7314417683932693e-07, | |
| "loss": 0.9132, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 4.0974297523498535, | |
| "learning_rate": 3.71494556252062e-07, | |
| "loss": 0.9125, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6416, | |
| "grad_norm": 3.70499324798584, | |
| "learning_rate": 3.698449356647971e-07, | |
| "loss": 0.7575, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.6432, | |
| "grad_norm": 4.345754623413086, | |
| "learning_rate": 3.6819531507753213e-07, | |
| "loss": 0.7473, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.6448, | |
| "grad_norm": 2.8242263793945312, | |
| "learning_rate": 3.6654569449026723e-07, | |
| "loss": 0.959, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 3.6714463233947754, | |
| "learning_rate": 3.6489607390300233e-07, | |
| "loss": 0.7814, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 4.4022908210754395, | |
| "learning_rate": 3.632464533157374e-07, | |
| "loss": 0.8977, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.6496, | |
| "grad_norm": 3.5451886653900146, | |
| "learning_rate": 3.6159683272847243e-07, | |
| "loss": 0.8835, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6512, | |
| "grad_norm": 5.699954509735107, | |
| "learning_rate": 3.599472121412075e-07, | |
| "loss": 0.8339, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 3.2886204719543457, | |
| "learning_rate": 3.582975915539426e-07, | |
| "loss": 0.85, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.6544, | |
| "grad_norm": 2.8363375663757324, | |
| "learning_rate": 3.566479709666776e-07, | |
| "loss": 0.8195, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 3.734877824783325, | |
| "learning_rate": 3.549983503794127e-07, | |
| "loss": 0.8803, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6576, | |
| "grad_norm": 2.6836330890655518, | |
| "learning_rate": 3.533487297921478e-07, | |
| "loss": 0.8574, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 3.9296648502349854, | |
| "learning_rate": 3.516991092048829e-07, | |
| "loss": 0.7938, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6608, | |
| "grad_norm": 2.973696231842041, | |
| "learning_rate": 3.500494886176179e-07, | |
| "loss": 0.8593, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.6624, | |
| "grad_norm": 4.675530433654785, | |
| "learning_rate": 3.4839986803035297e-07, | |
| "loss": 0.8456, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 4.891861915588379, | |
| "learning_rate": 3.4675024744308807e-07, | |
| "loss": 0.9814, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 3.9921982288360596, | |
| "learning_rate": 3.451006268558231e-07, | |
| "loss": 0.8416, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.6672, | |
| "grad_norm": 3.1958041191101074, | |
| "learning_rate": 3.434510062685582e-07, | |
| "loss": 0.905, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.6688, | |
| "grad_norm": 4.344924449920654, | |
| "learning_rate": 3.418013856812933e-07, | |
| "loss": 0.8202, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.6704, | |
| "grad_norm": 7.5191569328308105, | |
| "learning_rate": 3.4015176509402837e-07, | |
| "loss": 0.9426, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 4.440326690673828, | |
| "learning_rate": 3.3850214450676347e-07, | |
| "loss": 0.8205, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6736, | |
| "grad_norm": 6.4901123046875, | |
| "learning_rate": 3.3685252391949846e-07, | |
| "loss": 0.7936, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.6752, | |
| "grad_norm": 3.9426374435424805, | |
| "learning_rate": 3.3520290333223357e-07, | |
| "loss": 0.793, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6768, | |
| "grad_norm": 5.018584728240967, | |
| "learning_rate": 3.335532827449686e-07, | |
| "loss": 0.7799, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 3.7835421562194824, | |
| "learning_rate": 3.319036621577037e-07, | |
| "loss": 0.8696, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 6.0190839767456055, | |
| "learning_rate": 3.302540415704388e-07, | |
| "loss": 0.9139, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.6816, | |
| "grad_norm": 5.751317977905273, | |
| "learning_rate": 3.2860442098317386e-07, | |
| "loss": 0.8522, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6832, | |
| "grad_norm": 6.684688091278076, | |
| "learning_rate": 3.2695480039590896e-07, | |
| "loss": 0.8061, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 2.783705234527588, | |
| "learning_rate": 3.2530517980864396e-07, | |
| "loss": 0.7725, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.6864, | |
| "grad_norm": 4.636482238769531, | |
| "learning_rate": 3.2365555922137906e-07, | |
| "loss": 0.8479, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 5.260950565338135, | |
| "learning_rate": 3.220059386341141e-07, | |
| "loss": 1.0717, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6896, | |
| "grad_norm": 5.191953659057617, | |
| "learning_rate": 3.203563180468492e-07, | |
| "loss": 0.8714, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 7.279730796813965, | |
| "learning_rate": 3.187066974595843e-07, | |
| "loss": 0.8246, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6928, | |
| "grad_norm": 2.966627359390259, | |
| "learning_rate": 3.1705707687231936e-07, | |
| "loss": 0.9053, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.6944, | |
| "grad_norm": 8.789515495300293, | |
| "learning_rate": 3.1540745628505446e-07, | |
| "loss": 0.7851, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 2.929105520248413, | |
| "learning_rate": 3.137578356977895e-07, | |
| "loss": 0.956, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 5.6356096267700195, | |
| "learning_rate": 3.1210821511052456e-07, | |
| "loss": 0.7946, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.6992, | |
| "grad_norm": 3.3033862113952637, | |
| "learning_rate": 3.104585945232596e-07, | |
| "loss": 0.884, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.7008, | |
| "grad_norm": 5.996482849121094, | |
| "learning_rate": 3.088089739359947e-07, | |
| "loss": 0.8342, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.7024, | |
| "grad_norm": 7.644280910491943, | |
| "learning_rate": 3.0715935334872975e-07, | |
| "loss": 0.8575, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 5.780369281768799, | |
| "learning_rate": 3.0550973276146485e-07, | |
| "loss": 0.84, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7056, | |
| "grad_norm": 3.7677314281463623, | |
| "learning_rate": 3.0386011217419995e-07, | |
| "loss": 0.8764, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.7072, | |
| "grad_norm": 4.153870105743408, | |
| "learning_rate": 3.02210491586935e-07, | |
| "loss": 0.8447, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.7088, | |
| "grad_norm": 6.395594120025635, | |
| "learning_rate": 3.0056087099967005e-07, | |
| "loss": 0.8499, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 3.4210963249206543, | |
| "learning_rate": 2.989112504124051e-07, | |
| "loss": 0.7547, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 2.710740327835083, | |
| "learning_rate": 2.972616298251402e-07, | |
| "loss": 0.8286, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.7136, | |
| "grad_norm": 5.014111042022705, | |
| "learning_rate": 2.9561200923787525e-07, | |
| "loss": 0.8706, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.7152, | |
| "grad_norm": 3.8330109119415283, | |
| "learning_rate": 2.9396238865061035e-07, | |
| "loss": 0.8221, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 5.695978164672852, | |
| "learning_rate": 2.9231276806334545e-07, | |
| "loss": 0.8412, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.7184, | |
| "grad_norm": 5.974388599395752, | |
| "learning_rate": 2.906631474760805e-07, | |
| "loss": 0.8235, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 2.9334166049957275, | |
| "learning_rate": 2.890135268888156e-07, | |
| "loss": 0.8965, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7216, | |
| "grad_norm": 8.407828330993652, | |
| "learning_rate": 2.873639063015506e-07, | |
| "loss": 0.8991, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 4.443752765655518, | |
| "learning_rate": 2.857142857142857e-07, | |
| "loss": 0.7565, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.7248, | |
| "grad_norm": 6.351187229156494, | |
| "learning_rate": 2.8406466512702074e-07, | |
| "loss": 0.8202, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.7264, | |
| "grad_norm": 4.715820789337158, | |
| "learning_rate": 2.8241504453975584e-07, | |
| "loss": 0.877, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 3.1347246170043945, | |
| "learning_rate": 2.8076542395249094e-07, | |
| "loss": 0.8227, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 3.8322551250457764, | |
| "learning_rate": 2.79115803365226e-07, | |
| "loss": 0.7839, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7312, | |
| "grad_norm": 4.289877414703369, | |
| "learning_rate": 2.774661827779611e-07, | |
| "loss": 0.7952, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.7328, | |
| "grad_norm": 3.775768995285034, | |
| "learning_rate": 2.758165621906961e-07, | |
| "loss": 0.7924, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.7344, | |
| "grad_norm": 4.233770370483398, | |
| "learning_rate": 2.741669416034312e-07, | |
| "loss": 0.8008, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 5.131399154663086, | |
| "learning_rate": 2.7251732101616624e-07, | |
| "loss": 0.8103, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7376, | |
| "grad_norm": 7.184566497802734, | |
| "learning_rate": 2.7086770042890134e-07, | |
| "loss": 0.9024, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.7392, | |
| "grad_norm": 3.6044952869415283, | |
| "learning_rate": 2.6921807984163644e-07, | |
| "loss": 0.8925, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7408, | |
| "grad_norm": 8.124959945678711, | |
| "learning_rate": 2.675684592543715e-07, | |
| "loss": 0.8227, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 5.050447463989258, | |
| "learning_rate": 2.659188386671066e-07, | |
| "loss": 0.87, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 3.2532646656036377, | |
| "learning_rate": 2.6426921807984164e-07, | |
| "loss": 0.9024, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.7456, | |
| "grad_norm": 7.244692325592041, | |
| "learning_rate": 2.626195974925767e-07, | |
| "loss": 0.7402, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.7472, | |
| "grad_norm": 5.4176435470581055, | |
| "learning_rate": 2.6096997690531173e-07, | |
| "loss": 0.9476, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 13.94157886505127, | |
| "learning_rate": 2.5932035631804683e-07, | |
| "loss": 0.7621, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.7504, | |
| "grad_norm": 2.509117603302002, | |
| "learning_rate": 2.5767073573078193e-07, | |
| "loss": 0.7259, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 3.073138952255249, | |
| "learning_rate": 2.56021115143517e-07, | |
| "loss": 0.8224, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7536, | |
| "grad_norm": 3.9155077934265137, | |
| "learning_rate": 2.543714945562521e-07, | |
| "loss": 0.8596, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 6.405920028686523, | |
| "learning_rate": 2.5272187396898713e-07, | |
| "loss": 0.8152, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7568, | |
| "grad_norm": 3.8203928470611572, | |
| "learning_rate": 2.510722533817222e-07, | |
| "loss": 0.8907, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.7584, | |
| "grad_norm": 3.9368674755096436, | |
| "learning_rate": 2.494226327944573e-07, | |
| "loss": 0.8804, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 4.452835559844971, | |
| "learning_rate": 2.4777301220719233e-07, | |
| "loss": 0.8304, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 4.987030982971191, | |
| "learning_rate": 2.461233916199274e-07, | |
| "loss": 0.9112, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.7632, | |
| "grad_norm": 7.84393310546875, | |
| "learning_rate": 2.444737710326625e-07, | |
| "loss": 0.8584, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.7648, | |
| "grad_norm": 3.063011646270752, | |
| "learning_rate": 2.428241504453976e-07, | |
| "loss": 0.8648, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.7664, | |
| "grad_norm": 5.494943618774414, | |
| "learning_rate": 2.411745298581326e-07, | |
| "loss": 0.8633, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 3.209425449371338, | |
| "learning_rate": 2.3952490927086767e-07, | |
| "loss": 0.7618, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7696, | |
| "grad_norm": 3.163612127304077, | |
| "learning_rate": 2.3787528868360277e-07, | |
| "loss": 0.7093, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 0.7712, | |
| "grad_norm": 4.022956848144531, | |
| "learning_rate": 2.3622566809633785e-07, | |
| "loss": 0.7705, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.7728, | |
| "grad_norm": 3.748598575592041, | |
| "learning_rate": 2.345760475090729e-07, | |
| "loss": 0.942, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 3.5295920372009277, | |
| "learning_rate": 2.3292642692180797e-07, | |
| "loss": 0.8094, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 5.767539978027344, | |
| "learning_rate": 2.3127680633454305e-07, | |
| "loss": 0.8425, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.7776, | |
| "grad_norm": 3.4246678352355957, | |
| "learning_rate": 2.2962718574727812e-07, | |
| "loss": 0.7777, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.7792, | |
| "grad_norm": 3.2887349128723145, | |
| "learning_rate": 2.2797756516001317e-07, | |
| "loss": 0.8842, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 4.332666397094727, | |
| "learning_rate": 2.2632794457274827e-07, | |
| "loss": 0.8946, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.7824, | |
| "grad_norm": 3.6099178791046143, | |
| "learning_rate": 2.2467832398548334e-07, | |
| "loss": 0.929, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 2.9944612979888916, | |
| "learning_rate": 2.230287033982184e-07, | |
| "loss": 0.9407, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7856, | |
| "grad_norm": 4.5248613357543945, | |
| "learning_rate": 2.2137908281095347e-07, | |
| "loss": 0.9392, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 2.7737669944763184, | |
| "learning_rate": 2.1972946222368854e-07, | |
| "loss": 0.627, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7888, | |
| "grad_norm": 2.95540452003479, | |
| "learning_rate": 2.1807984163642361e-07, | |
| "loss": 0.7346, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 0.7904, | |
| "grad_norm": 8.727768898010254, | |
| "learning_rate": 2.1643022104915866e-07, | |
| "loss": 0.9009, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 4.173437118530273, | |
| "learning_rate": 2.1478060046189376e-07, | |
| "loss": 0.8968, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 3.4933767318725586, | |
| "learning_rate": 2.1313097987462884e-07, | |
| "loss": 0.868, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7952, | |
| "grad_norm": 3.33754301071167, | |
| "learning_rate": 2.114813592873639e-07, | |
| "loss": 0.8185, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 0.7968, | |
| "grad_norm": 7.453437328338623, | |
| "learning_rate": 2.0983173870009896e-07, | |
| "loss": 0.8654, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.7984, | |
| "grad_norm": 9.287111282348633, | |
| "learning_rate": 2.0818211811283404e-07, | |
| "loss": 0.7744, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 4.195357322692871, | |
| "learning_rate": 2.065324975255691e-07, | |
| "loss": 0.8295, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8016, | |
| "grad_norm": 4.878857612609863, | |
| "learning_rate": 2.0488287693830418e-07, | |
| "loss": 0.7917, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.8032, | |
| "grad_norm": 3.239182710647583, | |
| "learning_rate": 2.0323325635103923e-07, | |
| "loss": 0.8339, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8048, | |
| "grad_norm": 4.196946144104004, | |
| "learning_rate": 2.0158363576377433e-07, | |
| "loss": 0.9609, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 2.7448809146881104, | |
| "learning_rate": 1.999340151765094e-07, | |
| "loss": 0.8516, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 4.893881320953369, | |
| "learning_rate": 1.9828439458924446e-07, | |
| "loss": 0.7908, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.8096, | |
| "grad_norm": 3.318279266357422, | |
| "learning_rate": 1.9663477400197953e-07, | |
| "loss": 0.7966, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.8112, | |
| "grad_norm": 2.901827573776245, | |
| "learning_rate": 1.949851534147146e-07, | |
| "loss": 0.7831, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 5.1762847900390625, | |
| "learning_rate": 1.9333553282744968e-07, | |
| "loss": 0.9897, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.8144, | |
| "grad_norm": 6.721929550170898, | |
| "learning_rate": 1.9168591224018473e-07, | |
| "loss": 0.8774, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 3.2159762382507324, | |
| "learning_rate": 1.9003629165291983e-07, | |
| "loss": 0.8543, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8176, | |
| "grad_norm": 2.2698450088500977, | |
| "learning_rate": 1.883866710656549e-07, | |
| "loss": 0.7665, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 6.745720386505127, | |
| "learning_rate": 1.8673705047838998e-07, | |
| "loss": 0.8497, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8208, | |
| "grad_norm": 7.653261661529541, | |
| "learning_rate": 1.8508742989112503e-07, | |
| "loss": 0.8156, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 0.8224, | |
| "grad_norm": 5.070962905883789, | |
| "learning_rate": 1.834378093038601e-07, | |
| "loss": 0.8599, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 3.101536750793457, | |
| "learning_rate": 1.8178818871659517e-07, | |
| "loss": 0.9103, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 5.420032024383545, | |
| "learning_rate": 1.8013856812933025e-07, | |
| "loss": 0.8812, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.8272, | |
| "grad_norm": 6.0531697273254395, | |
| "learning_rate": 1.7848894754206532e-07, | |
| "loss": 0.9167, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 0.8288, | |
| "grad_norm": 7.951639175415039, | |
| "learning_rate": 1.768393269548004e-07, | |
| "loss": 0.958, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.8304, | |
| "grad_norm": 3.8448524475097656, | |
| "learning_rate": 1.7518970636753547e-07, | |
| "loss": 0.9107, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 6.432617664337158, | |
| "learning_rate": 1.7354008578027052e-07, | |
| "loss": 0.9313, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8336, | |
| "grad_norm": 6.240530967712402, | |
| "learning_rate": 1.718904651930056e-07, | |
| "loss": 0.8152, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 0.8352, | |
| "grad_norm": 3.8396613597869873, | |
| "learning_rate": 1.7024084460574067e-07, | |
| "loss": 0.8105, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.8368, | |
| "grad_norm": 5.357729434967041, | |
| "learning_rate": 1.6859122401847574e-07, | |
| "loss": 0.8994, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 3.711209535598755, | |
| "learning_rate": 1.669416034312108e-07, | |
| "loss": 0.8206, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 8.217768669128418, | |
| "learning_rate": 1.652919828439459e-07, | |
| "loss": 0.8569, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.8416, | |
| "grad_norm": 3.3683290481567383, | |
| "learning_rate": 1.6364236225668097e-07, | |
| "loss": 0.8385, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8432, | |
| "grad_norm": 5.228672981262207, | |
| "learning_rate": 1.6199274166941604e-07, | |
| "loss": 0.9177, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 4.824789524078369, | |
| "learning_rate": 1.603431210821511e-07, | |
| "loss": 0.8195, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.8464, | |
| "grad_norm": 3.6725375652313232, | |
| "learning_rate": 1.5869350049488616e-07, | |
| "loss": 0.8017, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 3.130878210067749, | |
| "learning_rate": 1.5704387990762124e-07, | |
| "loss": 0.8088, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8496, | |
| "grad_norm": 4.449658393859863, | |
| "learning_rate": 1.553942593203563e-07, | |
| "loss": 0.7835, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 4.97245454788208, | |
| "learning_rate": 1.537446387330914e-07, | |
| "loss": 0.837, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8528, | |
| "grad_norm": 16.94793128967285, | |
| "learning_rate": 1.5209501814582646e-07, | |
| "loss": 0.8113, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 0.8544, | |
| "grad_norm": 4.743756294250488, | |
| "learning_rate": 1.5044539755856154e-07, | |
| "loss": 0.8042, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 10.094191551208496, | |
| "learning_rate": 1.4879577697129658e-07, | |
| "loss": 0.8571, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 5.925148010253906, | |
| "learning_rate": 1.4714615638403166e-07, | |
| "loss": 0.8749, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.8592, | |
| "grad_norm": 3.691056251525879, | |
| "learning_rate": 1.4549653579676673e-07, | |
| "loss": 0.8824, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 0.8608, | |
| "grad_norm": 3.58223295211792, | |
| "learning_rate": 1.438469152095018e-07, | |
| "loss": 0.8761, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.8624, | |
| "grad_norm": 2.6448755264282227, | |
| "learning_rate": 1.4219729462223686e-07, | |
| "loss": 0.8868, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 3.1470277309417725, | |
| "learning_rate": 1.4054767403497196e-07, | |
| "loss": 0.738, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8656, | |
| "grad_norm": 2.670072317123413, | |
| "learning_rate": 1.3889805344770703e-07, | |
| "loss": 0.834, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 0.8672, | |
| "grad_norm": 3.6198477745056152, | |
| "learning_rate": 1.372484328604421e-07, | |
| "loss": 0.866, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.8688, | |
| "grad_norm": 3.4495441913604736, | |
| "learning_rate": 1.3559881227317715e-07, | |
| "loss": 0.8393, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 4.266736030578613, | |
| "learning_rate": 1.3394919168591223e-07, | |
| "loss": 0.6669, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 7.86549711227417, | |
| "learning_rate": 1.322995710986473e-07, | |
| "loss": 0.8842, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.8736, | |
| "grad_norm": 9.990023612976074, | |
| "learning_rate": 1.3064995051138238e-07, | |
| "loss": 0.8435, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.8752, | |
| "grad_norm": 5.904709815979004, | |
| "learning_rate": 1.2900032992411745e-07, | |
| "loss": 0.8569, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 3.9784185886383057, | |
| "learning_rate": 1.2735070933685253e-07, | |
| "loss": 0.8613, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.8784, | |
| "grad_norm": 3.423604965209961, | |
| "learning_rate": 1.257010887495876e-07, | |
| "loss": 0.8349, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 7.008749961853027, | |
| "learning_rate": 1.2405146816232267e-07, | |
| "loss": 0.739, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8816, | |
| "grad_norm": 4.69740104675293, | |
| "learning_rate": 1.2240184757505772e-07, | |
| "loss": 0.8429, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 11.424933433532715, | |
| "learning_rate": 1.207522269877928e-07, | |
| "loss": 0.8223, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.8848, | |
| "grad_norm": 9.028188705444336, | |
| "learning_rate": 1.1910260640052787e-07, | |
| "loss": 0.8795, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 0.8864, | |
| "grad_norm": 4.878159999847412, | |
| "learning_rate": 1.1745298581326295e-07, | |
| "loss": 0.8503, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 4.5786237716674805, | |
| "learning_rate": 1.1580336522599801e-07, | |
| "loss": 0.7894, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 5.562559127807617, | |
| "learning_rate": 1.141537446387331e-07, | |
| "loss": 0.9339, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.8912, | |
| "grad_norm": 5.035255432128906, | |
| "learning_rate": 1.1250412405146816e-07, | |
| "loss": 0.9041, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 0.8928, | |
| "grad_norm": 3.3735504150390625, | |
| "learning_rate": 1.1085450346420323e-07, | |
| "loss": 0.8033, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.8944, | |
| "grad_norm": 3.345130443572998, | |
| "learning_rate": 1.0920488287693829e-07, | |
| "loss": 0.7649, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 2.9101784229278564, | |
| "learning_rate": 1.0755526228967338e-07, | |
| "loss": 0.7702, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8976, | |
| "grad_norm": 8.602867126464844, | |
| "learning_rate": 1.0590564170240844e-07, | |
| "loss": 0.8769, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 0.8992, | |
| "grad_norm": 3.7691822052001953, | |
| "learning_rate": 1.042560211151435e-07, | |
| "loss": 0.7437, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9008, | |
| "grad_norm": 3.548344612121582, | |
| "learning_rate": 1.0260640052787859e-07, | |
| "loss": 0.7927, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 6.130397319793701, | |
| "learning_rate": 1.0095677994061365e-07, | |
| "loss": 0.8964, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 6.0678935050964355, | |
| "learning_rate": 9.930715935334873e-08, | |
| "loss": 0.8846, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.9056, | |
| "grad_norm": 3.202853202819824, | |
| "learning_rate": 9.765753876608379e-08, | |
| "loss": 0.7872, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9072, | |
| "grad_norm": 7.076948165893555, | |
| "learning_rate": 9.600791817881887e-08, | |
| "loss": 0.841, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 4.527687072753906, | |
| "learning_rate": 9.435829759155394e-08, | |
| "loss": 0.8376, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.9104, | |
| "grad_norm": 3.8655998706817627, | |
| "learning_rate": 9.270867700428901e-08, | |
| "loss": 0.9163, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 4.189991474151611, | |
| "learning_rate": 9.105905641702407e-08, | |
| "loss": 0.9171, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9136, | |
| "grad_norm": 3.6782002449035645, | |
| "learning_rate": 8.940943582975916e-08, | |
| "loss": 0.8107, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 3.8793959617614746, | |
| "learning_rate": 8.775981524249422e-08, | |
| "loss": 0.8324, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9168, | |
| "grad_norm": 6.360919952392578, | |
| "learning_rate": 8.61101946552293e-08, | |
| "loss": 0.901, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 0.9184, | |
| "grad_norm": 4.218044757843018, | |
| "learning_rate": 8.446057406796437e-08, | |
| "loss": 0.7871, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 6.177008628845215, | |
| "learning_rate": 8.281095348069944e-08, | |
| "loss": 0.841, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 2.739051103591919, | |
| "learning_rate": 8.11613328934345e-08, | |
| "loss": 0.7872, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9232, | |
| "grad_norm": 8.707544326782227, | |
| "learning_rate": 7.951171230616957e-08, | |
| "loss": 0.9274, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 0.9248, | |
| "grad_norm": 4.484316825866699, | |
| "learning_rate": 7.786209171890465e-08, | |
| "loss": 0.9121, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.9264, | |
| "grad_norm": 3.211519479751587, | |
| "learning_rate": 7.621247113163972e-08, | |
| "loss": 0.8737, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 5.020310878753662, | |
| "learning_rate": 7.456285054437479e-08, | |
| "loss": 0.8358, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9296, | |
| "grad_norm": 5.312314510345459, | |
| "learning_rate": 7.291322995710985e-08, | |
| "loss": 0.831, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 0.9312, | |
| "grad_norm": 2.917203903198242, | |
| "learning_rate": 7.126360936984494e-08, | |
| "loss": 0.8756, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.9328, | |
| "grad_norm": 3.924370288848877, | |
| "learning_rate": 6.961398878258e-08, | |
| "loss": 0.7825, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 3.571991205215454, | |
| "learning_rate": 6.796436819531507e-08, | |
| "loss": 0.807, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 5.816591739654541, | |
| "learning_rate": 6.631474760805014e-08, | |
| "loss": 0.8491, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.9376, | |
| "grad_norm": 2.8010520935058594, | |
| "learning_rate": 6.466512702078522e-08, | |
| "loss": 0.9383, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.9392, | |
| "grad_norm": 4.16404914855957, | |
| "learning_rate": 6.301550643352028e-08, | |
| "loss": 0.9048, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 3.1094634532928467, | |
| "learning_rate": 6.136588584625536e-08, | |
| "loss": 0.8829, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.9424, | |
| "grad_norm": 6.206966400146484, | |
| "learning_rate": 5.971626525899043e-08, | |
| "loss": 0.8568, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 3.327371120452881, | |
| "learning_rate": 5.80666446717255e-08, | |
| "loss": 0.8895, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9456, | |
| "grad_norm": 3.068650722503662, | |
| "learning_rate": 5.641702408446057e-08, | |
| "loss": 0.8451, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 18.251916885375977, | |
| "learning_rate": 5.4767403497195644e-08, | |
| "loss": 0.8896, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9488, | |
| "grad_norm": 6.762292385101318, | |
| "learning_rate": 5.311778290993071e-08, | |
| "loss": 0.775, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 0.9504, | |
| "grad_norm": 3.4393362998962402, | |
| "learning_rate": 5.1468162322665786e-08, | |
| "loss": 0.8701, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 3.25495982170105, | |
| "learning_rate": 4.9818541735400854e-08, | |
| "loss": 0.9626, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 3.750603437423706, | |
| "learning_rate": 4.816892114813593e-08, | |
| "loss": 0.8311, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.9552, | |
| "grad_norm": 8.03615951538086, | |
| "learning_rate": 4.6519300560871e-08, | |
| "loss": 0.8256, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 0.9568, | |
| "grad_norm": 3.2003724575042725, | |
| "learning_rate": 4.486967997360607e-08, | |
| "loss": 0.8626, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.9584, | |
| "grad_norm": 4.359886169433594, | |
| "learning_rate": 4.3220059386341145e-08, | |
| "loss": 0.8756, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 6.897675514221191, | |
| "learning_rate": 4.1570438799076207e-08, | |
| "loss": 0.8767, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9616, | |
| "grad_norm": 3.0032472610473633, | |
| "learning_rate": 3.992081821181128e-08, | |
| "loss": 0.9283, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 0.9632, | |
| "grad_norm": 4.568953037261963, | |
| "learning_rate": 3.827119762454635e-08, | |
| "loss": 0.6707, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.9648, | |
| "grad_norm": 6.175788879394531, | |
| "learning_rate": 3.6621577037281423e-08, | |
| "loss": 0.7988, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 0.9664, | |
| "grad_norm": 6.2108354568481445, | |
| "learning_rate": 3.497195645001649e-08, | |
| "loss": 0.8649, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 5.617148399353027, | |
| "learning_rate": 3.3322335862751566e-08, | |
| "loss": 0.8005, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.9696, | |
| "grad_norm": 2.560255765914917, | |
| "learning_rate": 3.1672715275486634e-08, | |
| "loss": 0.8828, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.9712, | |
| "grad_norm": 3.895402669906616, | |
| "learning_rate": 3.002309468822171e-08, | |
| "loss": 0.9119, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "grad_norm": 3.2274169921875, | |
| "learning_rate": 2.837347410095678e-08, | |
| "loss": 0.9227, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.9744, | |
| "grad_norm": 3.9759535789489746, | |
| "learning_rate": 2.672385351369185e-08, | |
| "loss": 0.7694, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 5.453105926513672, | |
| "learning_rate": 2.507423292642692e-08, | |
| "loss": 0.8129, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9776, | |
| "grad_norm": 3.8954710960388184, | |
| "learning_rate": 2.342461233916199e-08, | |
| "loss": 0.8646, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 0.9792, | |
| "grad_norm": 5.158627033233643, | |
| "learning_rate": 2.177499175189706e-08, | |
| "loss": 0.873, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.9808, | |
| "grad_norm": 3.405482769012451, | |
| "learning_rate": 2.0125371164632132e-08, | |
| "loss": 0.6943, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 0.9824, | |
| "grad_norm": 4.411757469177246, | |
| "learning_rate": 1.8475750577367203e-08, | |
| "loss": 0.8178, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 3.8500680923461914, | |
| "learning_rate": 1.6826129990102277e-08, | |
| "loss": 0.8075, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.9856, | |
| "grad_norm": 9.081463813781738, | |
| "learning_rate": 1.517650940283735e-08, | |
| "loss": 0.8583, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.9872, | |
| "grad_norm": 3.0819270610809326, | |
| "learning_rate": 1.3526888815572416e-08, | |
| "loss": 0.7818, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 0.9888, | |
| "grad_norm": 12.406457901000977, | |
| "learning_rate": 1.187726822830749e-08, | |
| "loss": 0.9238, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.9904, | |
| "grad_norm": 4.571506977081299, | |
| "learning_rate": 1.022764764104256e-08, | |
| "loss": 0.788, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 2.9352900981903076, | |
| "learning_rate": 8.578027053777632e-09, | |
| "loss": 0.9279, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9936, | |
| "grad_norm": 4.189651966094971, | |
| "learning_rate": 6.928406466512702e-09, | |
| "loss": 0.8379, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 0.9952, | |
| "grad_norm": 3.8705317974090576, | |
| "learning_rate": 5.278785879247773e-09, | |
| "loss": 0.778, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.9968, | |
| "grad_norm": 2.701988458633423, | |
| "learning_rate": 3.629165291982844e-09, | |
| "loss": 0.9363, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 0.9984, | |
| "grad_norm": 3.2893664836883545, | |
| "learning_rate": 1.9795447047179146e-09, | |
| "loss": 0.7902, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 4.953529357910156, | |
| "learning_rate": 3.299241174529858e-10, | |
| "loss": 0.7962, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 3125, | |
| "total_flos": 1.0275244834155397e+18, | |
| "train_loss": 0.8923269732666016, | |
| "train_runtime": 7058.0642, | |
| "train_samples_per_second": 7.084, | |
| "train_steps_per_second": 0.443 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 3125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0275244834155397e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
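
A minimal sketch for reading this Trainer state back in and cross-checking the summary block against the per-step log. The file name `trainer_state.json` is an assumption (it is the conventional name for this structure in Hugging Face Transformers checkpoints, but the path here is illustrative). The derived figures below are recomputed from the fields above: 3125 steps / 7058.0642 s ≈ 0.443 steps/s, and 7.084 samples/s ÷ 0.443 steps/s ≈ 16, which suggests an effective batch size of about 16 even though `train_batch_size` (per device) is 1, consistent with gradient accumulation and/or multiple processes, though neither is recorded in this file.

```python
# Sketch: sanity-check a saved trainer_state.json against its own summary.
# Assumes the JSON above is saved as "trainer_state.json" (hypothetical path).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step entries carry "learning_rate"; the last log_history entry is the
# end-of-training summary (train_loss, train_runtime, throughput fields).
steps = [e for e in state["log_history"] if "learning_rate" in e]
summary = state["log_history"][-1]

# 3125 / 7058.0642 ≈ 0.443, matching "train_steps_per_second".
print("steps/s:", state["global_step"] / summary["train_runtime"])

# samples/s ÷ steps/s ≈ 16: the effective batch size, despite train_batch_size=1.
print("effective batch:",
      summary["train_samples_per_second"] / summary["train_steps_per_second"])

# The mean of the logged losses should land near "train_loss" (≈ 0.892); they
# are not identical, since train_loss averages every step, not just logged ones.
print("mean logged loss:", sum(e["loss"] for e in steps) / len(steps))

# The tail of the schedule decays linearly: the per-step delta between the last
# two logged rates equals the final rate itself (≈ 3.2992e-10), i.e. the
# learning rate would hit zero exactly one step past max_steps.
a, b = steps[-2], steps[-1]
print("lr delta/step:", (a["learning_rate"] - b["learning_rate"]) / (b["step"] - a["step"]))
print("final lr:     ", b["learning_rate"])
```

Run next to the saved state file; the printed values should match `train_steps_per_second`, the samples-per-step ratio, and `train_loss` to within rounding.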