{
  "best_metric": 1.396972417831421,
  "best_model_checkpoint": "./qwen_t/qwen_o5/checkpoint-320",
  "epoch": 0.11695906432748537,
  "eval_steps": 10,
  "global_step": 350,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003341687552213868,
      "grad_norm": 0.7599132657051086,
      "learning_rate": 0.0002,
      "loss": 3.626,
      "mean_token_accuracy": 0.36439715698361397,
      "step": 10
    },
    {
      "epoch": 0.003341687552213868,
      "eval_loss": 3.443260669708252,
      "eval_mean_token_accuracy": 0.4417985293127242,
      "eval_runtime": 41.9693,
      "eval_samples_per_second": 80.058,
      "eval_steps_per_second": 10.007,
      "step": 10
    },
    {
      "epoch": 0.006683375104427736,
      "grad_norm": 0.8953952193260193,
      "learning_rate": 0.0002,
      "loss": 2.9254,
      "mean_token_accuracy": 0.49326967149972917,
      "step": 20
    },
    {
      "epoch": 0.006683375104427736,
      "eval_loss": 2.5516397953033447,
      "eval_mean_token_accuracy": 0.565807048479716,
      "eval_runtime": 40.7965,
      "eval_samples_per_second": 82.36,
      "eval_steps_per_second": 10.295,
      "step": 20
    },
    {
      "epoch": 0.010025062656641603,
      "grad_norm": 0.6876745223999023,
      "learning_rate": 0.0002,
      "loss": 2.231,
      "mean_token_accuracy": 0.6093000993132591,
      "step": 30
    },
    {
      "epoch": 0.010025062656641603,
      "eval_loss": 2.153578042984009,
      "eval_mean_token_accuracy": 0.6297694771062761,
      "eval_runtime": 35.4077,
      "eval_samples_per_second": 94.895,
      "eval_steps_per_second": 11.862,
      "step": 30
    },
    {
      "epoch": 0.013366750208855471,
      "grad_norm": 0.9506922960281372,
      "learning_rate": 0.0002,
      "loss": 1.8838,
      "mean_token_accuracy": 0.6746647953987122,
      "step": 40
    },
    {
      "epoch": 0.013366750208855471,
      "eval_loss": 2.019120454788208,
      "eval_mean_token_accuracy": 0.6349048288805145,
      "eval_runtime": 35.8494,
      "eval_samples_per_second": 93.725,
      "eval_steps_per_second": 11.716,
      "step": 40
    },
    {
      "epoch": 0.01670843776106934,
      "grad_norm": 2.049982786178589,
      "learning_rate": 0.0002,
      "loss": 1.5276,
      "mean_token_accuracy": 0.7170954093337059,
      "step": 50
    },
    {
      "epoch": 0.01670843776106934,
      "eval_loss": 1.813344120979309,
      "eval_mean_token_accuracy": 0.6365837232697578,
      "eval_runtime": 29.3868,
      "eval_samples_per_second": 114.337,
      "eval_steps_per_second": 14.292,
      "step": 50
    },
    {
      "epoch": 0.020050125313283207,
      "grad_norm": 0.7795844674110413,
      "learning_rate": 0.0002,
      "loss": 2.2638,
      "mean_token_accuracy": 0.5529197990894318,
      "step": 60
    },
    {
      "epoch": 0.020050125313283207,
      "eval_loss": 1.6835161447525024,
      "eval_mean_token_accuracy": 0.6496368288993836,
      "eval_runtime": 67.7706,
      "eval_samples_per_second": 49.579,
      "eval_steps_per_second": 6.197,
      "step": 60
    },
    {
      "epoch": 0.023391812865497075,
      "grad_norm": 0.6929437518119812,
      "learning_rate": 0.0002,
      "loss": 1.7755,
      "mean_token_accuracy": 0.6449611410498619,
      "step": 70
    },
    {
      "epoch": 0.023391812865497075,
      "eval_loss": 1.5908516645431519,
      "eval_mean_token_accuracy": 0.6815834226352828,
      "eval_runtime": 63.7232,
      "eval_samples_per_second": 52.728,
      "eval_steps_per_second": 6.591,
      "step": 70
    },
    {
      "epoch": 0.026733500417710943,
      "grad_norm": 0.5863602161407471,
      "learning_rate": 0.0002,
      "loss": 1.4411,
      "mean_token_accuracy": 0.6989624485373497,
      "step": 80
    },
    {
      "epoch": 0.026733500417710943,
      "eval_loss": 1.5338634252548218,
      "eval_mean_token_accuracy": 0.6856980549437659,
      "eval_runtime": 50.2939,
      "eval_samples_per_second": 66.807,
      "eval_steps_per_second": 8.351,
      "step": 80
    },
    {
      "epoch": 0.03007518796992481,
      "grad_norm": 1.1920981407165527,
      "learning_rate": 0.0002,
      "loss": 1.2398,
      "mean_token_accuracy": 0.733481515944004,
      "step": 90
    },
    {
      "epoch": 0.03007518796992481,
      "eval_loss": 1.5052729845046997,
      "eval_mean_token_accuracy": 0.6931079140731267,
      "eval_runtime": 64.5138,
      "eval_samples_per_second": 52.082,
      "eval_steps_per_second": 6.51,
      "step": 90
    },
    {
      "epoch": 0.03341687552213868,
      "grad_norm": 0.6549275517463684,
      "learning_rate": 0.0002,
      "loss": 1.0111,
      "mean_token_accuracy": 0.7853028282523156,
      "step": 100
    },
    {
      "epoch": 0.03341687552213868,
      "eval_loss": 1.518865704536438,
      "eval_mean_token_accuracy": 0.6946799146987143,
      "eval_runtime": 34.4958,
      "eval_samples_per_second": 97.403,
      "eval_steps_per_second": 12.175,
      "step": 100
    },
    {
      "epoch": 0.036758563074352546,
      "grad_norm": 0.5133540630340576,
      "learning_rate": 0.0002,
      "loss": 1.9947,
      "mean_token_accuracy": 0.6003070399165154,
      "step": 110
    },
    {
      "epoch": 0.036758563074352546,
      "eval_loss": 1.4736624956130981,
      "eval_mean_token_accuracy": 0.6929171987232707,
      "eval_runtime": 31.0194,
      "eval_samples_per_second": 108.319,
      "eval_steps_per_second": 13.54,
      "step": 110
    },
    {
      "epoch": 0.040100250626566414,
      "grad_norm": 0.4258256256580353,
      "learning_rate": 0.0002,
      "loss": 1.5854,
      "mean_token_accuracy": 0.6611790612339974,
      "step": 120
    },
    {
      "epoch": 0.040100250626566414,
      "eval_loss": 1.458544373512268,
      "eval_mean_token_accuracy": 0.6966695138386317,
      "eval_runtime": 32.5414,
      "eval_samples_per_second": 103.253,
      "eval_steps_per_second": 12.907,
      "step": 120
    },
    {
      "epoch": 0.04344193817878028,
      "grad_norm": 0.5019882321357727,
      "learning_rate": 0.0002,
      "loss": 1.3932,
      "mean_token_accuracy": 0.717147932946682,
      "step": 130
    },
    {
      "epoch": 0.04344193817878028,
      "eval_loss": 1.4401620626449585,
      "eval_mean_token_accuracy": 0.7001797112680617,
      "eval_runtime": 30.1368,
      "eval_samples_per_second": 111.492,
      "eval_steps_per_second": 13.936,
      "step": 130
    },
    {
      "epoch": 0.04678362573099415,
      "grad_norm": 0.5241239070892334,
      "learning_rate": 0.0002,
      "loss": 1.1731,
      "mean_token_accuracy": 0.7519763350486756,
      "step": 140
    },
    {
      "epoch": 0.04678362573099415,
      "eval_loss": 1.4381680488586426,
      "eval_mean_token_accuracy": 0.6980286110724722,
      "eval_runtime": 29.4312,
      "eval_samples_per_second": 114.164,
      "eval_steps_per_second": 14.271,
      "step": 140
    },
    {
      "epoch": 0.05012531328320802,
      "grad_norm": 0.5657021999359131,
      "learning_rate": 0.0002,
      "loss": 0.9886,
      "mean_token_accuracy": 0.804797975718975,
      "step": 150
    },
    {
      "epoch": 0.05012531328320802,
      "eval_loss": 1.4500703811645508,
      "eval_mean_token_accuracy": 0.7004464421243894,
      "eval_runtime": 29.379,
      "eval_samples_per_second": 114.368,
      "eval_steps_per_second": 14.296,
      "step": 150
    },
    {
      "epoch": 0.053467000835421885,
      "grad_norm": 0.48124462366104126,
      "learning_rate": 0.0002,
      "loss": 1.8415,
      "mean_token_accuracy": 0.6195126965641975,
      "step": 160
    },
    {
      "epoch": 0.053467000835421885,
      "eval_loss": 1.4379223585128784,
      "eval_mean_token_accuracy": 0.6959139862940424,
      "eval_runtime": 29.496,
      "eval_samples_per_second": 113.914,
      "eval_steps_per_second": 14.239,
      "step": 160
    },
    {
      "epoch": 0.05680868838763575,
      "grad_norm": 0.4167322516441345,
      "learning_rate": 0.0002,
      "loss": 1.5117,
      "mean_token_accuracy": 0.6729505002498627,
      "step": 170
    },
    {
      "epoch": 0.05680868838763575,
      "eval_loss": 1.4370402097702026,
      "eval_mean_token_accuracy": 0.6991755010116668,
      "eval_runtime": 30.4827,
      "eval_samples_per_second": 110.227,
      "eval_steps_per_second": 13.778,
      "step": 170
    },
    {
      "epoch": 0.06015037593984962,
      "grad_norm": 0.44749510288238525,
      "learning_rate": 0.0002,
      "loss": 1.2954,
      "mean_token_accuracy": 0.7225258648395538,
      "step": 180
    },
    {
      "epoch": 0.06015037593984962,
      "eval_loss": 1.423570156097412,
      "eval_mean_token_accuracy": 0.7020650133490562,
      "eval_runtime": 30.6017,
      "eval_samples_per_second": 109.798,
      "eval_steps_per_second": 13.725,
      "step": 180
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 0.3989886939525604,
      "learning_rate": 0.0002,
      "loss": 1.213,
      "mean_token_accuracy": 0.7481454327702522,
      "step": 190
    },
    {
      "epoch": 0.06349206349206349,
      "eval_loss": 1.4212963581085205,
      "eval_mean_token_accuracy": 0.7001493394374847,
      "eval_runtime": 41.8797,
      "eval_samples_per_second": 80.23,
      "eval_steps_per_second": 10.029,
      "step": 190
    },
    {
      "epoch": 0.06683375104427736,
      "grad_norm": 0.5422595739364624,
      "learning_rate": 0.0002,
      "loss": 0.942,
      "mean_token_accuracy": 0.7962346121668815,
      "step": 200
    },
    {
      "epoch": 0.06683375104427736,
      "eval_loss": 1.421792984008789,
      "eval_mean_token_accuracy": 0.7010365227858225,
      "eval_runtime": 53.936,
      "eval_samples_per_second": 62.296,
      "eval_steps_per_second": 7.787,
      "step": 200
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 0.39737701416015625,
      "learning_rate": 0.0002,
      "loss": 1.9107,
      "mean_token_accuracy": 0.5946269743144512,
      "step": 210
    },
    {
      "epoch": 0.07017543859649122,
      "eval_loss": 1.4200433492660522,
      "eval_mean_token_accuracy": 0.6980209651447478,
      "eval_runtime": 48.7912,
      "eval_samples_per_second": 68.865,
      "eval_steps_per_second": 8.608,
      "step": 210
    },
    {
      "epoch": 0.07351712614870509,
      "grad_norm": 0.3731982707977295,
      "learning_rate": 0.0002,
      "loss": 1.4745,
      "mean_token_accuracy": 0.6861546367406846,
      "step": 220
    },
    {
      "epoch": 0.07351712614870509,
      "eval_loss": 1.426990032196045,
      "eval_mean_token_accuracy": 0.6979587059645426,
      "eval_runtime": 45.8174,
      "eval_samples_per_second": 73.335,
      "eval_steps_per_second": 9.167,
      "step": 220
    },
    {
      "epoch": 0.07685881370091896,
      "grad_norm": 0.5165483951568604,
      "learning_rate": 0.0002,
      "loss": 1.3166,
      "mean_token_accuracy": 0.717747439444065,
      "step": 230
    },
    {
      "epoch": 0.07685881370091896,
      "eval_loss": 1.4164931774139404,
      "eval_mean_token_accuracy": 0.7008462209077109,
      "eval_runtime": 35.0477,
      "eval_samples_per_second": 95.869,
      "eval_steps_per_second": 11.984,
      "step": 230
    },
    {
      "epoch": 0.08020050125313283,
      "grad_norm": 0.3445465862751007,
      "learning_rate": 0.0002,
      "loss": 1.138,
      "mean_token_accuracy": 0.7411063179373741,
      "step": 240
    },
    {
      "epoch": 0.08020050125313283,
      "eval_loss": 1.4141920804977417,
      "eval_mean_token_accuracy": 0.7027672590953963,
      "eval_runtime": 34.6315,
      "eval_samples_per_second": 97.022,
      "eval_steps_per_second": 12.128,
      "step": 240
    },
    {
      "epoch": 0.0835421888053467,
      "grad_norm": 0.9735682606697083,
      "learning_rate": 0.0002,
      "loss": 0.8767,
      "mean_token_accuracy": 0.79200878739357,
      "step": 250
    },
    {
      "epoch": 0.0835421888053467,
      "eval_loss": 1.421015977859497,
      "eval_mean_token_accuracy": 0.6934712292892592,
      "eval_runtime": 34.4466,
      "eval_samples_per_second": 97.542,
      "eval_steps_per_second": 12.193,
      "step": 250
    },
    {
      "epoch": 0.08688387635756056,
      "grad_norm": 0.4343126118183136,
      "learning_rate": 0.0002,
      "loss": 1.9246,
      "mean_token_accuracy": 0.5945770829916001,
      "step": 260
    },
    {
      "epoch": 0.08688387635756056,
      "eval_loss": 1.4099905490875244,
      "eval_mean_token_accuracy": 0.7017570126624334,
      "eval_runtime": 29.6092,
      "eval_samples_per_second": 113.478,
      "eval_steps_per_second": 14.185,
      "step": 260
    },
    {
      "epoch": 0.09022556390977443,
      "grad_norm": 0.3334052562713623,
      "learning_rate": 0.0002,
      "loss": 1.4759,
      "mean_token_accuracy": 0.6715509802103042,
      "step": 270
    },
    {
      "epoch": 0.09022556390977443,
      "eval_loss": 1.4085925817489624,
      "eval_mean_token_accuracy": 0.7048604423091525,
      "eval_runtime": 38.8094,
      "eval_samples_per_second": 86.577,
      "eval_steps_per_second": 10.822,
      "step": 270
    },
    {
      "epoch": 0.0935672514619883,
      "grad_norm": 0.5291116237640381,
      "learning_rate": 0.0002,
      "loss": 1.3162,
      "mean_token_accuracy": 0.7213364154100418,
      "step": 280
    },
    {
      "epoch": 0.0935672514619883,
      "eval_loss": 1.4125802516937256,
      "eval_mean_token_accuracy": 0.7007201626896858,
      "eval_runtime": 41.0882,
      "eval_samples_per_second": 81.775,
      "eval_steps_per_second": 10.222,
      "step": 280
    },
    {
      "epoch": 0.09690893901420217,
      "grad_norm": 0.3959917724132538,
      "learning_rate": 0.0002,
      "loss": 1.192,
      "mean_token_accuracy": 0.7394131779670715,
      "step": 290
    },
    {
      "epoch": 0.09690893901420217,
      "eval_loss": 1.4038469791412354,
      "eval_mean_token_accuracy": 0.7027802584426743,
      "eval_runtime": 31.4074,
      "eval_samples_per_second": 106.981,
      "eval_steps_per_second": 13.373,
      "step": 290
    },
    {
      "epoch": 0.10025062656641603,
      "grad_norm": 0.6445237398147583,
      "learning_rate": 0.0002,
      "loss": 0.8861,
      "mean_token_accuracy": 0.7985727787017822,
      "step": 300
    },
    {
      "epoch": 0.10025062656641603,
      "eval_loss": 1.4055042266845703,
      "eval_mean_token_accuracy": 0.7020482325837726,
      "eval_runtime": 36.2285,
      "eval_samples_per_second": 92.745,
      "eval_steps_per_second": 11.593,
      "step": 300
    },
    {
      "epoch": 0.1035923141186299,
      "grad_norm": 0.3228004276752472,
      "learning_rate": 0.0002,
      "loss": 1.9139,
      "mean_token_accuracy": 0.6049163021147251,
      "step": 310
    },
    {
      "epoch": 0.1035923141186299,
      "eval_loss": 1.401644229888916,
      "eval_mean_token_accuracy": 0.70244310824644,
      "eval_runtime": 29.3696,
      "eval_samples_per_second": 114.404,
      "eval_steps_per_second": 14.301,
      "step": 310
    },
    {
      "epoch": 0.10693400167084377,
      "grad_norm": 0.35528433322906494,
      "learning_rate": 0.0002,
      "loss": 1.4937,
      "mean_token_accuracy": 0.6764601737260818,
      "step": 320
    },
    {
      "epoch": 0.10693400167084377,
      "eval_loss": 1.396972417831421,
      "eval_mean_token_accuracy": 0.699098062302385,
      "eval_runtime": 40.4694,
      "eval_samples_per_second": 83.026,
      "eval_steps_per_second": 10.378,
      "step": 320
    },
    {
      "epoch": 0.11027568922305764,
      "grad_norm": 0.4269411563873291,
      "learning_rate": 0.0002,
      "loss": 1.2599,
      "mean_token_accuracy": 0.7228553861379623,
      "step": 330
    },
    {
      "epoch": 0.11027568922305764,
      "eval_loss": 1.398388385772705,
      "eval_mean_token_accuracy": 0.7035389555352075,
      "eval_runtime": 33.2865,
      "eval_samples_per_second": 100.942,
      "eval_steps_per_second": 12.618,
      "step": 330
    },
    {
      "epoch": 0.1136173767752715,
      "grad_norm": 0.372363805770874,
      "learning_rate": 0.0002,
      "loss": 1.1301,
      "mean_token_accuracy": 0.7524892643094063,
      "step": 340
    },
    {
      "epoch": 0.1136173767752715,
      "eval_loss": 1.398653507232666,
      "eval_mean_token_accuracy": 0.6984730128731046,
      "eval_runtime": 44.4473,
      "eval_samples_per_second": 75.595,
      "eval_steps_per_second": 9.449,
      "step": 340
    },
    {
      "epoch": 0.11695906432748537,
      "grad_norm": 0.4013306796550751,
      "learning_rate": 0.0002,
      "loss": 0.939,
      "mean_token_accuracy": 0.7970763191580772,
      "step": 350
    },
    {
      "epoch": 0.11695906432748537,
      "eval_loss": 1.3984951972961426,
      "eval_mean_token_accuracy": 0.7021824714683351,
      "eval_runtime": 35.5812,
      "eval_samples_per_second": 94.432,
      "eval_steps_per_second": 11.804,
      "step": 350
    }
  ],
  "logging_steps": 10,
  "max_steps": 14960,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 10,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 3
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 934541258360832.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}