diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.02280718426304286, + "eval_steps": 500, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 295.25, + "epoch": 1.9005986885869048e-05, + "grad_norm": 2.673111623301322, + "kl": 0.0, + "learning_rate": 9.999999991087068e-07, + "loss": -0.0, + "reward": 1.7687500715255737, + "reward_std": 0.20764468610286713, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 1, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 367.3500061035156, + "epoch": 3.8011973771738095e-05, + "grad_norm": 2.082116157134833, + "kl": 0.000606536865234375, + "learning_rate": 9.99999996434827e-07, + "loss": 0.0, + "reward": 1.5409200191497803, + "reward_std": 0.2872665822505951, + "rewards/accuracy_reward": 0.46467000246047974, + "rewards/format_reward": 1.0, + "step": 2, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 279.8999938964844, + "epoch": 5.7017960657607147e-05, + "grad_norm": 4.104595525624102, + "kl": 0.000659942626953125, + "learning_rate": 9.99999991978361e-07, + "loss": 0.0, + "reward": 1.492989420890808, + "reward_std": 0.32215064764022827, + "rewards/accuracy_reward": 0.4917394816875458, + "rewards/format_reward": 0.9750000238418579, + "step": 3, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 299.6499938964844, + "epoch": 7.602394754347619e-05, + "grad_norm": 4.066751398647228, + "kl": 0.000591278076171875, + "learning_rate": 9.999999857393084e-07, + "loss": 0.0, + "reward": 1.9667459726333618, + "reward_std": 0.2620104253292084, + "rewards/accuracy_reward": 0.7704960703849792, + "rewards/format_reward": 1.0, + "step": 4, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 337.8500061035156, + "epoch": 9.502993442934525e-05, + "grad_norm": 1.8447963431136554, + "kl": 0.0005340576171875, + "learning_rate": 9.999999777176696e-07, + "loss": 0.0, + "reward": 1.6162500381469727, + "reward_std": 0.2701939046382904, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 0.824999988079071, + "step": 5, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 214.75, + "epoch": 0.00011403592131521429, + "grad_norm": 4.865076982742215, + "kl": 0.000720977783203125, + "learning_rate": 9.999999679134443e-07, + "loss": 0.0, + "reward": 1.943750023841858, + "reward_std": 0.2636513411998749, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 6, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 251.22500610351562, + "epoch": 0.00013304190820108335, + "grad_norm": 2.734172384623042, + "kl": 0.00066375732421875, + "learning_rate": 9.999999563266326e-07, + "loss": 0.0, + "reward": 1.7112499475479126, + "reward_std": 0.42926207184791565, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 7, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 285.2749938964844, + "epoch": 0.00015204789508695238, + "grad_norm": 2.5397443822078283, + "kl": 0.000606536865234375, + "learning_rate": 9.999999429572349e-07, + "loss": 0.0, + "reward": 1.7121597528457642, + "reward_std": 0.24398386478424072, + "rewards/accuracy_reward": 0.6559095978736877, + "rewards/format_reward": 0.925000011920929, + "step": 8, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 322.875, + "epoch": 0.00017105388197282144, + "grad_norm": 2.8129541643991116, + "kl": 0.000865936279296875, + "learning_rate": 9.999999278052507e-07, + "loss": 0.0, + "reward": 1.6165634393692017, + "reward_std": 0.45454278588294983, + "rewards/accuracy_reward": 0.5178134441375732, + "rewards/format_reward": 0.9750000238418579, + "step": 9, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 221.0, + "epoch": 0.0001900598688586905, + "grad_norm": 1.9687283034904635, + "kl": 0.00093841552734375, + "learning_rate": 9.999999108706803e-07, + "loss": 0.0, + "reward": 1.777500033378601, + "reward_std": 0.3578013777732849, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 10, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 194.4250030517578, + "epoch": 0.00020906585574455953, + "grad_norm": 2.3991361721082303, + "kl": 0.00106048583984375, + "learning_rate": 9.999998921535239e-07, + "loss": 0.0, + "reward": 1.9177086353302002, + "reward_std": 0.1179908737540245, + "rewards/accuracy_reward": 0.6414585709571838, + "rewards/format_reward": 1.0, + "step": 11, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 314.5, + "epoch": 0.00022807184263042859, + "grad_norm": 1.886022960586048, + "kl": 0.00086212158203125, + "learning_rate": 9.999998716537811e-07, + "loss": 0.0, + "reward": 1.2020833492279053, + "reward_std": 0.1961066573858261, + "rewards/accuracy_reward": 0.3283333480358124, + "rewards/format_reward": 0.800000011920929, + "step": 12, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 383.6750183105469, + "epoch": 0.0002470778295162976, + "grad_norm": 2.709438114169319, + "kl": 0.0014495849609375, + "learning_rate": 9.999998493714527e-07, + "loss": 0.0001, + "reward": 1.718187689781189, + "reward_std": 0.4320615828037262, + "rewards/accuracy_reward": 0.6806877255439758, + "rewards/format_reward": 0.925000011920929, + "step": 13, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 304.45001220703125, + "epoch": 0.0002660838164021667, + "grad_norm": 3.7270686122717542, + "kl": 0.001220703125, + "learning_rate": 9.999998253065383e-07, + "loss": 0.0001, + "reward": 1.809033989906311, + "reward_std": 0.04920737445354462, + "rewards/accuracy_reward": 0.6952840685844421, + "rewards/format_reward": 1.0, + "step": 14, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 333.375, + "epoch": 0.00028508980328803573, + "grad_norm": 4.998729494746825, + "kl": 0.0012664794921875, + "learning_rate": 9.99999799459038e-07, + "loss": 0.0, + "reward": 1.8887499570846558, + "reward_std": 0.2809317409992218, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 0.925000011920929, + "step": 15, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 286.1750183105469, + "epoch": 0.00030409579017390476, + "grad_norm": 2.0233101130622044, + "kl": 0.00133514404296875, + "learning_rate": 9.99999771828952e-07, + "loss": 0.0001, + "reward": 1.5862499475479126, + "reward_std": 0.13032536208629608, + "rewards/accuracy_reward": 0.45000001788139343, + "rewards/format_reward": 1.0, + "step": 16, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 250.8000030517578, + "epoch": 0.00032310177705977385, + "grad_norm": 2.3714583420433533, + "kl": 0.0020294189453125, + "learning_rate": 9.999997424162806e-07, + "loss": 0.0001, + "reward": 1.9924728870391846, + "reward_std": 0.11796430498361588, + "rewards/accuracy_reward": 0.776222825050354, + "rewards/format_reward": 1.0, + "step": 17, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 303.3999938964844, + "epoch": 0.0003421077639456429, + "grad_norm": 1.9635571237272242, + "kl": 0.00160980224609375, + "learning_rate": 9.999997112210234e-07, + "loss": 0.0001, + "reward": 1.7024999856948853, + "reward_std": 0.1976005882024765, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 18, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 292.5249938964844, + "epoch": 0.0003611137508315119, + "grad_norm": 2.2403687588157464, + "kl": 0.0022430419921875, + "learning_rate": 9.999996782431807e-07, + "loss": 0.0001, + "reward": 1.723750114440918, + "reward_std": 0.1547650843858719, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 19, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 236.10000610351562, + "epoch": 0.000380119737717381, + "grad_norm": 2.817841479009199, + "kl": 0.0023193359375, + "learning_rate": 9.99999643482753e-07, + "loss": 0.0001, + "reward": 2.0712499618530273, + "reward_std": 0.2146197408437729, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 20, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 356.32501220703125, + "epoch": 0.00039912572460325, + "grad_norm": 2.36363757335534, + "kl": 0.00174713134765625, + "learning_rate": 9.999996069397399e-07, + "loss": 0.0001, + "reward": 1.4609562158584595, + "reward_std": 0.47292444109916687, + "rewards/accuracy_reward": 0.4997062683105469, + "rewards/format_reward": 0.949999988079071, + "step": 21, + "temporal_rewards": 0.29999998211860657 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 291.0500183105469, + "epoch": 0.00041813171148911906, + "grad_norm": 2.368744313375501, + "kl": 0.0030364990234375, + "learning_rate": 9.999995686141417e-07, + "loss": 0.0001, + "reward": 2.262500047683716, + "reward_std": 0.12062937021255493, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 22, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 377.82501220703125, + "epoch": 0.00043713769837498814, + "grad_norm": 1.9474999500583028, + "kl": 0.0031585693359375, + "learning_rate": 9.999995285059588e-07, + "loss": 0.0001, + "reward": 1.4155882596969604, + "reward_std": 0.29959985613822937, + "rewards/accuracy_reward": 0.4380883276462555, + "rewards/format_reward": 0.949999988079071, + "step": 23, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 297.3999938964844, + "epoch": 0.00045614368526085717, + "grad_norm": 2.610511274752285, + "kl": 0.0042724609375, + "learning_rate": 9.999994866151911e-07, + "loss": 0.0002, + "reward": 1.799687385559082, + "reward_std": 0.13027535378932953, + "rewards/accuracy_reward": 0.629687488079071, + "rewards/format_reward": 0.9750000238418579, + "step": 24, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 281.07501220703125, + "epoch": 0.0004751496721467262, + "grad_norm": 2.114629919066998, + "kl": 0.0040283203125, + "learning_rate": 9.999994429418386e-07, + "loss": 0.0002, + "reward": 1.6384057998657227, + "reward_std": 0.18284937739372253, + "rewards/accuracy_reward": 0.540905773639679, + "rewards/format_reward": 1.0, + "step": 25, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 324.2749938964844, + "epoch": 0.0004941556590325952, + "grad_norm": 2.755769295658409, + "kl": 0.003692626953125, + "learning_rate": 9.99999397485902e-07, + "loss": 0.0001, + "reward": 1.8387451171875, + "reward_std": 0.18429307639598846, + "rewards/accuracy_reward": 0.7824951410293579, + "rewards/format_reward": 0.9750000238418579, + "step": 26, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 234.65000915527344, + "epoch": 0.0005131616459184643, + "grad_norm": 4.940365643192497, + "kl": 0.00787353515625, + "learning_rate": 9.999993502473808e-07, + "loss": 0.0003, + "reward": 1.9099998474121094, + "reward_std": 0.25944098830223083, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 27, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 251.65000915527344, + "epoch": 0.0005321676328043334, + "grad_norm": 2.3419283166589544, + "kl": 0.007720947265625, + "learning_rate": 9.999993012262756e-07, + "loss": 0.0003, + "reward": 1.896433711051941, + "reward_std": 0.2827633321285248, + "rewards/accuracy_reward": 0.6639335751533508, + "rewards/format_reward": 1.0, + "step": 28, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 316.20001220703125, + "epoch": 0.0005511736196902024, + "grad_norm": 1.86592975728736, + "kl": 0.006134033203125, + "learning_rate": 9.999992504225862e-07, + "loss": 0.0002, + "reward": 1.3874719142913818, + "reward_std": 0.2548384666442871, + "rewards/accuracy_reward": 0.32372185587882996, + "rewards/format_reward": 1.0, + "step": 29, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 271.3500061035156, + "epoch": 0.0005701796065760715, + "grad_norm": 3.968800523270615, + "kl": 0.0074462890625, + "learning_rate": 9.999991978363134e-07, + "loss": 0.0003, + "reward": 1.781022310256958, + "reward_std": 0.21934275329113007, + "rewards/accuracy_reward": 0.6797724962234497, + "rewards/format_reward": 1.0, + "step": 30, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 391.3999938964844, + "epoch": 0.0005891855934619405, + "grad_norm": 1.7032289212136063, + "kl": 0.00665283203125, + "learning_rate": 9.999991434674566e-07, + "loss": 0.0003, + "reward": 1.7510108947753906, + "reward_std": 0.2869863510131836, + "rewards/accuracy_reward": 0.7847608923912048, + "rewards/format_reward": 0.949999988079071, + "step": 31, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 424.1499938964844, + "epoch": 0.0006081915803478095, + "grad_norm": 2.358577127026973, + "kl": 0.00469970703125, + "learning_rate": 9.999990873160167e-07, + "loss": 0.0002, + "reward": 1.7725000381469727, + "reward_std": 0.45565930008888245, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 0.9000000357627869, + "step": 32, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 352.7749938964844, + "epoch": 0.0006271975672336786, + "grad_norm": 2.056994484616452, + "kl": 0.006317138671875, + "learning_rate": 9.999990293819936e-07, + "loss": 0.0003, + "reward": 1.6348778009414673, + "reward_std": 0.4777977466583252, + "rewards/accuracy_reward": 0.6273777484893799, + "rewards/format_reward": 0.925000011920929, + "step": 33, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 393.7749938964844, + "epoch": 0.0006462035541195477, + "grad_norm": 2.221129428967049, + "kl": 0.007415771484375, + "learning_rate": 9.999989696653875e-07, + "loss": 0.0003, + "reward": 1.435416579246521, + "reward_std": 0.33462318778038025, + "rewards/accuracy_reward": 0.416666716337204, + "rewards/format_reward": 0.9750000238418579, + "step": 34, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 299.4250183105469, + "epoch": 0.0006652095410054167, + "grad_norm": 2.2926228153550388, + "kl": 0.010986328125, + "learning_rate": 9.999989081661987e-07, + "loss": 0.0004, + "reward": 1.2972177267074585, + "reward_std": 0.3400118052959442, + "rewards/accuracy_reward": 0.1997176855802536, + "rewards/format_reward": 1.0, + "step": 35, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 247.9250030517578, + "epoch": 0.0006842155278912858, + "grad_norm": 2.5878857876544337, + "kl": 0.0179443359375, + "learning_rate": 9.999988448844271e-07, + "loss": 0.0007, + "reward": 2.221250057220459, + "reward_std": 0.17021353542804718, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 36, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 283.9250183105469, + "epoch": 0.0007032215147771548, + "grad_norm": 2.2395132656722803, + "kl": 0.0133056640625, + "learning_rate": 9.999987798200732e-07, + "loss": 0.0005, + "reward": 2.171196699142456, + "reward_std": 0.0765160396695137, + "rewards/accuracy_reward": 0.8761968612670898, + "rewards/format_reward": 1.0, + "step": 37, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 381.3500061035156, + "epoch": 0.0007222275016630238, + "grad_norm": 2.1015139286583184, + "kl": 0.01080322265625, + "learning_rate": 9.999987129731372e-07, + "loss": 0.0004, + "reward": 1.7929672002792358, + "reward_std": 0.17871348559856415, + "rewards/accuracy_reward": 0.7979673743247986, + "rewards/format_reward": 0.875, + "step": 38, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 346.8999938964844, + "epoch": 0.0007412334885488929, + "grad_norm": 2.133894867879289, + "kl": 0.0126953125, + "learning_rate": 9.999986443436198e-07, + "loss": 0.0005, + "reward": 1.6820032596588135, + "reward_std": 0.21932058036327362, + "rewards/accuracy_reward": 0.5907532572746277, + "rewards/format_reward": 0.949999988079071, + "step": 39, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 393.2749938964844, + "epoch": 0.000760239475434762, + "grad_norm": 5.01450145069222, + "kl": 0.0133056640625, + "learning_rate": 9.999985739315202e-07, + "loss": 0.0005, + "reward": 1.6507374048233032, + "reward_std": 0.3641747832298279, + "rewards/accuracy_reward": 0.6694874167442322, + "rewards/format_reward": 0.875, + "step": 40, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 343.82501220703125, + "epoch": 0.000779245462320631, + "grad_norm": 2.2408385652997183, + "kl": 0.01483154296875, + "learning_rate": 9.999985017368396e-07, + "loss": 0.0006, + "reward": 1.7862499952316284, + "reward_std": 0.3176189959049225, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 41, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 272.75, + "epoch": 0.0007982514492065, + "grad_norm": 2.690457314397398, + "kl": 0.018310546875, + "learning_rate": 9.999984277595777e-07, + "loss": 0.0007, + "reward": 1.8350000381469727, + "reward_std": 0.37591132521629333, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 42, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 296.1499938964844, + "epoch": 0.0008172574360923691, + "grad_norm": 2.4773046353194883, + "kl": 0.0228271484375, + "learning_rate": 9.99998351999735e-07, + "loss": 0.0009, + "reward": 1.8862498998641968, + "reward_std": 0.22204652428627014, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 43, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 374.2749938964844, + "epoch": 0.0008362634229782381, + "grad_norm": 2.023591117900132, + "kl": 0.0155029296875, + "learning_rate": 9.999982744573119e-07, + "loss": 0.0006, + "reward": 1.4919394254684448, + "reward_std": 0.2940160036087036, + "rewards/accuracy_reward": 0.590689480304718, + "rewards/format_reward": 0.9000000357627869, + "step": 44, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.6, + "completion_length": 354.8999938964844, + "epoch": 0.0008552694098641071, + "grad_norm": 1.4699723957467639, + "kl": 0.0181884765625, + "learning_rate": 9.999981951323081e-07, + "loss": 0.0007, + "reward": 1.3650000095367432, + "reward_std": 0.24048756062984467, + "rewards/accuracy_reward": 0.3499999940395355, + "rewards/format_reward": 0.925000011920929, + "step": 45, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 368.5, + "epoch": 0.0008742753967499763, + "grad_norm": 2.056801555870482, + "kl": 0.01324462890625, + "learning_rate": 9.999981140247246e-07, + "loss": 0.0005, + "reward": 1.7650002241134644, + "reward_std": 0.35767483711242676, + "rewards/accuracy_reward": 0.6462500095367432, + "rewards/format_reward": 1.0, + "step": 46, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 288.4750061035156, + "epoch": 0.0008932813836358453, + "grad_norm": 2.983353682348288, + "kl": 0.0205078125, + "learning_rate": 9.999980311345615e-07, + "loss": 0.0008, + "reward": 1.6775000095367432, + "reward_std": 0.5823832750320435, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 1.0, + "step": 47, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 428.1499938964844, + "epoch": 0.0009122873705217143, + "grad_norm": 1.6983275803418565, + "kl": 0.01104736328125, + "learning_rate": 9.999979464618186e-07, + "loss": 0.0004, + "reward": 1.7168115377426147, + "reward_std": 0.23992919921875, + "rewards/accuracy_reward": 0.6680614948272705, + "rewards/format_reward": 0.9750000238418579, + "step": 48, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 357.125, + "epoch": 0.0009312933574075834, + "grad_norm": 1.5213134378521853, + "kl": 0.01953125, + "learning_rate": 9.99997860006497e-07, + "loss": 0.0008, + "reward": 1.7687500715255737, + "reward_std": 0.2421073466539383, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 49, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 328.95001220703125, + "epoch": 0.0009502993442934524, + "grad_norm": 2.390613929329034, + "kl": 0.020751953125, + "learning_rate": 9.999977717685962e-07, + "loss": 0.0008, + "reward": 1.6322113275527954, + "reward_std": 0.42426714301109314, + "rewards/accuracy_reward": 0.5122115015983582, + "rewards/format_reward": 1.0, + "step": 50, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 385.6000061035156, + "epoch": 0.0009693053311793214, + "grad_norm": 1.5275343267282913, + "kl": 0.0181884765625, + "learning_rate": 9.99997681748117e-07, + "loss": 0.0007, + "reward": 1.912500023841858, + "reward_std": 0.4116156995296478, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 51, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 332.1750183105469, + "epoch": 0.0009883113180651905, + "grad_norm": 1.4369320573249387, + "kl": 0.0096435546875, + "learning_rate": 9.999975899450599e-07, + "loss": 0.0004, + "reward": 1.9117647409439087, + "reward_std": 0.206694558262825, + "rewards/accuracy_reward": 0.8617647290229797, + "rewards/format_reward": 1.0, + "step": 52, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 328.5249938964844, + "epoch": 0.0010073173049510596, + "grad_norm": 1.8136528430577885, + "kl": 0.0179443359375, + "learning_rate": 9.999974963594246e-07, + "loss": 0.0007, + "reward": 1.6705681085586548, + "reward_std": 0.23624132573604584, + "rewards/accuracy_reward": 0.6318181753158569, + "rewards/format_reward": 1.0, + "step": 53, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 420.6750183105469, + "epoch": 0.0010263232918369285, + "grad_norm": 4.1499151215157, + "kl": 0.016845703125, + "learning_rate": 9.99997400991212e-07, + "loss": 0.0007, + "reward": 1.2430310249328613, + "reward_std": 0.479884535074234, + "rewards/accuracy_reward": 0.2567810118198395, + "rewards/format_reward": 0.9750000238418579, + "step": 54, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 412.9250183105469, + "epoch": 0.0010453292787227977, + "grad_norm": 1.773815811249763, + "kl": 0.01495361328125, + "learning_rate": 9.999973038404219e-07, + "loss": 0.0006, + "reward": 1.3813902139663696, + "reward_std": 0.24255304038524628, + "rewards/accuracy_reward": 0.5513902306556702, + "rewards/format_reward": 0.800000011920929, + "step": 55, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 389.6000061035156, + "epoch": 0.0010643352656086668, + "grad_norm": 2.5237128428586373, + "kl": 0.0191650390625, + "learning_rate": 9.999972049070554e-07, + "loss": 0.0008, + "reward": 1.743749976158142, + "reward_std": 0.32777532935142517, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.925000011920929, + "step": 56, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 314.07501220703125, + "epoch": 0.0010833412524945357, + "grad_norm": 2.826625934788473, + "kl": 0.0262451171875, + "learning_rate": 9.99997104191112e-07, + "loss": 0.001, + "reward": 2.0825002193450928, + "reward_std": 0.2573499381542206, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 57, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 422.70001220703125, + "epoch": 0.0011023472393804049, + "grad_norm": 2.21783790000557, + "kl": 0.0223388671875, + "learning_rate": 9.999970016925928e-07, + "loss": 0.0009, + "reward": 1.3523801565170288, + "reward_std": 0.24788342416286469, + "rewards/accuracy_reward": 0.46488019824028015, + "rewards/format_reward": 0.800000011920929, + "step": 58, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 387.07501220703125, + "epoch": 0.0011213532262662738, + "grad_norm": 1.711331367857792, + "kl": 0.018798828125, + "learning_rate": 9.999968974114975e-07, + "loss": 0.0008, + "reward": 1.3595693111419678, + "reward_std": 0.18932317197322845, + "rewards/accuracy_reward": 0.35081931948661804, + "rewards/format_reward": 1.0, + "step": 59, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 376.82501220703125, + "epoch": 0.001140359213152143, + "grad_norm": 1.4756213822670716, + "kl": 0.01470947265625, + "learning_rate": 9.999967913478272e-07, + "loss": 0.0006, + "reward": 1.5280908346176147, + "reward_std": 0.20485416054725647, + "rewards/accuracy_reward": 0.5118408203125, + "rewards/format_reward": 1.0, + "step": 60, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 374.8000183105469, + "epoch": 0.001159365200038012, + "grad_norm": 1.903452860609033, + "kl": 0.02685546875, + "learning_rate": 9.999966835015817e-07, + "loss": 0.0011, + "reward": 1.7850000858306885, + "reward_std": 0.024494878947734833, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 0.800000011920929, + "step": 61, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 380.5500183105469, + "epoch": 0.001178371186923881, + "grad_norm": 1.6696030822325638, + "kl": 0.0203857421875, + "learning_rate": 9.999965738727617e-07, + "loss": 0.0008, + "reward": 1.730209231376648, + "reward_std": 0.16135059297084808, + "rewards/accuracy_reward": 0.5977091193199158, + "rewards/format_reward": 1.0, + "step": 62, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 380.07501220703125, + "epoch": 0.0011973771738097501, + "grad_norm": 1.8256827508939328, + "kl": 0.01495361328125, + "learning_rate": 9.999964624613672e-07, + "loss": 0.0006, + "reward": 1.389145016670227, + "reward_std": 0.1907978653907776, + "rewards/accuracy_reward": 0.42789506912231445, + "rewards/format_reward": 0.949999988079071, + "step": 63, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 364.25, + "epoch": 0.001216383160695619, + "grad_norm": 3.37428251166175, + "kl": 0.0244140625, + "learning_rate": 9.999963492673991e-07, + "loss": 0.001, + "reward": 1.7062500715255737, + "reward_std": 0.3639249801635742, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 1.0, + "step": 64, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 343.6000061035156, + "epoch": 0.0012353891475814882, + "grad_norm": 27.10310867256576, + "kl": 0.016357421875, + "learning_rate": 9.999962342908576e-07, + "loss": 0.0007, + "reward": 1.545454978942871, + "reward_std": 0.2258632928133011, + "rewards/accuracy_reward": 0.5217050313949585, + "rewards/format_reward": 1.0, + "step": 65, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 426.3000183105469, + "epoch": 0.0012543951344673571, + "grad_norm": 1.834166770400696, + "kl": 0.018310546875, + "learning_rate": 9.999961175317429e-07, + "loss": 0.0007, + "reward": 1.2920359373092651, + "reward_std": 0.3662143349647522, + "rewards/accuracy_reward": 0.36703595519065857, + "rewards/format_reward": 0.949999988079071, + "step": 66, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 400.8500061035156, + "epoch": 0.0012734011213532263, + "grad_norm": 1.7231419925264362, + "kl": 0.0250244140625, + "learning_rate": 9.999959989900558e-07, + "loss": 0.001, + "reward": 1.4859206676483154, + "reward_std": 0.5184847712516785, + "rewards/accuracy_reward": 0.3896706998348236, + "rewards/format_reward": 0.9750000238418579, + "step": 67, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 324.125, + "epoch": 0.0012924071082390954, + "grad_norm": 2.0570183466608007, + "kl": 0.02734375, + "learning_rate": 9.999958786657963e-07, + "loss": 0.0011, + "reward": 2.1500000953674316, + "reward_std": 0.09946426004171371, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 68, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 332.6750183105469, + "epoch": 0.0013114130951249643, + "grad_norm": 1.7805400018342927, + "kl": 0.0286865234375, + "learning_rate": 9.999957565589651e-07, + "loss": 0.0011, + "reward": 2.137500047683716, + "reward_std": 0.1191316768527031, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 69, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 345.8500061035156, + "epoch": 0.0013304190820108335, + "grad_norm": 1.7616052253473493, + "kl": 0.0262451171875, + "learning_rate": 9.999956326695626e-07, + "loss": 0.0011, + "reward": 1.6295082569122314, + "reward_std": 0.061908524483442307, + "rewards/accuracy_reward": 0.5357584357261658, + "rewards/format_reward": 1.0, + "step": 70, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 433.6499938964844, + "epoch": 0.0013494250688967024, + "grad_norm": 6.06299305105756, + "kl": 0.02197265625, + "learning_rate": 9.999955069975894e-07, + "loss": 0.0009, + "reward": 1.840000033378601, + "reward_std": 0.4016962945461273, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 0.949999988079071, + "step": 71, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 457.3000183105469, + "epoch": 0.0013684310557825715, + "grad_norm": 3.8644206596649666, + "kl": 0.0150146484375, + "learning_rate": 9.999953795430456e-07, + "loss": 0.0006, + "reward": 1.4176387786865234, + "reward_std": 0.5008097290992737, + "rewards/accuracy_reward": 0.5276389122009277, + "rewards/format_reward": 0.875, + "step": 72, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 381.5249938964844, + "epoch": 0.0013874370426684407, + "grad_norm": 1.8633577290688386, + "kl": 0.02197265625, + "learning_rate": 9.999952503059319e-07, + "loss": 0.0009, + "reward": 1.5517667531967163, + "reward_std": 0.2887406051158905, + "rewards/accuracy_reward": 0.4705166816711426, + "rewards/format_reward": 1.0, + "step": 73, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 408.4750061035156, + "epoch": 0.0014064430295543096, + "grad_norm": 22.293175208669798, + "kl": 0.02685546875, + "learning_rate": 9.999951192862486e-07, + "loss": 0.0011, + "reward": 1.863899827003479, + "reward_std": 0.17262814939022064, + "rewards/accuracy_reward": 0.7163999080657959, + "rewards/format_reward": 1.0, + "step": 74, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 427.625, + "epoch": 0.0014254490164401787, + "grad_norm": 1.8972113246695448, + "kl": 0.022216796875, + "learning_rate": 9.999949864839963e-07, + "loss": 0.0009, + "reward": 1.5265105962753296, + "reward_std": 0.3422698676586151, + "rewards/accuracy_reward": 0.4840105175971985, + "rewards/format_reward": 1.0, + "step": 75, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 475.4250183105469, + "epoch": 0.0014444550033260476, + "grad_norm": 1.7901602657202653, + "kl": 0.016357421875, + "learning_rate": 9.999948518991755e-07, + "loss": 0.0007, + "reward": 1.3103300333023071, + "reward_std": 0.44092264771461487, + "rewards/accuracy_reward": 0.5003300309181213, + "rewards/format_reward": 0.824999988079071, + "step": 76, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 430.3999938964844, + "epoch": 0.0014634609902119168, + "grad_norm": 1.7444814779855429, + "kl": 0.0196533203125, + "learning_rate": 9.999947155317865e-07, + "loss": 0.0008, + "reward": 1.6164630651474, + "reward_std": 0.319307804107666, + "rewards/accuracy_reward": 0.5464630126953125, + "rewards/format_reward": 0.949999988079071, + "step": 77, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 420.1000061035156, + "epoch": 0.0014824669770977857, + "grad_norm": 1.972675507869758, + "kl": 0.027099609375, + "learning_rate": 9.999945773818298e-07, + "loss": 0.0011, + "reward": 1.7337499856948853, + "reward_std": 0.47605353593826294, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 78, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 406.2250061035156, + "epoch": 0.0015014729639836548, + "grad_norm": 2.177804655688531, + "kl": 0.0267333984375, + "learning_rate": 9.99994437449306e-07, + "loss": 0.0011, + "reward": 1.8521394729614258, + "reward_std": 0.324115127325058, + "rewards/accuracy_reward": 0.6583895087242126, + "rewards/format_reward": 0.949999988079071, + "step": 79, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 299.9750061035156, + "epoch": 0.001520478950869524, + "grad_norm": 2.7777346773266824, + "kl": 0.02880859375, + "learning_rate": 9.999942957342157e-07, + "loss": 0.0012, + "reward": 1.299134612083435, + "reward_std": 0.29816532135009766, + "rewards/accuracy_reward": 0.26538464426994324, + "rewards/format_reward": 1.0, + "step": 80, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 410.6499938964844, + "epoch": 0.001539484937755393, + "grad_norm": 1.7769340762731953, + "kl": 0.0306396484375, + "learning_rate": 9.999941522365595e-07, + "loss": 0.0012, + "reward": 1.3981298208236694, + "reward_std": 0.22073152661323547, + "rewards/accuracy_reward": 0.4568799138069153, + "rewards/format_reward": 0.824999988079071, + "step": 81, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 409.6750183105469, + "epoch": 0.001558490924641262, + "grad_norm": 1.5432801343755391, + "kl": 0.0194091796875, + "learning_rate": 9.999940069563375e-07, + "loss": 0.0008, + "reward": 1.4534090757369995, + "reward_std": 0.09737285226583481, + "rewards/accuracy_reward": 0.4284090995788574, + "rewards/format_reward": 1.0, + "step": 82, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 420.7749938964844, + "epoch": 0.001577496911527131, + "grad_norm": 3.0572783343126035, + "kl": 0.0218505859375, + "learning_rate": 9.999938598935503e-07, + "loss": 0.0009, + "reward": 1.3602596521377563, + "reward_std": 0.40853962302207947, + "rewards/accuracy_reward": 0.5077596306800842, + "rewards/format_reward": 0.824999988079071, + "step": 83, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 402.6000061035156, + "epoch": 0.001596502898413, + "grad_norm": 2.17268441438714, + "kl": 0.0322265625, + "learning_rate": 9.999937110481986e-07, + "loss": 0.0013, + "reward": 1.5801323652267456, + "reward_std": 0.20872293412685394, + "rewards/accuracy_reward": 0.7001323103904724, + "rewards/format_reward": 0.824999988079071, + "step": 84, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 393.1750183105469, + "epoch": 0.0016155088852988692, + "grad_norm": 2.251746293822135, + "kl": 0.0308837890625, + "learning_rate": 9.99993560420283e-07, + "loss": 0.0012, + "reward": 2.0899999141693115, + "reward_std": 0.14470690488815308, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 85, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 348.6750183105469, + "epoch": 0.0016345148721847382, + "grad_norm": 4.678195652996588, + "kl": 0.03564453125, + "learning_rate": 9.999934080098037e-07, + "loss": 0.0014, + "reward": 1.906760811805725, + "reward_std": 0.17761114239692688, + "rewards/accuracy_reward": 0.7080109119415283, + "rewards/format_reward": 1.0, + "step": 86, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 388.4750061035156, + "epoch": 0.0016535208590706073, + "grad_norm": 5.109596701914393, + "kl": 0.022216796875, + "learning_rate": 9.999932538167616e-07, + "loss": 0.0009, + "reward": 1.5554808378219604, + "reward_std": 0.19344615936279297, + "rewards/accuracy_reward": 0.4942307472229004, + "rewards/format_reward": 1.0, + "step": 87, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 455.7749938964844, + "epoch": 0.0016725268459564762, + "grad_norm": 1.7715382719224089, + "kl": 0.0238037109375, + "learning_rate": 9.999930978411573e-07, + "loss": 0.0009, + "reward": 1.2448269128799438, + "reward_std": 0.3760250210762024, + "rewards/accuracy_reward": 0.34857693314552307, + "rewards/format_reward": 0.8500000238418579, + "step": 88, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 417.8999938964844, + "epoch": 0.0016915328328423454, + "grad_norm": 2.718023008251072, + "kl": 0.028564453125, + "learning_rate": 9.99992940082991e-07, + "loss": 0.0011, + "reward": 1.9399998188018799, + "reward_std": 0.3909613788127899, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 0.9000000357627869, + "step": 89, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 406.9750061035156, + "epoch": 0.0017105388197282143, + "grad_norm": 1.3912795718135529, + "kl": 0.0234375, + "learning_rate": 9.999927805422633e-07, + "loss": 0.0009, + "reward": 1.7662500143051147, + "reward_std": 0.19376306235790253, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 90, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 421.875, + "epoch": 0.0017295448066140834, + "grad_norm": 1.6675486147647514, + "kl": 0.0201416015625, + "learning_rate": 9.99992619218975e-07, + "loss": 0.0008, + "reward": 1.8392499685287476, + "reward_std": 0.2976999282836914, + "rewards/accuracy_reward": 0.7480000853538513, + "rewards/format_reward": 1.0, + "step": 91, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 422.57501220703125, + "epoch": 0.0017485507934999526, + "grad_norm": 2.3307118152368393, + "kl": 0.0225830078125, + "learning_rate": 9.999924561131264e-07, + "loss": 0.0009, + "reward": 1.7600001096725464, + "reward_std": 0.3509061932563782, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 92, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 495.375, + "epoch": 0.0017675567803858215, + "grad_norm": 3.649647628545239, + "kl": 0.0205078125, + "learning_rate": 9.999922912247185e-07, + "loss": 0.0008, + "reward": 1.1277230978012085, + "reward_std": 0.5832155346870422, + "rewards/accuracy_reward": 0.308973103761673, + "rewards/format_reward": 0.824999988079071, + "step": 93, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 470.9750061035156, + "epoch": 0.0017865627672716906, + "grad_norm": 2.4778149390422177, + "kl": 0.0206298828125, + "learning_rate": 9.999921245537516e-07, + "loss": 0.0008, + "reward": 1.4761621952056885, + "reward_std": 0.29485660791397095, + "rewards/accuracy_reward": 0.5574120879173279, + "rewards/format_reward": 0.875, + "step": 94, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 385.8500061035156, + "epoch": 0.0018055687541575596, + "grad_norm": 2.01354489856464, + "kl": 0.0299072265625, + "learning_rate": 9.999919561002262e-07, + "loss": 0.0012, + "reward": 1.6849998235702515, + "reward_std": 0.3496052920818329, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 1.0, + "step": 95, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 423.57501220703125, + "epoch": 0.0018245747410434287, + "grad_norm": 1.6843216852448375, + "kl": 0.0194091796875, + "learning_rate": 9.999917858641431e-07, + "loss": 0.0008, + "reward": 1.464853048324585, + "reward_std": 0.305313378572464, + "rewards/accuracy_reward": 0.5086029767990112, + "rewards/format_reward": 0.925000011920929, + "step": 96, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 370.3500061035156, + "epoch": 0.0018435807279292978, + "grad_norm": 1.7691210684176437, + "kl": 0.029296875, + "learning_rate": 9.999916138455027e-07, + "loss": 0.0012, + "reward": 1.7887500524520874, + "reward_std": 0.20897598564624786, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 97, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 395.4250183105469, + "epoch": 0.0018625867148151668, + "grad_norm": 1.6729182059362162, + "kl": 0.0311279296875, + "learning_rate": 9.99991440044306e-07, + "loss": 0.0012, + "reward": 1.3479642868041992, + "reward_std": 0.25888824462890625, + "rewards/accuracy_reward": 0.3267143666744232, + "rewards/format_reward": 0.9750000238418579, + "step": 98, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 368.9750061035156, + "epoch": 0.001881592701701036, + "grad_norm": 1.6641565139502756, + "kl": 0.03564453125, + "learning_rate": 9.999912644605532e-07, + "loss": 0.0014, + "reward": 1.9850000143051147, + "reward_std": 0.5545549392700195, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 99, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 386.6499938964844, + "epoch": 0.0019005986885869048, + "grad_norm": 1.661428182116276, + "kl": 0.029052734375, + "learning_rate": 9.999910870942452e-07, + "loss": 0.0012, + "reward": 1.962499976158142, + "reward_std": 0.37154579162597656, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 100, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 383.3500061035156, + "epoch": 0.001919604675472774, + "grad_norm": 1.9380789848090383, + "kl": 0.03515625, + "learning_rate": 9.999909079453825e-07, + "loss": 0.0014, + "reward": 1.7787500619888306, + "reward_std": 0.37119224667549133, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 101, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 365.0, + "epoch": 0.0019386106623586429, + "grad_norm": 1.8562862398595406, + "kl": 0.031005859375, + "learning_rate": 9.999907270139655e-07, + "loss": 0.0012, + "reward": 1.8941665887832642, + "reward_std": 0.15514008700847626, + "rewards/accuracy_reward": 0.7266666293144226, + "rewards/format_reward": 1.0, + "step": 102, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 416.375, + "epoch": 0.001957616649244512, + "grad_norm": 1.311548261470929, + "kl": 0.0113525390625, + "learning_rate": 9.999905442999955e-07, + "loss": 0.0005, + "reward": 1.5067170858383179, + "reward_std": 0.4453655183315277, + "rewards/accuracy_reward": 0.4992171823978424, + "rewards/format_reward": 1.0, + "step": 103, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 424.4250183105469, + "epoch": 0.001976622636130381, + "grad_norm": 1.831309505314721, + "kl": 0.025390625, + "learning_rate": 9.999903598034726e-07, + "loss": 0.001, + "reward": 1.6432164907455444, + "reward_std": 0.38924336433410645, + "rewards/accuracy_reward": 0.640716552734375, + "rewards/format_reward": 0.925000011920929, + "step": 104, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 435.6000061035156, + "epoch": 0.0019956286230162503, + "grad_norm": 2.3635153149852437, + "kl": 0.0206298828125, + "learning_rate": 9.999901735243977e-07, + "loss": 0.0008, + "reward": 1.3154586553573608, + "reward_std": 0.29778310656547546, + "rewards/accuracy_reward": 0.45920857787132263, + "rewards/format_reward": 0.824999988079071, + "step": 105, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 477.5249938964844, + "epoch": 0.002014634609902119, + "grad_norm": 1.958280285952236, + "kl": 0.0211181640625, + "learning_rate": 9.999899854627713e-07, + "loss": 0.0008, + "reward": 1.4671143293380737, + "reward_std": 0.1926993429660797, + "rewards/accuracy_reward": 0.6296143531799316, + "rewards/format_reward": 0.800000011920929, + "step": 106, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 447.4250183105469, + "epoch": 0.002033640596787988, + "grad_norm": 1.870154208847281, + "kl": 0.021240234375, + "learning_rate": 9.999897956185942e-07, + "loss": 0.0008, + "reward": 1.3505624532699585, + "reward_std": 0.4228193461894989, + "rewards/accuracy_reward": 0.3843124806880951, + "rewards/format_reward": 0.925000011920929, + "step": 107, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 455.6750183105469, + "epoch": 0.002052646583673857, + "grad_norm": 1.74426244908067, + "kl": 0.0198974609375, + "learning_rate": 9.999896039918671e-07, + "loss": 0.0008, + "reward": 1.4027106761932373, + "reward_std": 0.49836865067481995, + "rewards/accuracy_reward": 0.4889605641365051, + "rewards/format_reward": 0.9000000357627869, + "step": 108, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 447.875, + "epoch": 0.0020716525705597264, + "grad_norm": 1.4266038092185571, + "kl": 0.0252685546875, + "learning_rate": 9.999894105825904e-07, + "loss": 0.001, + "reward": 1.9647228717803955, + "reward_std": 0.23401236534118652, + "rewards/accuracy_reward": 0.9209728240966797, + "rewards/format_reward": 0.9750000238418579, + "step": 109, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 440.45001220703125, + "epoch": 0.0020906585574455953, + "grad_norm": 1.6661206643921196, + "kl": 0.0244140625, + "learning_rate": 9.999892153907652e-07, + "loss": 0.001, + "reward": 1.7705137729644775, + "reward_std": 0.24715518951416016, + "rewards/accuracy_reward": 0.7055138349533081, + "rewards/format_reward": 0.949999988079071, + "step": 110, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 406.4750061035156, + "epoch": 0.0021096645443314643, + "grad_norm": 2.858453558713371, + "kl": 0.033935546875, + "learning_rate": 9.99989018416392e-07, + "loss": 0.0014, + "reward": 1.6805261373519897, + "reward_std": 0.35290464758872986, + "rewards/accuracy_reward": 0.5055261850357056, + "rewards/format_reward": 1.0, + "step": 111, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 438.375, + "epoch": 0.0021286705312173336, + "grad_norm": 1.7454017233232972, + "kl": 0.033935546875, + "learning_rate": 9.999888196594714e-07, + "loss": 0.0014, + "reward": 1.8360843658447266, + "reward_std": 0.2726055085659027, + "rewards/accuracy_reward": 0.7735845446586609, + "rewards/format_reward": 0.875, + "step": 112, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 353.0, + "epoch": 0.0021476765181032025, + "grad_norm": 2.4722137217847258, + "kl": 0.03466796875, + "learning_rate": 9.999886191200043e-07, + "loss": 0.0014, + "reward": 1.5362499952316284, + "reward_std": 0.3735165297985077, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 1.0, + "step": 113, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 521.25, + "epoch": 0.0021666825049890715, + "grad_norm": 1.6236430715967438, + "kl": 0.02197265625, + "learning_rate": 9.999884167979913e-07, + "loss": 0.0009, + "reward": 1.2192209959030151, + "reward_std": 0.41181594133377075, + "rewards/accuracy_reward": 0.4542209804058075, + "rewards/format_reward": 0.7250000238418579, + "step": 114, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 357.57501220703125, + "epoch": 0.002185688491874941, + "grad_norm": 2.284632892835743, + "kl": 0.041015625, + "learning_rate": 9.999882126934332e-07, + "loss": 0.0016, + "reward": 2.074758291244507, + "reward_std": 0.14666441082954407, + "rewards/accuracy_reward": 0.8460081219673157, + "rewards/format_reward": 1.0, + "step": 115, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 389.20001220703125, + "epoch": 0.0022046944787608097, + "grad_norm": 3.16351175778279, + "kl": 0.024169921875, + "learning_rate": 9.999880068063305e-07, + "loss": 0.001, + "reward": 1.2041759490966797, + "reward_std": 0.24465619027614594, + "rewards/accuracy_reward": 0.2766759991645813, + "rewards/format_reward": 0.925000011920929, + "step": 116, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 391.5500183105469, + "epoch": 0.0022237004656466787, + "grad_norm": 2.0304763663719276, + "kl": 0.03369140625, + "learning_rate": 9.999877991366843e-07, + "loss": 0.0014, + "reward": 1.879212737083435, + "reward_std": 0.2156396210193634, + "rewards/accuracy_reward": 0.6679627895355225, + "rewards/format_reward": 1.0, + "step": 117, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 465.2250061035156, + "epoch": 0.0022427064525325476, + "grad_norm": 1.6546801011563235, + "kl": 0.0203857421875, + "learning_rate": 9.99987589684495e-07, + "loss": 0.0008, + "reward": 1.3929481506347656, + "reward_std": 0.28699302673339844, + "rewards/accuracy_reward": 0.5216981768608093, + "rewards/format_reward": 0.8500000238418579, + "step": 118, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 405.7749938964844, + "epoch": 0.002261712439418417, + "grad_norm": 1.7681545304480941, + "kl": 0.0306396484375, + "learning_rate": 9.999873784497636e-07, + "loss": 0.0012, + "reward": 1.6875, + "reward_std": 0.29129740595817566, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 119, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 517.0, + "epoch": 0.002280718426304286, + "grad_norm": 2.9999589200376198, + "kl": 0.02294921875, + "learning_rate": 9.999871654324907e-07, + "loss": 0.0009, + "reward": 1.3318989276885986, + "reward_std": 0.5621644854545593, + "rewards/accuracy_reward": 0.4856489598751068, + "rewards/format_reward": 0.7750000357627869, + "step": 120, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 408.57501220703125, + "epoch": 0.002299724413190155, + "grad_norm": 1.757897593859272, + "kl": 0.0361328125, + "learning_rate": 9.999869506326773e-07, + "loss": 0.0014, + "reward": 1.6224998235702515, + "reward_std": 0.32296404242515564, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 121, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 361.375, + "epoch": 0.002318730400076024, + "grad_norm": 1.8040697104095194, + "kl": 0.030517578125, + "learning_rate": 9.999867340503238e-07, + "loss": 0.0012, + "reward": 1.5762500762939453, + "reward_std": 0.2644350230693817, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 122, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 394.5249938964844, + "epoch": 0.002337736386961893, + "grad_norm": 2.125098480736845, + "kl": 0.04052734375, + "learning_rate": 9.999865156854312e-07, + "loss": 0.0016, + "reward": 1.9887498617172241, + "reward_std": 0.09045388549566269, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 123, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 372.9250183105469, + "epoch": 0.002356742373847762, + "grad_norm": 3.6327000620063026, + "kl": 0.023681640625, + "learning_rate": 9.99986295538e-07, + "loss": 0.001, + "reward": 1.830775499343872, + "reward_std": 0.08539687097072601, + "rewards/accuracy_reward": 0.785775363445282, + "rewards/format_reward": 1.0, + "step": 124, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 409.4750061035156, + "epoch": 0.002375748360733631, + "grad_norm": 1.7642240926650516, + "kl": 0.033935546875, + "learning_rate": 9.999860736080315e-07, + "loss": 0.0014, + "reward": 1.7674999237060547, + "reward_std": 0.4329659938812256, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 125, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 439.20001220703125, + "epoch": 0.0023947543476195003, + "grad_norm": 1.5360351515377255, + "kl": 0.026123046875, + "learning_rate": 9.999858498955262e-07, + "loss": 0.001, + "reward": 1.462499976158142, + "reward_std": 0.2535651624202728, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 0.949999988079071, + "step": 126, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 442.1000061035156, + "epoch": 0.002413760334505369, + "grad_norm": 2.3843788079336665, + "kl": 0.02587890625, + "learning_rate": 9.999856244004847e-07, + "loss": 0.001, + "reward": 1.4300276041030884, + "reward_std": 0.3910476267337799, + "rewards/accuracy_reward": 0.47002753615379333, + "rewards/format_reward": 0.875, + "step": 127, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 401.8999938964844, + "epoch": 0.002432766321391238, + "grad_norm": 2.056022106306336, + "kl": 0.0286865234375, + "learning_rate": 9.999853971229081e-07, + "loss": 0.0011, + "reward": 1.4300159215927124, + "reward_std": 0.25307053327560425, + "rewards/accuracy_reward": 0.3525159955024719, + "rewards/format_reward": 1.0, + "step": 128, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 430.5500183105469, + "epoch": 0.0024517723082771075, + "grad_norm": 1.198429674553318, + "kl": 0.02734375, + "learning_rate": 9.999851680627973e-07, + "loss": 0.0011, + "reward": 1.4500000476837158, + "reward_std": 0.4522148072719574, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.925000011920929, + "step": 129, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 347.6750183105469, + "epoch": 0.0024707782951629764, + "grad_norm": 1.9389868981099811, + "kl": 0.032958984375, + "learning_rate": 9.999849372201528e-07, + "loss": 0.0013, + "reward": 1.8525002002716064, + "reward_std": 0.2627958655357361, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 130, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 405.5249938964844, + "epoch": 0.0024897842820488453, + "grad_norm": 1.875951052181491, + "kl": 0.0303955078125, + "learning_rate": 9.999847045949754e-07, + "loss": 0.0012, + "reward": 1.8334811925888062, + "reward_std": 0.22262628376483917, + "rewards/accuracy_reward": 0.7034812569618225, + "rewards/format_reward": 1.0, + "step": 131, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 443.95001220703125, + "epoch": 0.0025087902689347142, + "grad_norm": 1.6541187018826684, + "kl": 0.02099609375, + "learning_rate": 9.999844701872662e-07, + "loss": 0.0008, + "reward": 1.7012500762939453, + "reward_std": 0.3903408646583557, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 132, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 431.8999938964844, + "epoch": 0.0025277962558205836, + "grad_norm": 2.2437243545249372, + "kl": 0.024658203125, + "learning_rate": 9.99984233997026e-07, + "loss": 0.001, + "reward": 1.2993817329406738, + "reward_std": 0.41282787919044495, + "rewards/accuracy_reward": 0.3556317985057831, + "rewards/format_reward": 0.9000000357627869, + "step": 133, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 421.95001220703125, + "epoch": 0.0025468022427064525, + "grad_norm": 1.9312409773493668, + "kl": 0.03759765625, + "learning_rate": 9.999839960242553e-07, + "loss": 0.0015, + "reward": 2.2100000381469727, + "reward_std": 0.17241929471492767, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 134, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 375.3000183105469, + "epoch": 0.0025658082295923214, + "grad_norm": 1.7486348360473247, + "kl": 0.03955078125, + "learning_rate": 9.999837562689555e-07, + "loss": 0.0016, + "reward": 2.102916717529297, + "reward_std": 0.26096248626708984, + "rewards/accuracy_reward": 0.8466667532920837, + "rewards/format_reward": 1.0, + "step": 135, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 469.70001220703125, + "epoch": 0.002584814216478191, + "grad_norm": 1.3447144107629372, + "kl": 0.020751953125, + "learning_rate": 9.999835147311272e-07, + "loss": 0.0008, + "reward": 1.6257904767990112, + "reward_std": 0.25289788842201233, + "rewards/accuracy_reward": 0.6632905006408691, + "rewards/format_reward": 0.925000011920929, + "step": 136, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 442.1750183105469, + "epoch": 0.0026038202033640597, + "grad_norm": 1.5618984064917965, + "kl": 0.0252685546875, + "learning_rate": 9.99983271410771e-07, + "loss": 0.001, + "reward": 1.5629686117172241, + "reward_std": 0.4603721797466278, + "rewards/accuracy_reward": 0.5992185473442078, + "rewards/format_reward": 0.925000011920929, + "step": 137, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 416.9250183105469, + "epoch": 0.0026228261902499286, + "grad_norm": 2.1516783669296142, + "kl": 0.03369140625, + "learning_rate": 9.99983026307888e-07, + "loss": 0.0013, + "reward": 1.840000033378601, + "reward_std": 0.12689465284347534, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 0.8500000238418579, + "step": 138, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 387.6000061035156, + "epoch": 0.002641832177135798, + "grad_norm": 1.594655353605103, + "kl": 0.0439453125, + "learning_rate": 9.999827794224791e-07, + "loss": 0.0018, + "reward": 1.7712500095367432, + "reward_std": 0.4626477360725403, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 139, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 391.57501220703125, + "epoch": 0.002660838164021667, + "grad_norm": 1.8211123826179678, + "kl": 0.0322265625, + "learning_rate": 9.999825307545453e-07, + "loss": 0.0013, + "reward": 1.7787498235702515, + "reward_std": 0.08908182382583618, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 140, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 453.875, + "epoch": 0.002679844150907536, + "grad_norm": 1.4377069826682358, + "kl": 0.03125, + "learning_rate": 9.999822803040872e-07, + "loss": 0.0013, + "reward": 1.7375000715255737, + "reward_std": 0.19416609406471252, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 141, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 455.4750061035156, + "epoch": 0.0026988501377934048, + "grad_norm": 1.5245498518911158, + "kl": 0.0294189453125, + "learning_rate": 9.99982028071106e-07, + "loss": 0.0012, + "reward": 2.0346338748931885, + "reward_std": 0.3204995095729828, + "rewards/accuracy_reward": 0.8821339011192322, + "rewards/format_reward": 0.9750000238418579, + "step": 142, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 396.5, + "epoch": 0.002717856124679274, + "grad_norm": 1.488686670355777, + "kl": 0.0311279296875, + "learning_rate": 9.999817740556023e-07, + "loss": 0.0012, + "reward": 1.7937500476837158, + "reward_std": 0.15125273168087006, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 143, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 471.75, + "epoch": 0.002736862111565143, + "grad_norm": 1.8671895767371864, + "kl": 0.0284423828125, + "learning_rate": 9.99981518257577e-07, + "loss": 0.0011, + "reward": 1.7274999618530273, + "reward_std": 0.3801653981208801, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 0.949999988079071, + "step": 144, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 434.6750183105469, + "epoch": 0.002755868098451012, + "grad_norm": 1.421053342895394, + "kl": 0.02587890625, + "learning_rate": 9.999812606770313e-07, + "loss": 0.001, + "reward": 1.4275000095367432, + "reward_std": 0.2674916982650757, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.824999988079071, + "step": 145, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 424.875, + "epoch": 0.0027748740853368813, + "grad_norm": 1.4913815773458112, + "kl": 0.0303955078125, + "learning_rate": 9.99981001313966e-07, + "loss": 0.0012, + "reward": 1.5325000286102295, + "reward_std": 0.25276699662208557, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 0.949999988079071, + "step": 146, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 461.2250061035156, + "epoch": 0.0027938800722227502, + "grad_norm": 1.7956360796966173, + "kl": 0.0201416015625, + "learning_rate": 9.999807401683819e-07, + "loss": 0.0008, + "reward": 1.584999918937683, + "reward_std": 0.3503597676753998, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 0.949999988079071, + "step": 147, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 427.6750183105469, + "epoch": 0.002812886059108619, + "grad_norm": 2.365073143548544, + "kl": 0.03857421875, + "learning_rate": 9.9998047724028e-07, + "loss": 0.0015, + "reward": 1.8576242923736572, + "reward_std": 0.1773967295885086, + "rewards/accuracy_reward": 0.8163743019104004, + "rewards/format_reward": 0.9000000357627869, + "step": 148, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 413.2250061035156, + "epoch": 0.002831892045994488, + "grad_norm": 1.7252276542579688, + "kl": 0.0289306640625, + "learning_rate": 9.999802125296613e-07, + "loss": 0.0012, + "reward": 1.8983334302902222, + "reward_std": 0.11522179841995239, + "rewards/accuracy_reward": 0.8183333277702332, + "rewards/format_reward": 1.0, + "step": 149, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 403.75, + "epoch": 0.0028508980328803574, + "grad_norm": 2.0589334279901808, + "kl": 0.044677734375, + "learning_rate": 9.999799460365267e-07, + "loss": 0.0018, + "reward": 1.9924999475479126, + "reward_std": 0.40971073508262634, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 150, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 447.32501220703125, + "epoch": 0.0028699040197662264, + "grad_norm": 1.6391078667422532, + "kl": 0.03466796875, + "learning_rate": 9.99979677760877e-07, + "loss": 0.0014, + "reward": 1.424102783203125, + "reward_std": 0.43598437309265137, + "rewards/accuracy_reward": 0.5078528523445129, + "rewards/format_reward": 0.925000011920929, + "step": 151, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 425.4250183105469, + "epoch": 0.0028889100066520953, + "grad_norm": 9.396291486368417, + "kl": 0.0289306640625, + "learning_rate": 9.999794077027135e-07, + "loss": 0.0012, + "reward": 1.6387499570846558, + "reward_std": 0.22128906846046448, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9000000357627869, + "step": 152, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 400.6750183105469, + "epoch": 0.0029079159935379646, + "grad_norm": 1.6113288987892758, + "kl": 0.03173828125, + "learning_rate": 9.99979135862037e-07, + "loss": 0.0013, + "reward": 1.7445834875106812, + "reward_std": 0.24926964938640594, + "rewards/accuracy_reward": 0.6333333253860474, + "rewards/format_reward": 1.0, + "step": 153, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 438.1750183105469, + "epoch": 0.0029269219804238336, + "grad_norm": 2.0588162325910186, + "kl": 0.0272216796875, + "learning_rate": 9.999788622388484e-07, + "loss": 0.0011, + "reward": 1.7337499856948853, + "reward_std": 0.566247820854187, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 0.949999988079071, + "step": 154, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 429.7749938964844, + "epoch": 0.0029459279673097025, + "grad_norm": 1.8617665853698921, + "kl": 0.046630859375, + "learning_rate": 9.999785868331486e-07, + "loss": 0.0019, + "reward": 2.2300000190734863, + "reward_std": 0.19540588557720184, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 155, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 418.95001220703125, + "epoch": 0.0029649339541955714, + "grad_norm": 1.719956446797739, + "kl": 0.041015625, + "learning_rate": 9.999783096449389e-07, + "loss": 0.0016, + "reward": 1.8305953741073608, + "reward_std": 0.3304825723171234, + "rewards/accuracy_reward": 0.630595326423645, + "rewards/format_reward": 0.9750000238418579, + "step": 156, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 422.70001220703125, + "epoch": 0.0029839399410814408, + "grad_norm": 1.9840915198333549, + "kl": 0.039306640625, + "learning_rate": 9.9997803067422e-07, + "loss": 0.0016, + "reward": 2.134999990463257, + "reward_std": 0.10920828580856323, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 157, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 458.8999938964844, + "epoch": 0.0030029459279673097, + "grad_norm": 1.7639869601003102, + "kl": 0.03271484375, + "learning_rate": 9.999777499209927e-07, + "loss": 0.0013, + "reward": 1.4048618078231812, + "reward_std": 0.48996439576148987, + "rewards/accuracy_reward": 0.4323618412017822, + "rewards/format_reward": 0.949999988079071, + "step": 158, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 389.7250061035156, + "epoch": 0.0030219519148531786, + "grad_norm": 2.3599515456427547, + "kl": 0.039794921875, + "learning_rate": 9.999774673852586e-07, + "loss": 0.0016, + "reward": 1.8200000524520874, + "reward_std": 0.3282524049282074, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 159, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 405.7749938964844, + "epoch": 0.003040957901739048, + "grad_norm": 2.2616609557105343, + "kl": 0.036376953125, + "learning_rate": 9.999771830670182e-07, + "loss": 0.0015, + "reward": 2.129999876022339, + "reward_std": 0.1709517389535904, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 160, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 415.7749938964844, + "epoch": 0.003059963888624917, + "grad_norm": 1.7102218764435317, + "kl": 0.04248046875, + "learning_rate": 9.999768969662727e-07, + "loss": 0.0017, + "reward": 1.5400406122207642, + "reward_std": 0.31520184874534607, + "rewards/accuracy_reward": 0.41754060983657837, + "rewards/format_reward": 1.0, + "step": 161, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 372.125, + "epoch": 0.003078969875510786, + "grad_norm": 1.8026554065712848, + "kl": 0.033447265625, + "learning_rate": 9.999766090830233e-07, + "loss": 0.0013, + "reward": 1.5858334302902222, + "reward_std": 0.22057271003723145, + "rewards/accuracy_reward": 0.5633333325386047, + "rewards/format_reward": 1.0, + "step": 162, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 392.20001220703125, + "epoch": 0.0030979758623966547, + "grad_norm": 2.011151309629896, + "kl": 0.040771484375, + "learning_rate": 9.999763194172708e-07, + "loss": 0.0016, + "reward": 2.1462502479553223, + "reward_std": 0.03894924744963646, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 163, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 375.6750183105469, + "epoch": 0.003116981849282524, + "grad_norm": 1.8918527334820754, + "kl": 0.046875, + "learning_rate": 9.999760279690162e-07, + "loss": 0.0019, + "reward": 1.9532890319824219, + "reward_std": 0.22268572449684143, + "rewards/accuracy_reward": 0.7332891821861267, + "rewards/format_reward": 1.0, + "step": 164, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 405.6499938964844, + "epoch": 0.003135987836168393, + "grad_norm": 1.3711286517136527, + "kl": 0.0289306640625, + "learning_rate": 9.999757347382606e-07, + "loss": 0.0012, + "reward": 1.7587499618530273, + "reward_std": 0.2259451448917389, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 165, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 383.07501220703125, + "epoch": 0.003154993823054262, + "grad_norm": 1.9232649231057724, + "kl": 0.04931640625, + "learning_rate": 9.99975439725005e-07, + "loss": 0.002, + "reward": 2.1524999141693115, + "reward_std": 0.1306147575378418, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 166, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 418.3500061035156, + "epoch": 0.0031739998099401313, + "grad_norm": 1.861616322884654, + "kl": 0.035888671875, + "learning_rate": 9.999751429292506e-07, + "loss": 0.0014, + "reward": 1.2532682418823242, + "reward_std": 0.3139711022377014, + "rewards/accuracy_reward": 0.3482682406902313, + "rewards/format_reward": 0.925000011920929, + "step": 167, + "temporal_rewards": 0.29999998211860657 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 473.3999938964844, + "epoch": 0.003193005796826, + "grad_norm": 2.1140474531654054, + "kl": 0.0322265625, + "learning_rate": 9.999748443509986e-07, + "loss": 0.0013, + "reward": 1.4529370069503784, + "reward_std": 0.4296736419200897, + "rewards/accuracy_reward": 0.5854371190071106, + "rewards/format_reward": 0.7750000357627869, + "step": 168, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 426.6499938964844, + "epoch": 0.003212011783711869, + "grad_norm": 2.081056245713128, + "kl": 0.045166015625, + "learning_rate": 9.999745439902495e-07, + "loss": 0.0018, + "reward": 1.66265070438385, + "reward_std": 0.11052091419696808, + "rewards/accuracy_reward": 0.590150773525238, + "rewards/format_reward": 1.0, + "step": 169, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 391.0500183105469, + "epoch": 0.0032310177705977385, + "grad_norm": 1.557973829500598, + "kl": 0.03173828125, + "learning_rate": 9.99974241847005e-07, + "loss": 0.0013, + "reward": 2.132500171661377, + "reward_std": 0.1289599984884262, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 170, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 397.125, + "epoch": 0.0032500237574836074, + "grad_norm": 1.8045856773772166, + "kl": 0.0225830078125, + "learning_rate": 9.999739379212658e-07, + "loss": 0.0009, + "reward": 1.7682842016220093, + "reward_std": 0.18975917994976044, + "rewards/accuracy_reward": 0.7132843136787415, + "rewards/format_reward": 1.0, + "step": 171, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 480.7749938964844, + "epoch": 0.0032690297443694763, + "grad_norm": 1.9547616570315738, + "kl": 0.03466796875, + "learning_rate": 9.999736322130328e-07, + "loss": 0.0014, + "reward": 1.6181875467300415, + "reward_std": 0.32576438784599304, + "rewards/accuracy_reward": 0.5969375371932983, + "rewards/format_reward": 0.9000000357627869, + "step": 172, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 460.3999938964844, + "epoch": 0.0032880357312553453, + "grad_norm": 1.334266052271473, + "kl": 0.03125, + "learning_rate": 9.999733247223077e-07, + "loss": 0.0013, + "reward": 0.9670197367668152, + "reward_std": 0.39288732409477234, + "rewards/accuracy_reward": 0.1620197594165802, + "rewards/format_reward": 0.8500000238418579, + "step": 173, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 486.2749938964844, + "epoch": 0.0033070417181412146, + "grad_norm": 2.0305263629651344, + "kl": 0.0286865234375, + "learning_rate": 9.999730154490912e-07, + "loss": 0.0011, + "reward": 1.2086223363876343, + "reward_std": 0.40833497047424316, + "rewards/accuracy_reward": 0.3061222732067108, + "rewards/format_reward": 0.925000011920929, + "step": 174, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 492.625, + "epoch": 0.0033260477050270835, + "grad_norm": 2.170161481168517, + "kl": 0.0286865234375, + "learning_rate": 9.999727043933842e-07, + "loss": 0.0011, + "reward": 1.491103172302246, + "reward_std": 0.39911141991615295, + "rewards/accuracy_reward": 0.4773530960083008, + "rewards/format_reward": 0.9000000357627869, + "step": 175, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 464.3000183105469, + "epoch": 0.0033450536919129525, + "grad_norm": 1.6902429526032825, + "kl": 0.03173828125, + "learning_rate": 9.999723915551882e-07, + "loss": 0.0013, + "reward": 1.4843275547027588, + "reward_std": 0.3326936662197113, + "rewards/accuracy_reward": 0.5693275332450867, + "rewards/format_reward": 0.824999988079071, + "step": 176, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 436.2749938964844, + "epoch": 0.003364059678798822, + "grad_norm": 1.7250359682167837, + "kl": 0.041748046875, + "learning_rate": 9.999720769345044e-07, + "loss": 0.0017, + "reward": 1.776833415031433, + "reward_std": 0.2125244438648224, + "rewards/accuracy_reward": 0.6368333697319031, + "rewards/format_reward": 1.0, + "step": 177, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 490.0249938964844, + "epoch": 0.0033830656656846907, + "grad_norm": 1.278385568657809, + "kl": 0.030517578125, + "learning_rate": 9.999717605313334e-07, + "loss": 0.0012, + "reward": 1.463749885559082, + "reward_std": 0.4122951626777649, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 0.800000011920929, + "step": 178, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 435.82501220703125, + "epoch": 0.0034020716525705597, + "grad_norm": 3.7786056142158233, + "kl": 0.0267333984375, + "learning_rate": 9.999714423456768e-07, + "loss": 0.0011, + "reward": 1.6909410953521729, + "reward_std": 0.21448658406734467, + "rewards/accuracy_reward": 0.7021910548210144, + "rewards/format_reward": 0.8500000238418579, + "step": 179, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 444.2250061035156, + "epoch": 0.0034210776394564286, + "grad_norm": 8.131646771139263, + "kl": 0.044677734375, + "learning_rate": 9.999711223775355e-07, + "loss": 0.0018, + "reward": 1.7312500476837158, + "reward_std": 0.34000539779663086, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 0.875, + "step": 180, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 399.8999938964844, + "epoch": 0.003440083626342298, + "grad_norm": 2.2282777402698746, + "kl": 0.046142578125, + "learning_rate": 9.999708006269108e-07, + "loss": 0.0018, + "reward": 1.5961250066757202, + "reward_std": 0.3732485771179199, + "rewards/accuracy_reward": 0.48487502336502075, + "rewards/format_reward": 1.0, + "step": 181, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 422.6499938964844, + "epoch": 0.003459089613228167, + "grad_norm": 1.643446544121657, + "kl": 0.031494140625, + "learning_rate": 9.999704770938035e-07, + "loss": 0.0013, + "reward": 1.6674998998641968, + "reward_std": 0.24944470822811127, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 182, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 395.8000183105469, + "epoch": 0.0034780956001140358, + "grad_norm": 2.3874501163374577, + "kl": 0.03369140625, + "learning_rate": 9.999701517782153e-07, + "loss": 0.0013, + "reward": 1.4371393918991089, + "reward_std": 0.4773987829685211, + "rewards/accuracy_reward": 0.38963934779167175, + "rewards/format_reward": 1.0, + "step": 183, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 415.125, + "epoch": 0.003497101586999905, + "grad_norm": 4.498598361526847, + "kl": 0.047607421875, + "learning_rate": 9.99969824680147e-07, + "loss": 0.0019, + "reward": 1.6487499475479126, + "reward_std": 0.31695321202278137, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 184, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 433.57501220703125, + "epoch": 0.003516107573885774, + "grad_norm": 1.6196430695974566, + "kl": 0.023681640625, + "learning_rate": 9.999694957995997e-07, + "loss": 0.0009, + "reward": 1.499170184135437, + "reward_std": 0.36921173334121704, + "rewards/accuracy_reward": 0.5179200172424316, + "rewards/format_reward": 0.9750000238418579, + "step": 185, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 445.3500061035156, + "epoch": 0.003535113560771643, + "grad_norm": 1.6563067051311027, + "kl": 0.0302734375, + "learning_rate": 9.99969165136575e-07, + "loss": 0.0012, + "reward": 1.5774999856948853, + "reward_std": 0.18491069972515106, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 0.9750000238418579, + "step": 186, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 408.2749938964844, + "epoch": 0.003554119547657512, + "grad_norm": 1.6860997944834948, + "kl": 0.0264892578125, + "learning_rate": 9.999688326910734e-07, + "loss": 0.0011, + "reward": 1.8612499237060547, + "reward_std": 0.38037070631980896, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9750000238418579, + "step": 187, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 417.7250061035156, + "epoch": 0.0035731255345433813, + "grad_norm": 1.549050333294932, + "kl": 0.03369140625, + "learning_rate": 9.999684984630967e-07, + "loss": 0.0013, + "reward": 1.4499999284744263, + "reward_std": 0.2958005368709564, + "rewards/accuracy_reward": 0.45000001788139343, + "rewards/format_reward": 1.0, + "step": 188, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 398.45001220703125, + "epoch": 0.00359213152142925, + "grad_norm": 2.4893560926809686, + "kl": 0.03857421875, + "learning_rate": 9.99968162452646e-07, + "loss": 0.0015, + "reward": 1.869797945022583, + "reward_std": 0.31480318307876587, + "rewards/accuracy_reward": 0.7172979712486267, + "rewards/format_reward": 0.949999988079071, + "step": 189, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 339.125, + "epoch": 0.003611137508315119, + "grad_norm": 4.690755256141441, + "kl": 0.0279541015625, + "learning_rate": 9.999678246597221e-07, + "loss": 0.0011, + "reward": 1.8263648748397827, + "reward_std": 0.08638795465230942, + "rewards/accuracy_reward": 0.7126147150993347, + "rewards/format_reward": 1.0, + "step": 190, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 360.1000061035156, + "epoch": 0.0036301434952009885, + "grad_norm": 2.0870351156232627, + "kl": 0.035888671875, + "learning_rate": 9.999674850843264e-07, + "loss": 0.0014, + "reward": 1.9257738590240479, + "reward_std": 0.3620302975177765, + "rewards/accuracy_reward": 0.7782737612724304, + "rewards/format_reward": 1.0, + "step": 191, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 409.1750183105469, + "epoch": 0.0036491494820868574, + "grad_norm": 1.724990929171834, + "kl": 0.043212890625, + "learning_rate": 9.999671437264604e-07, + "loss": 0.0017, + "reward": 1.4066804647445679, + "reward_std": 0.23692777752876282, + "rewards/accuracy_reward": 0.4429304599761963, + "rewards/format_reward": 0.8500000238418579, + "step": 192, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 440.1499938964844, + "epoch": 0.0036681554689727263, + "grad_norm": 1.8795830935742788, + "kl": 0.037841796875, + "learning_rate": 9.99966800586125e-07, + "loss": 0.0015, + "reward": 1.964198112487793, + "reward_std": 0.2989325225353241, + "rewards/accuracy_reward": 0.8041982650756836, + "rewards/format_reward": 0.925000011920929, + "step": 193, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 362.4750061035156, + "epoch": 0.0036871614558585957, + "grad_norm": 14.906225680594083, + "kl": 0.03564453125, + "learning_rate": 9.999664556633216e-07, + "loss": 0.0014, + "reward": 1.687585473060608, + "reward_std": 0.1823313981294632, + "rewards/accuracy_reward": 0.5925855040550232, + "rewards/format_reward": 1.0, + "step": 194, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 433.875, + "epoch": 0.0037061674427444646, + "grad_norm": 1.8383321669606432, + "kl": 0.029296875, + "learning_rate": 9.999661089580513e-07, + "loss": 0.0012, + "reward": 1.6870087385177612, + "reward_std": 0.3377555012702942, + "rewards/accuracy_reward": 0.6270086765289307, + "rewards/format_reward": 0.9750000238418579, + "step": 195, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 444.4750061035156, + "epoch": 0.0037251734296303335, + "grad_norm": 1.5536260862831237, + "kl": 0.02880859375, + "learning_rate": 9.999657604703153e-07, + "loss": 0.0012, + "reward": 1.7862499952316284, + "reward_std": 0.2919498085975647, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.949999988079071, + "step": 196, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 463.1499938964844, + "epoch": 0.0037441794165162024, + "grad_norm": 1.9298679368934535, + "kl": 0.0272216796875, + "learning_rate": 9.99965410200115e-07, + "loss": 0.0011, + "reward": 1.9523206949234009, + "reward_std": 0.20473122596740723, + "rewards/accuracy_reward": 0.8385707139968872, + "rewards/format_reward": 0.9750000238418579, + "step": 197, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 411.4750061035156, + "epoch": 0.003763185403402072, + "grad_norm": 1.6415641870464832, + "kl": 0.0303955078125, + "learning_rate": 9.999650581474515e-07, + "loss": 0.0012, + "reward": 1.9512499570846558, + "reward_std": 0.20175762474536896, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 198, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 383.1000061035156, + "epoch": 0.0037821913902879407, + "grad_norm": 1.5556281838875161, + "kl": 0.0311279296875, + "learning_rate": 9.99964704312326e-07, + "loss": 0.0012, + "reward": 1.8170557022094727, + "reward_std": 0.036341097205877304, + "rewards/accuracy_reward": 0.6770557761192322, + "rewards/format_reward": 1.0, + "step": 199, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 434.3500061035156, + "epoch": 0.0038011973771738096, + "grad_norm": 1.9466362755351019, + "kl": 0.0223388671875, + "learning_rate": 9.999643486947402e-07, + "loss": 0.0009, + "reward": 1.576269507408142, + "reward_std": 0.4818333089351654, + "rewards/accuracy_reward": 0.6600195169448853, + "rewards/format_reward": 0.8500000238418579, + "step": 200, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 401.9250183105469, + "epoch": 0.003820203364059679, + "grad_norm": 1.789056343317186, + "kl": 0.0301513671875, + "learning_rate": 9.99963991294695e-07, + "loss": 0.0012, + "reward": 1.6654579639434814, + "reward_std": 0.3330591320991516, + "rewards/accuracy_reward": 0.6129579544067383, + "rewards/format_reward": 0.9750000238418579, + "step": 201, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 433.125, + "epoch": 0.003839209350945548, + "grad_norm": 2.076335731507936, + "kl": 0.025390625, + "learning_rate": 9.999636321121916e-07, + "loss": 0.001, + "reward": 1.6747560501098633, + "reward_std": 0.22126667201519012, + "rewards/accuracy_reward": 0.646006166934967, + "rewards/format_reward": 0.949999988079071, + "step": 202, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 432.95001220703125, + "epoch": 0.003858215337831417, + "grad_norm": 1.4563151581298903, + "kl": 0.0194091796875, + "learning_rate": 9.999632711472315e-07, + "loss": 0.0008, + "reward": 1.5349998474121094, + "reward_std": 0.43499547243118286, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 0.875, + "step": 203, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 460.25, + "epoch": 0.0038772213247172858, + "grad_norm": 1.9965813028927424, + "kl": 0.023193359375, + "learning_rate": 9.999629083998157e-07, + "loss": 0.0009, + "reward": 1.3198680877685547, + "reward_std": 0.44186311960220337, + "rewards/accuracy_reward": 0.4436180591583252, + "rewards/format_reward": 0.875, + "step": 204, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 396.4750061035156, + "epoch": 0.003896227311603155, + "grad_norm": 2.177392570358997, + "kl": 0.034423828125, + "learning_rate": 9.99962543869946e-07, + "loss": 0.0014, + "reward": 1.7174999713897705, + "reward_std": 0.2532403767108917, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 205, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 415.5500183105469, + "epoch": 0.003915233298489024, + "grad_norm": 2.6421269200066275, + "kl": 0.036865234375, + "learning_rate": 9.999621775576233e-07, + "loss": 0.0015, + "reward": 1.6265392303466797, + "reward_std": 0.03958814591169357, + "rewards/accuracy_reward": 0.47903934121131897, + "rewards/format_reward": 1.0, + "step": 206, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 411.625, + "epoch": 0.003934239285374893, + "grad_norm": 1.8699388357712072, + "kl": 0.029296875, + "learning_rate": 9.999618094628489e-07, + "loss": 0.0012, + "reward": 1.4309269189834595, + "reward_std": 0.2140554040670395, + "rewards/accuracy_reward": 0.559677004814148, + "rewards/format_reward": 0.824999988079071, + "step": 207, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 391.6499938964844, + "epoch": 0.003953245272260762, + "grad_norm": 2.265716581167074, + "kl": 0.03662109375, + "learning_rate": 9.999614395856241e-07, + "loss": 0.0015, + "reward": 1.9126160144805908, + "reward_std": 0.1788838654756546, + "rewards/accuracy_reward": 0.7213660478591919, + "rewards/format_reward": 1.0, + "step": 208, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 391.20001220703125, + "epoch": 0.003972251259146631, + "grad_norm": 1.9098996418145857, + "kl": 0.0302734375, + "learning_rate": 9.999610679259507e-07, + "loss": 0.0012, + "reward": 1.8287500143051147, + "reward_std": 0.19593499600887299, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 209, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 382.82501220703125, + "epoch": 0.003991257246032501, + "grad_norm": 1.846439228788096, + "kl": 0.0556640625, + "learning_rate": 9.999606944838293e-07, + "loss": 0.0022, + "reward": 1.8587497472763062, + "reward_std": 0.163147434592247, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 210, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 362.7250061035156, + "epoch": 0.0040102632329183695, + "grad_norm": 2.0967587806330403, + "kl": 0.0361328125, + "learning_rate": 9.999603192592619e-07, + "loss": 0.0015, + "reward": 1.8357433080673218, + "reward_std": 0.17262892425060272, + "rewards/accuracy_reward": 0.6807434558868408, + "rewards/format_reward": 1.0, + "step": 211, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 398.32501220703125, + "epoch": 0.004029269219804238, + "grad_norm": 1.9610268313868995, + "kl": 0.038330078125, + "learning_rate": 9.999599422522493e-07, + "loss": 0.0015, + "reward": 1.9411624670028687, + "reward_std": 0.3060757517814636, + "rewards/accuracy_reward": 0.7724127769470215, + "rewards/format_reward": 0.9750000238418579, + "step": 212, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 342.7749938964844, + "epoch": 0.004048275206690107, + "grad_norm": 2.1030485166399076, + "kl": 0.037841796875, + "learning_rate": 9.99959563462793e-07, + "loss": 0.0015, + "reward": 1.8831208944320679, + "reward_std": 0.2587890028953552, + "rewards/accuracy_reward": 0.7806208729743958, + "rewards/format_reward": 1.0, + "step": 213, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 338.1499938964844, + "epoch": 0.004067281193575976, + "grad_norm": 1.9004382975827243, + "kl": 0.033935546875, + "learning_rate": 9.999591828908945e-07, + "loss": 0.0014, + "reward": 1.693750023841858, + "reward_std": 0.3791329860687256, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 214, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 389.7749938964844, + "epoch": 0.004086287180461845, + "grad_norm": 1.4921861618965029, + "kl": 0.03515625, + "learning_rate": 9.999588005365551e-07, + "loss": 0.0014, + "reward": 1.7537498474121094, + "reward_std": 0.27417823672294617, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 215, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 445.1750183105469, + "epoch": 0.004105293167347714, + "grad_norm": 1.9721176529898625, + "kl": 0.03564453125, + "learning_rate": 9.999584163997761e-07, + "loss": 0.0014, + "reward": 1.7214065790176392, + "reward_std": 0.22224795818328857, + "rewards/accuracy_reward": 0.6701565384864807, + "rewards/format_reward": 0.9000000357627869, + "step": 216, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 345.82501220703125, + "epoch": 0.004124299154233584, + "grad_norm": 1.6072084220040581, + "kl": 0.045654296875, + "learning_rate": 9.99958030480559e-07, + "loss": 0.0018, + "reward": 1.5512501001358032, + "reward_std": 0.1395757645368576, + "rewards/accuracy_reward": 0.45000001788139343, + "rewards/format_reward": 1.0, + "step": 217, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 402.0500183105469, + "epoch": 0.004143305141119453, + "grad_norm": 4.347914787153495, + "kl": 0.04248046875, + "learning_rate": 9.99957642778905e-07, + "loss": 0.0017, + "reward": 1.8440383672714233, + "reward_std": 0.20403912663459778, + "rewards/accuracy_reward": 0.7202884554862976, + "rewards/format_reward": 1.0, + "step": 218, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 434.4250183105469, + "epoch": 0.004162311128005322, + "grad_norm": 4.993495433573057, + "kl": 0.042236328125, + "learning_rate": 9.999572532948155e-07, + "loss": 0.0017, + "reward": 1.6324999332427979, + "reward_std": 0.5957102179527283, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 0.9750000238418579, + "step": 219, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 479.6750183105469, + "epoch": 0.004181317114891191, + "grad_norm": 1.7391347502476673, + "kl": 0.02587890625, + "learning_rate": 9.999568620282921e-07, + "loss": 0.001, + "reward": 1.6527748107910156, + "reward_std": 0.3692838251590729, + "rewards/accuracy_reward": 0.7015247344970703, + "rewards/format_reward": 0.925000011920929, + "step": 220, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 469.0500183105469, + "epoch": 0.00420032310177706, + "grad_norm": 1.6241228943350263, + "kl": 0.0255126953125, + "learning_rate": 9.999564689793361e-07, + "loss": 0.001, + "reward": 1.6034713983535767, + "reward_std": 0.17141413688659668, + "rewards/accuracy_reward": 0.5809712409973145, + "rewards/format_reward": 0.9750000238418579, + "step": 221, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 449.375, + "epoch": 0.0042193290886629285, + "grad_norm": 1.7685131823196605, + "kl": 0.0233154296875, + "learning_rate": 9.999560741479488e-07, + "loss": 0.0009, + "reward": 1.6205015182495117, + "reward_std": 0.2021985799074173, + "rewards/accuracy_reward": 0.5430015325546265, + "rewards/format_reward": 0.9750000238418579, + "step": 222, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 435.6750183105469, + "epoch": 0.0042383350755487974, + "grad_norm": 1.6842644341593895, + "kl": 0.039794921875, + "learning_rate": 9.999556775341314e-07, + "loss": 0.0016, + "reward": 1.3455833196640015, + "reward_std": 0.11606644839048386, + "rewards/accuracy_reward": 0.48433348536491394, + "rewards/format_reward": 0.800000011920929, + "step": 223, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 373.4250183105469, + "epoch": 0.004257341062434667, + "grad_norm": 10.91069205223911, + "kl": 0.033203125, + "learning_rate": 9.999552791378858e-07, + "loss": 0.0013, + "reward": 1.625, + "reward_std": 0.31769201159477234, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 224, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 415.1499938964844, + "epoch": 0.004276347049320536, + "grad_norm": 2.0543386020637833, + "kl": 0.0498046875, + "learning_rate": 9.999548789592131e-07, + "loss": 0.002, + "reward": 1.9660313129425049, + "reward_std": 0.18239329755306244, + "rewards/accuracy_reward": 0.7422811388969421, + "rewards/format_reward": 1.0, + "step": 225, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 408.7749938964844, + "epoch": 0.004295353036206405, + "grad_norm": 1.9001147314987352, + "kl": 0.05078125, + "learning_rate": 9.99954476998115e-07, + "loss": 0.002, + "reward": 1.7498661279678345, + "reward_std": 0.03813100978732109, + "rewards/accuracy_reward": 0.6148661375045776, + "rewards/format_reward": 1.0, + "step": 226, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 471.5500183105469, + "epoch": 0.004314359023092274, + "grad_norm": 2.0480415066524924, + "kl": 0.02587890625, + "learning_rate": 9.999540732545926e-07, + "loss": 0.001, + "reward": 1.2433445453643799, + "reward_std": 0.140644833445549, + "rewards/accuracy_reward": 0.2945944666862488, + "rewards/format_reward": 0.9750000238418579, + "step": 227, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 354.32501220703125, + "epoch": 0.004333365009978143, + "grad_norm": 1.443703893431704, + "kl": 0.037109375, + "learning_rate": 9.999536677286475e-07, + "loss": 0.0015, + "reward": 1.7266666889190674, + "reward_std": 0.20770369470119476, + "rewards/accuracy_reward": 0.5916666984558105, + "rewards/format_reward": 1.0, + "step": 228, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 414.1499938964844, + "epoch": 0.004352370996864012, + "grad_norm": 1.4456257892300983, + "kl": 0.041015625, + "learning_rate": 9.999532604202813e-07, + "loss": 0.0016, + "reward": 1.662500023841858, + "reward_std": 0.33464279770851135, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 229, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 473.8500061035156, + "epoch": 0.004371376983749882, + "grad_norm": 1.6367581525951993, + "kl": 0.021728515625, + "learning_rate": 9.99952851329495e-07, + "loss": 0.0009, + "reward": 1.346039056777954, + "reward_std": 0.23499193787574768, + "rewards/accuracy_reward": 0.3635389506816864, + "rewards/format_reward": 0.949999988079071, + "step": 230, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 449.6000061035156, + "epoch": 0.0043903829706357506, + "grad_norm": 1.7578070517860367, + "kl": 0.053466796875, + "learning_rate": 9.999524404562905e-07, + "loss": 0.0021, + "reward": 2.1212499141693115, + "reward_std": 0.1882200539112091, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 231, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 410.4250183105469, + "epoch": 0.0044093889575216195, + "grad_norm": 3.449030092917484, + "kl": 0.04931640625, + "learning_rate": 9.99952027800669e-07, + "loss": 0.002, + "reward": 1.7983547449111938, + "reward_std": 0.260799378156662, + "rewards/accuracy_reward": 0.7121047377586365, + "rewards/format_reward": 0.949999988079071, + "step": 232, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 455.25, + "epoch": 0.004428394944407488, + "grad_norm": 2.692992906063105, + "kl": 0.0361328125, + "learning_rate": 9.999516133626323e-07, + "loss": 0.0014, + "reward": 1.5289610624313354, + "reward_std": 0.2862740457057953, + "rewards/accuracy_reward": 0.5789610743522644, + "rewards/format_reward": 0.800000011920929, + "step": 233, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 423.4250183105469, + "epoch": 0.004447400931293357, + "grad_norm": 1.9270645096913896, + "kl": 0.030517578125, + "learning_rate": 9.999511971421815e-07, + "loss": 0.0012, + "reward": 1.8414939641952515, + "reward_std": 0.09297298640012741, + "rewards/accuracy_reward": 0.686493992805481, + "rewards/format_reward": 1.0, + "step": 234, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 462.8000183105469, + "epoch": 0.004466406918179226, + "grad_norm": 1.2439532476556123, + "kl": 0.0169677734375, + "learning_rate": 9.999507791393183e-07, + "loss": 0.0007, + "reward": 1.376250147819519, + "reward_std": 0.35701727867126465, + "rewards/accuracy_reward": 0.45000001788139343, + "rewards/format_reward": 0.949999988079071, + "step": 235, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 443.9250183105469, + "epoch": 0.004485412905065095, + "grad_norm": 4.492013529506022, + "kl": 0.03271484375, + "learning_rate": 9.99950359354044e-07, + "loss": 0.0013, + "reward": 1.83798086643219, + "reward_std": 0.21238887310028076, + "rewards/accuracy_reward": 0.7792307734489441, + "rewards/format_reward": 0.949999988079071, + "step": 236, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 364.2250061035156, + "epoch": 0.004504418891950965, + "grad_norm": 3.5382877146896314, + "kl": 0.0537109375, + "learning_rate": 9.999499377863605e-07, + "loss": 0.0021, + "reward": 2.152939558029175, + "reward_std": 0.06586786359548569, + "rewards/accuracy_reward": 0.9216896295547485, + "rewards/format_reward": 1.0, + "step": 237, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 425.5, + "epoch": 0.004523424878836834, + "grad_norm": 2.5140456355627796, + "kl": 0.039794921875, + "learning_rate": 9.999495144362688e-07, + "loss": 0.0016, + "reward": 1.8570069074630737, + "reward_std": 0.19184057414531708, + "rewards/accuracy_reward": 0.7457568049430847, + "rewards/format_reward": 1.0, + "step": 238, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 462.32501220703125, + "epoch": 0.004542430865722703, + "grad_norm": 1.7425844949320046, + "kl": 0.03173828125, + "learning_rate": 9.999490893037708e-07, + "loss": 0.0013, + "reward": 1.7546519041061401, + "reward_std": 0.3788732588291168, + "rewards/accuracy_reward": 0.6721518635749817, + "rewards/format_reward": 0.9750000238418579, + "step": 239, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 464.6499938964844, + "epoch": 0.004561436852608572, + "grad_norm": 1.870313907201393, + "kl": 0.0196533203125, + "learning_rate": 9.999486623888678e-07, + "loss": 0.0008, + "reward": 1.6782845258712769, + "reward_std": 0.30399638414382935, + "rewards/accuracy_reward": 0.7795344591140747, + "rewards/format_reward": 0.9000000357627869, + "step": 240, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 395.1000061035156, + "epoch": 0.004580442839494441, + "grad_norm": 1.7342872853087963, + "kl": 0.046630859375, + "learning_rate": 9.999482336915612e-07, + "loss": 0.0019, + "reward": 1.2387466430664062, + "reward_std": 0.13596639037132263, + "rewards/accuracy_reward": 0.2249966412782669, + "rewards/format_reward": 1.0, + "step": 241, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 411.45001220703125, + "epoch": 0.00459944882638031, + "grad_norm": 1.7048656705859881, + "kl": 0.03125, + "learning_rate": 9.99947803211853e-07, + "loss": 0.0012, + "reward": 1.830570101737976, + "reward_std": 0.28210732340812683, + "rewards/accuracy_reward": 0.7455701231956482, + "rewards/format_reward": 1.0, + "step": 242, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 438.2749938964844, + "epoch": 0.0046184548132661785, + "grad_norm": 1.7703257921397861, + "kl": 0.0291748046875, + "learning_rate": 9.999473709497444e-07, + "loss": 0.0012, + "reward": 1.7837803363800049, + "reward_std": 0.12731657922267914, + "rewards/accuracy_reward": 0.6737803220748901, + "rewards/format_reward": 0.9750000238418579, + "step": 243, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 414.5, + "epoch": 0.004637460800152048, + "grad_norm": 2.0279619453881312, + "kl": 0.03955078125, + "learning_rate": 9.999469369052368e-07, + "loss": 0.0016, + "reward": 2.035435438156128, + "reward_std": 0.17900130152702332, + "rewards/accuracy_reward": 0.7754355072975159, + "rewards/format_reward": 1.0, + "step": 244, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 392.4250183105469, + "epoch": 0.004656466787037917, + "grad_norm": 2.18718915370095, + "kl": 0.045654296875, + "learning_rate": 9.99946501078332e-07, + "loss": 0.0018, + "reward": 1.938750147819519, + "reward_std": 0.24295035004615784, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 245, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 420.9750061035156, + "epoch": 0.004675472773923786, + "grad_norm": 4.047967176597088, + "kl": 0.040283203125, + "learning_rate": 9.999460634690316e-07, + "loss": 0.0016, + "reward": 1.6499645709991455, + "reward_std": 0.2919858396053314, + "rewards/accuracy_reward": 0.538714587688446, + "rewards/format_reward": 1.0, + "step": 246, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 397.45001220703125, + "epoch": 0.004694478760809655, + "grad_norm": 1.5683094599419327, + "kl": 0.0213623046875, + "learning_rate": 9.99945624077337e-07, + "loss": 0.0009, + "reward": 1.68316650390625, + "reward_std": 0.40017709136009216, + "rewards/accuracy_reward": 0.6481666564941406, + "rewards/format_reward": 1.0, + "step": 247, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 397.95001220703125, + "epoch": 0.004713484747695524, + "grad_norm": 2.258023600731953, + "kl": 0.0289306640625, + "learning_rate": 9.999451829032496e-07, + "loss": 0.0012, + "reward": 1.6057461500167847, + "reward_std": 0.16116632521152496, + "rewards/accuracy_reward": 0.5182459950447083, + "rewards/format_reward": 0.949999988079071, + "step": 248, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 365.0, + "epoch": 0.004732490734581393, + "grad_norm": 1.5050973691188916, + "kl": 0.043212890625, + "learning_rate": 9.999447399467716e-07, + "loss": 0.0017, + "reward": 1.7637500762939453, + "reward_std": 0.23460076749324799, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 249, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 372.875, + "epoch": 0.004751496721467262, + "grad_norm": 2.943327256962536, + "kl": 0.0546875, + "learning_rate": 9.999442952079038e-07, + "loss": 0.0022, + "reward": 1.9262498617172241, + "reward_std": 0.22076304256916046, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 250, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 365.625, + "epoch": 0.004770502708353132, + "grad_norm": 2.3868015907290046, + "kl": 0.037841796875, + "learning_rate": 9.999438486866483e-07, + "loss": 0.0015, + "reward": 1.5969265699386597, + "reward_std": 0.08501073718070984, + "rewards/accuracy_reward": 0.5581764578819275, + "rewards/format_reward": 1.0, + "step": 251, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 426.1499938964844, + "epoch": 0.0047895086952390005, + "grad_norm": 2.7695669849171662, + "kl": 0.03076171875, + "learning_rate": 9.999434003830065e-07, + "loss": 0.0012, + "reward": 1.4927480220794678, + "reward_std": 0.15079958736896515, + "rewards/accuracy_reward": 0.4289979934692383, + "rewards/format_reward": 1.0, + "step": 252, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 380.6750183105469, + "epoch": 0.0048085146821248695, + "grad_norm": 2.9327343872552443, + "kl": 0.033935546875, + "learning_rate": 9.9994295029698e-07, + "loss": 0.0014, + "reward": 1.4994280338287354, + "reward_std": 0.33042779564857483, + "rewards/accuracy_reward": 0.42817798256874084, + "rewards/format_reward": 1.0, + "step": 253, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 372.8999938964844, + "epoch": 0.004827520669010738, + "grad_norm": 1.746185537369398, + "kl": 0.044189453125, + "learning_rate": 9.999424984285707e-07, + "loss": 0.0018, + "reward": 2.1137502193450928, + "reward_std": 0.13382381200790405, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 1.0, + "step": 254, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 408.7749938964844, + "epoch": 0.004846526655896607, + "grad_norm": 3.387153629636754, + "kl": 0.046630859375, + "learning_rate": 9.999420447777797e-07, + "loss": 0.0019, + "reward": 1.6637500524520874, + "reward_std": 0.2163672000169754, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 1.0, + "step": 255, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 429.20001220703125, + "epoch": 0.004865532642782476, + "grad_norm": 2.1413556603365933, + "kl": 0.04296875, + "learning_rate": 9.99941589344609e-07, + "loss": 0.0017, + "reward": 1.8574990034103394, + "reward_std": 0.15932835638523102, + "rewards/accuracy_reward": 0.7074990272521973, + "rewards/format_reward": 1.0, + "step": 256, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 406.7749938964844, + "epoch": 0.004884538629668345, + "grad_norm": 2.3319646144845403, + "kl": 0.032470703125, + "learning_rate": 9.9994113212906e-07, + "loss": 0.0013, + "reward": 1.8343689441680908, + "reward_std": 0.18212370574474335, + "rewards/accuracy_reward": 0.6893689632415771, + "rewards/format_reward": 0.9750000238418579, + "step": 257, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 389.7250061035156, + "epoch": 0.004903544616554215, + "grad_norm": 1.8436358357332203, + "kl": 0.043212890625, + "learning_rate": 9.999406731311345e-07, + "loss": 0.0017, + "reward": 2.137500047683716, + "reward_std": 0.11913169920444489, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 258, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 428.0500183105469, + "epoch": 0.004922550603440084, + "grad_norm": 1.4617709353194248, + "kl": 0.03369140625, + "learning_rate": 9.99940212350834e-07, + "loss": 0.0013, + "reward": 1.7537500858306885, + "reward_std": 0.08835282176733017, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 259, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 410.375, + "epoch": 0.004941556590325953, + "grad_norm": 2.2530722269993717, + "kl": 0.0322265625, + "learning_rate": 9.999397497881602e-07, + "loss": 0.0013, + "reward": 1.462499976158142, + "reward_std": 0.2487768679857254, + "rewards/accuracy_reward": 0.42500001192092896, + "rewards/format_reward": 1.0, + "step": 260, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 422.375, + "epoch": 0.004960562577211822, + "grad_norm": 1.5428129906627623, + "kl": 0.0308837890625, + "learning_rate": 9.999392854431147e-07, + "loss": 0.0012, + "reward": 1.8400497436523438, + "reward_std": 0.18627618253231049, + "rewards/accuracy_reward": 0.7262999415397644, + "rewards/format_reward": 0.9750000238418579, + "step": 261, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 352.7749938964844, + "epoch": 0.004979568564097691, + "grad_norm": 1.821662983092403, + "kl": 0.034423828125, + "learning_rate": 9.999388193156994e-07, + "loss": 0.0014, + "reward": 1.8456611633300781, + "reward_std": 0.1655697375535965, + "rewards/accuracy_reward": 0.6794113516807556, + "rewards/format_reward": 1.0, + "step": 262, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 383.70001220703125, + "epoch": 0.0049985745509835595, + "grad_norm": 1.6262339594971065, + "kl": 0.0341796875, + "learning_rate": 9.999383514059156e-07, + "loss": 0.0014, + "reward": 2.0712499618530273, + "reward_std": 0.20316505432128906, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 263, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 430.2749938964844, + "epoch": 0.0050175805378694285, + "grad_norm": 2.032651744539549, + "kl": 0.037109375, + "learning_rate": 9.999378817137653e-07, + "loss": 0.0015, + "reward": 1.4592880010604858, + "reward_std": 0.4818175435066223, + "rewards/accuracy_reward": 0.41178807616233826, + "rewards/format_reward": 0.9750000238418579, + "step": 264, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 451.9750061035156, + "epoch": 0.005036586524755298, + "grad_norm": 1.874956085494121, + "kl": 0.031982421875, + "learning_rate": 9.999374102392499e-07, + "loss": 0.0013, + "reward": 1.7299950122833252, + "reward_std": 0.3897639214992523, + "rewards/accuracy_reward": 0.6899950504302979, + "rewards/format_reward": 0.925000011920929, + "step": 265, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 410.5249938964844, + "epoch": 0.005055592511641167, + "grad_norm": 2.5142976106624997, + "kl": 0.0284423828125, + "learning_rate": 9.999369369823713e-07, + "loss": 0.0011, + "reward": 1.472083330154419, + "reward_std": 0.2645128667354584, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/format_reward": 1.0, + "step": 266, + "temporal_rewards": 0.5 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 364.2749938964844, + "epoch": 0.005074598498527036, + "grad_norm": 5.5873798269407855, + "kl": 0.048583984375, + "learning_rate": 9.99936461943131e-07, + "loss": 0.0019, + "reward": 2.2362499237060547, + "reward_std": 0.03061859868466854, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 267, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 392.6000061035156, + "epoch": 0.005093604485412905, + "grad_norm": 2.1393830938553564, + "kl": 0.0400390625, + "learning_rate": 9.999359851215307e-07, + "loss": 0.0016, + "reward": 1.9487498998641968, + "reward_std": 0.22993634641170502, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 268, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 362.625, + "epoch": 0.005112610472298774, + "grad_norm": 1.8288490575949787, + "kl": 0.039794921875, + "learning_rate": 9.999355065175725e-07, + "loss": 0.0016, + "reward": 1.5055413246154785, + "reward_std": 0.16168683767318726, + "rewards/accuracy_reward": 0.4117913246154785, + "rewards/format_reward": 1.0, + "step": 269, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 436.6499938964844, + "epoch": 0.005131616459184643, + "grad_norm": 1.7773413687933584, + "kl": 0.03125, + "learning_rate": 9.999350261312574e-07, + "loss": 0.0013, + "reward": 1.3650546073913574, + "reward_std": 0.46858763694763184, + "rewards/accuracy_reward": 0.44380465149879456, + "rewards/format_reward": 0.9000000357627869, + "step": 270, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 400.7749938964844, + "epoch": 0.005150622446070512, + "grad_norm": 2.1996420793665163, + "kl": 0.044189453125, + "learning_rate": 9.999345439625877e-07, + "loss": 0.0018, + "reward": 1.6639299392700195, + "reward_std": 0.4508049190044403, + "rewards/accuracy_reward": 0.5039300322532654, + "rewards/format_reward": 1.0, + "step": 271, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 450.6750183105469, + "epoch": 0.005169628432956382, + "grad_norm": 3.840479202326024, + "kl": 0.0191650390625, + "learning_rate": 9.999340600115648e-07, + "loss": 0.0008, + "reward": 1.4488624334335327, + "reward_std": 0.2074168175458908, + "rewards/accuracy_reward": 0.4713623523712158, + "rewards/format_reward": 0.9750000238418579, + "step": 272, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 421.75, + "epoch": 0.0051886344198422505, + "grad_norm": 1.7917107119442928, + "kl": 0.041748046875, + "learning_rate": 9.999335742781908e-07, + "loss": 0.0017, + "reward": 2.077500104904175, + "reward_std": 0.13986968994140625, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 273, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 385.8000183105469, + "epoch": 0.005207640406728119, + "grad_norm": 1.9068560667666392, + "kl": 0.036376953125, + "learning_rate": 9.99933086762467e-07, + "loss": 0.0015, + "reward": 1.9535537958145142, + "reward_std": 0.07865867763757706, + "rewards/accuracy_reward": 0.7960537075996399, + "rewards/format_reward": 1.0, + "step": 274, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 446.75, + "epoch": 0.005226646393613988, + "grad_norm": 1.686649728486627, + "kl": 0.04150390625, + "learning_rate": 9.999325974643953e-07, + "loss": 0.0017, + "reward": 1.90625, + "reward_std": 0.22701247036457062, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 0.9000000357627869, + "step": 275, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 386.1499938964844, + "epoch": 0.005245652380499857, + "grad_norm": 1.9749736422183155, + "kl": 0.049560546875, + "learning_rate": 9.999321063839773e-07, + "loss": 0.002, + "reward": 1.8651264905929565, + "reward_std": 0.1253584325313568, + "rewards/accuracy_reward": 0.692626416683197, + "rewards/format_reward": 1.0, + "step": 276, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 390.82501220703125, + "epoch": 0.005264658367385726, + "grad_norm": 2.0171568395282455, + "kl": 0.057861328125, + "learning_rate": 9.999316135212151e-07, + "loss": 0.0023, + "reward": 2.0, + "reward_std": 0.3786541521549225, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 277, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 429.6499938964844, + "epoch": 0.005283664354271596, + "grad_norm": 1.9974755299257028, + "kl": 0.046142578125, + "learning_rate": 9.9993111887611e-07, + "loss": 0.0018, + "reward": 1.4188908338546753, + "reward_std": 0.2939603924751282, + "rewards/accuracy_reward": 0.49389082193374634, + "rewards/format_reward": 0.875, + "step": 278, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 449.9250183105469, + "epoch": 0.005302670341157465, + "grad_norm": 2.516161294244043, + "kl": 0.033447265625, + "learning_rate": 9.999306224486645e-07, + "loss": 0.0013, + "reward": 1.4664536714553833, + "reward_std": 0.26865848898887634, + "rewards/accuracy_reward": 0.4177037179470062, + "rewards/format_reward": 1.0, + "step": 279, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 420.6000061035156, + "epoch": 0.005321676328043334, + "grad_norm": 1.8123336510918249, + "kl": 0.0361328125, + "learning_rate": 9.999301242388796e-07, + "loss": 0.0014, + "reward": 1.6839393377304077, + "reward_std": 0.21438370645046234, + "rewards/accuracy_reward": 0.625189483165741, + "rewards/format_reward": 0.9750000238418579, + "step": 280, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 405.82501220703125, + "epoch": 0.005340682314929203, + "grad_norm": 2.202812449318228, + "kl": 0.033935546875, + "learning_rate": 9.999296242467575e-07, + "loss": 0.0014, + "reward": 1.7349998950958252, + "reward_std": 0.3030186593532562, + "rewards/accuracy_reward": 0.6649999618530273, + "rewards/format_reward": 1.0, + "step": 281, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 427.5500183105469, + "epoch": 0.005359688301815072, + "grad_norm": 2.5372341454729423, + "kl": 0.0272216796875, + "learning_rate": 9.999291224722996e-07, + "loss": 0.0011, + "reward": 1.4512499570846558, + "reward_std": 0.37611016631126404, + "rewards/accuracy_reward": 0.42500001192092896, + "rewards/format_reward": 0.9750000238418579, + "step": 282, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 404.82501220703125, + "epoch": 0.005378694288700941, + "grad_norm": 1.8983014932514142, + "kl": 0.0294189453125, + "learning_rate": 9.999286189155084e-07, + "loss": 0.0012, + "reward": 1.4568980932235718, + "reward_std": 0.2850346863269806, + "rewards/accuracy_reward": 0.4418979585170746, + "rewards/format_reward": 1.0, + "step": 283, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 418.25, + "epoch": 0.0053977002755868095, + "grad_norm": 2.1893349041774774, + "kl": 0.031494140625, + "learning_rate": 9.99928113576385e-07, + "loss": 0.0013, + "reward": 1.5455259084701538, + "reward_std": 0.3938310146331787, + "rewards/accuracy_reward": 0.5380258560180664, + "rewards/format_reward": 0.949999988079071, + "step": 284, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 445.07501220703125, + "epoch": 0.005416706262472679, + "grad_norm": 1.3788351111737587, + "kl": 0.0308837890625, + "learning_rate": 9.999276064549312e-07, + "loss": 0.0012, + "reward": 1.3413587808609009, + "reward_std": 0.025265518575906754, + "rewards/accuracy_reward": 0.5576087236404419, + "rewards/format_reward": 0.800000011920929, + "step": 285, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 433.3000183105469, + "epoch": 0.005435712249358548, + "grad_norm": 1.7270606628152778, + "kl": 0.05029296875, + "learning_rate": 9.999270975511492e-07, + "loss": 0.002, + "reward": 1.8880033493041992, + "reward_std": 0.12568174302577972, + "rewards/accuracy_reward": 0.7255033850669861, + "rewards/format_reward": 1.0, + "step": 286, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 432.9750061035156, + "epoch": 0.005454718236244417, + "grad_norm": 1.7570532941872483, + "kl": 0.03564453125, + "learning_rate": 9.999265868650407e-07, + "loss": 0.0014, + "reward": 1.8522926568984985, + "reward_std": 0.23886366188526154, + "rewards/accuracy_reward": 0.6947928071022034, + "rewards/format_reward": 0.9750000238418579, + "step": 287, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 398.25, + "epoch": 0.005473724223130286, + "grad_norm": 1.2257385015768534, + "kl": 0.0286865234375, + "learning_rate": 9.999260743966076e-07, + "loss": 0.0011, + "reward": 1.6737499237060547, + "reward_std": 0.033717554062604904, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 288, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 340.32501220703125, + "epoch": 0.005492730210016155, + "grad_norm": 1.9855940068002134, + "kl": 0.03662109375, + "learning_rate": 9.999255601458514e-07, + "loss": 0.0015, + "reward": 1.905234694480896, + "reward_std": 0.2063770741224289, + "rewards/accuracy_reward": 0.7927348017692566, + "rewards/format_reward": 1.0, + "step": 289, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 396.95001220703125, + "epoch": 0.005511736196902024, + "grad_norm": 1.9685183838556808, + "kl": 0.055419921875, + "learning_rate": 9.999250441127741e-07, + "loss": 0.0022, + "reward": 1.527500033378601, + "reward_std": 0.35372811555862427, + "rewards/accuracy_reward": 0.45000001788139343, + "rewards/format_reward": 1.0, + "step": 290, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 372.375, + "epoch": 0.005530742183787893, + "grad_norm": 1.9848425205936178, + "kl": 0.038818359375, + "learning_rate": 9.999245262973778e-07, + "loss": 0.0016, + "reward": 1.6812708377838135, + "reward_std": 0.09697379171848297, + "rewards/accuracy_reward": 0.5825207829475403, + "rewards/format_reward": 1.0, + "step": 291, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 407.625, + "epoch": 0.005549748170673763, + "grad_norm": 1.9446097606410466, + "kl": 0.03662109375, + "learning_rate": 9.999240066996642e-07, + "loss": 0.0015, + "reward": 1.5091886520385742, + "reward_std": 0.41227608919143677, + "rewards/accuracy_reward": 0.47543859481811523, + "rewards/format_reward": 1.0, + "step": 292, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 452.125, + "epoch": 0.0055687541575596316, + "grad_norm": 1.5105927127067236, + "kl": 0.0269775390625, + "learning_rate": 9.99923485319635e-07, + "loss": 0.0011, + "reward": 1.42807137966156, + "reward_std": 0.49917203187942505, + "rewards/accuracy_reward": 0.4680713713169098, + "rewards/format_reward": 0.949999988079071, + "step": 293, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 385.0249938964844, + "epoch": 0.0055877601444455005, + "grad_norm": 2.7735985161630983, + "kl": 0.03662109375, + "learning_rate": 9.99922962157292e-07, + "loss": 0.0015, + "reward": 1.9042307138442993, + "reward_std": 0.1640632450580597, + "rewards/accuracy_reward": 0.8192307353019714, + "rewards/format_reward": 0.949999988079071, + "step": 294, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 396.5, + "epoch": 0.005606766131331369, + "grad_norm": 1.6919466482631782, + "kl": 0.0361328125, + "learning_rate": 9.999224372126374e-07, + "loss": 0.0014, + "reward": 1.9099998474121094, + "reward_std": 0.23988580703735352, + "rewards/accuracy_reward": 0.7550000548362732, + "rewards/format_reward": 1.0, + "step": 295, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 459.0500183105469, + "epoch": 0.005625772118217238, + "grad_norm": 1.5434745574990358, + "kl": 0.0201416015625, + "learning_rate": 9.999219104856726e-07, + "loss": 0.0008, + "reward": 1.4433367252349854, + "reward_std": 0.4637759327888489, + "rewards/accuracy_reward": 0.5270866751670837, + "rewards/format_reward": 0.925000011920929, + "step": 296, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 398.70001220703125, + "epoch": 0.005644778105103107, + "grad_norm": 1.7890713273018475, + "kl": 0.046875, + "learning_rate": 9.999213819764e-07, + "loss": 0.0019, + "reward": 1.53458833694458, + "reward_std": 0.27791649103164673, + "rewards/accuracy_reward": 0.44333839416503906, + "rewards/format_reward": 1.0, + "step": 297, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 381.07501220703125, + "epoch": 0.005663784091988976, + "grad_norm": 2.094499926828871, + "kl": 0.04150390625, + "learning_rate": 9.999208516848211e-07, + "loss": 0.0017, + "reward": 1.8804165124893188, + "reward_std": 0.23431554436683655, + "rewards/accuracy_reward": 0.7166666984558105, + "rewards/format_reward": 1.0, + "step": 298, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 478.1000061035156, + "epoch": 0.005682790078874846, + "grad_norm": 1.6197449172994878, + "kl": 0.03076171875, + "learning_rate": 9.99920319610938e-07, + "loss": 0.0012, + "reward": 1.382501482963562, + "reward_std": 0.11060315370559692, + "rewards/accuracy_reward": 0.4325014054775238, + "rewards/format_reward": 0.800000011920929, + "step": 299, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 418.0, + "epoch": 0.005701796065760715, + "grad_norm": 1.5217130478601588, + "kl": 0.03955078125, + "learning_rate": 9.999197857547526e-07, + "loss": 0.0016, + "reward": 1.8587499856948853, + "reward_std": 0.22571049630641937, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 300, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 422.0500183105469, + "epoch": 0.005720802052646584, + "grad_norm": 2.332531459916934, + "kl": 0.04736328125, + "learning_rate": 9.999192501162666e-07, + "loss": 0.0019, + "reward": 1.9512500762939453, + "reward_std": 0.2596333920955658, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 301, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.6, + "completion_length": 444.9750061035156, + "epoch": 0.005739808039532453, + "grad_norm": 1.6562509499100286, + "kl": 0.03759765625, + "learning_rate": 9.999187126954823e-07, + "loss": 0.0015, + "reward": 1.3737499713897705, + "reward_std": 0.17272552847862244, + "rewards/accuracy_reward": 0.32500001788139343, + "rewards/format_reward": 1.0, + "step": 302, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 400.5500183105469, + "epoch": 0.005758814026418322, + "grad_norm": 2.407287220935292, + "kl": 0.043212890625, + "learning_rate": 9.999181734924011e-07, + "loss": 0.0017, + "reward": 1.7440416812896729, + "reward_std": 0.2270553857088089, + "rewards/accuracy_reward": 0.6052916646003723, + "rewards/format_reward": 1.0, + "step": 303, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 395.20001220703125, + "epoch": 0.0057778200133041906, + "grad_norm": 1.5804410452974618, + "kl": 0.04052734375, + "learning_rate": 9.999176325070252e-07, + "loss": 0.0016, + "reward": 1.1881250143051147, + "reward_std": 0.31124377250671387, + "rewards/accuracy_reward": 0.17812500894069672, + "rewards/format_reward": 1.0, + "step": 304, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 384.4750061035156, + "epoch": 0.0057968260001900595, + "grad_norm": 1.7580727061972712, + "kl": 0.043212890625, + "learning_rate": 9.999170897393564e-07, + "loss": 0.0017, + "reward": 1.7383193969726562, + "reward_std": 0.28262844681739807, + "rewards/accuracy_reward": 0.6033194661140442, + "rewards/format_reward": 1.0, + "step": 305, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 456.32501220703125, + "epoch": 0.005815831987075929, + "grad_norm": 1.7699612076645639, + "kl": 0.037841796875, + "learning_rate": 9.99916545189397e-07, + "loss": 0.0015, + "reward": 1.5950278043746948, + "reward_std": 0.3412419855594635, + "rewards/accuracy_reward": 0.5387776494026184, + "rewards/format_reward": 0.9750000238418579, + "step": 306, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 388.1499938964844, + "epoch": 0.005834837973961798, + "grad_norm": 2.9426697811317077, + "kl": 0.0439453125, + "learning_rate": 9.999159988571486e-07, + "loss": 0.0018, + "reward": 1.4479975700378418, + "reward_std": 0.3414693772792816, + "rewards/accuracy_reward": 0.4167475700378418, + "rewards/format_reward": 1.0, + "step": 307, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 405.45001220703125, + "epoch": 0.005853843960847667, + "grad_norm": 3.8830806162209903, + "kl": 0.04296875, + "learning_rate": 9.999154507426131e-07, + "loss": 0.0017, + "reward": 1.7688621282577515, + "reward_std": 0.29286906123161316, + "rewards/accuracy_reward": 0.6538621783256531, + "rewards/format_reward": 0.9750000238418579, + "step": 308, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 386.3500061035156, + "epoch": 0.005872849947733536, + "grad_norm": 1.781044153110498, + "kl": 0.040771484375, + "learning_rate": 9.999149008457927e-07, + "loss": 0.0016, + "reward": 1.8679708242416382, + "reward_std": 0.13607355952262878, + "rewards/accuracy_reward": 0.8142208456993103, + "rewards/format_reward": 1.0, + "step": 309, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 363.7749938964844, + "epoch": 0.005891855934619405, + "grad_norm": 2.2107186154410865, + "kl": 0.053466796875, + "learning_rate": 9.999143491666893e-07, + "loss": 0.0021, + "reward": 1.7591158151626587, + "reward_std": 0.22363010048866272, + "rewards/accuracy_reward": 0.5991159081459045, + "rewards/format_reward": 1.0, + "step": 310, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 437.6750183105469, + "epoch": 0.005910861921505274, + "grad_norm": 1.7226129105690773, + "kl": 0.0400390625, + "learning_rate": 9.999137957053048e-07, + "loss": 0.0016, + "reward": 1.323028802871704, + "reward_std": 0.4296836853027344, + "rewards/accuracy_reward": 0.30802878737449646, + "rewards/format_reward": 0.949999988079071, + "step": 311, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 380.1499938964844, + "epoch": 0.005929867908391143, + "grad_norm": 4.031065740331065, + "kl": 0.041748046875, + "learning_rate": 9.999132404616411e-07, + "loss": 0.0017, + "reward": 1.590849757194519, + "reward_std": 0.23907624185085297, + "rewards/accuracy_reward": 0.4983498752117157, + "rewards/format_reward": 1.0, + "step": 312, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 434.1499938964844, + "epoch": 0.005948873895277013, + "grad_norm": 1.9627437006929076, + "kl": 0.04736328125, + "learning_rate": 9.999126834357003e-07, + "loss": 0.0019, + "reward": 1.938750147819519, + "reward_std": 0.1775941550731659, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 313, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 389.5500183105469, + "epoch": 0.0059678798821628815, + "grad_norm": 2.697616212323388, + "kl": 0.04443359375, + "learning_rate": 9.999121246274844e-07, + "loss": 0.0018, + "reward": 1.9043397903442383, + "reward_std": 0.24692252278327942, + "rewards/accuracy_reward": 0.7355899214744568, + "rewards/format_reward": 1.0, + "step": 314, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 418.5249938964844, + "epoch": 0.0059868858690487504, + "grad_norm": 1.9088638488853837, + "kl": 0.040283203125, + "learning_rate": 9.999115640369952e-07, + "loss": 0.0016, + "reward": 1.7094920873641968, + "reward_std": 0.06522587686777115, + "rewards/accuracy_reward": 0.5194922685623169, + "rewards/format_reward": 1.0, + "step": 315, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 390.1750183105469, + "epoch": 0.006005891855934619, + "grad_norm": 3.3724263207938017, + "kl": 0.048828125, + "learning_rate": 9.99911001664235e-07, + "loss": 0.002, + "reward": 1.6162500381469727, + "reward_std": 0.36210280656814575, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 1.0, + "step": 316, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 437.9250183105469, + "epoch": 0.006024897842820488, + "grad_norm": 2.61414351147298, + "kl": 0.03515625, + "learning_rate": 9.999104375092055e-07, + "loss": 0.0014, + "reward": 1.5493055582046509, + "reward_std": 0.32617637515068054, + "rewards/accuracy_reward": 0.605555534362793, + "rewards/format_reward": 0.9000000357627869, + "step": 317, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 405.0500183105469, + "epoch": 0.006043903829706357, + "grad_norm": 2.2876864698009736, + "kl": 0.041015625, + "learning_rate": 9.999098715719089e-07, + "loss": 0.0016, + "reward": 1.7958471775054932, + "reward_std": 0.1639290750026703, + "rewards/accuracy_reward": 0.5970970988273621, + "rewards/format_reward": 1.0, + "step": 318, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 404.4750061035156, + "epoch": 0.006062909816592226, + "grad_norm": 1.8550519555730431, + "kl": 0.0498046875, + "learning_rate": 9.999093038523474e-07, + "loss": 0.002, + "reward": 1.9999569654464722, + "reward_std": 0.10818469524383545, + "rewards/accuracy_reward": 0.8362069129943848, + "rewards/format_reward": 1.0, + "step": 319, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 402.20001220703125, + "epoch": 0.006081915803478096, + "grad_norm": 2.988112669247468, + "kl": 0.033203125, + "learning_rate": 9.999087343505226e-07, + "loss": 0.0013, + "reward": 1.8866890668869019, + "reward_std": 0.12124647945165634, + "rewards/accuracy_reward": 0.7766891121864319, + "rewards/format_reward": 1.0, + "step": 320, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 429.45001220703125, + "epoch": 0.006100921790363965, + "grad_norm": 1.6852191045663105, + "kl": 0.03662109375, + "learning_rate": 9.999081630664368e-07, + "loss": 0.0015, + "reward": 1.6287879943847656, + "reward_std": 0.30897006392478943, + "rewards/accuracy_reward": 0.527538001537323, + "rewards/format_reward": 1.0, + "step": 321, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 399.3000183105469, + "epoch": 0.006119927777249834, + "grad_norm": 2.145423119385929, + "kl": 0.037841796875, + "learning_rate": 9.999075900000919e-07, + "loss": 0.0015, + "reward": 1.7036985158920288, + "reward_std": 0.14790479838848114, + "rewards/accuracy_reward": 0.6086985468864441, + "rewards/format_reward": 1.0, + "step": 322, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 377.7250061035156, + "epoch": 0.006138933764135703, + "grad_norm": 1.6399069447510508, + "kl": 0.04052734375, + "learning_rate": 9.9990701515149e-07, + "loss": 0.0016, + "reward": 1.9162501096725464, + "reward_std": 0.28791746497154236, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 323, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 424.3500061035156, + "epoch": 0.006157939751021572, + "grad_norm": 1.9349427211289458, + "kl": 0.03515625, + "learning_rate": 9.999064385206333e-07, + "loss": 0.0014, + "reward": 1.5591973066329956, + "reward_std": 0.4124367833137512, + "rewards/accuracy_reward": 0.6341972351074219, + "rewards/format_reward": 0.925000011920929, + "step": 324, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 480.0, + "epoch": 0.0061769457379074405, + "grad_norm": 1.4796136349327658, + "kl": 0.031005859375, + "learning_rate": 9.999058601075234e-07, + "loss": 0.0012, + "reward": 1.5225000381469727, + "reward_std": 0.6301305890083313, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 0.7750000357627869, + "step": 325, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 422.625, + "epoch": 0.0061959517247933095, + "grad_norm": 1.953144297505596, + "kl": 0.041015625, + "learning_rate": 9.99905279912163e-07, + "loss": 0.0016, + "reward": 2.0349998474121094, + "reward_std": 0.14549851417541504, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 326, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 393.0500183105469, + "epoch": 0.006214957711679179, + "grad_norm": 1.8833621265410123, + "kl": 0.046875, + "learning_rate": 9.999046979345538e-07, + "loss": 0.0019, + "reward": 1.9712499380111694, + "reward_std": 0.2521544098854065, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 327, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 427.875, + "epoch": 0.006233963698565048, + "grad_norm": 1.6375846005801433, + "kl": 0.041015625, + "learning_rate": 9.999041141746978e-07, + "loss": 0.0016, + "reward": 1.520936369895935, + "reward_std": 0.4715319573879242, + "rewards/accuracy_reward": 0.49843630194664, + "rewards/format_reward": 0.949999988079071, + "step": 328, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 394.125, + "epoch": 0.006252969685450917, + "grad_norm": 1.5972663021511073, + "kl": 0.056640625, + "learning_rate": 9.999035286325973e-07, + "loss": 0.0023, + "reward": 2.1449999809265137, + "reward_std": 0.10506661981344223, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 329, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 384.9250183105469, + "epoch": 0.006271975672336786, + "grad_norm": 1.7875585389955349, + "kl": 0.036376953125, + "learning_rate": 9.999029413082543e-07, + "loss": 0.0015, + "reward": 1.7178245782852173, + "reward_std": 0.1480218768119812, + "rewards/accuracy_reward": 0.616574764251709, + "rewards/format_reward": 1.0, + "step": 330, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 445.4750061035156, + "epoch": 0.006290981659222655, + "grad_norm": 2.5762864445319504, + "kl": 0.02734375, + "learning_rate": 9.999023522016707e-07, + "loss": 0.0011, + "reward": 1.4148296117782593, + "reward_std": 0.11809279769659042, + "rewards/accuracy_reward": 0.37357956171035767, + "rewards/format_reward": 1.0, + "step": 331, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 410.5500183105469, + "epoch": 0.006309987646108524, + "grad_norm": 1.9593416173996685, + "kl": 0.051513671875, + "learning_rate": 9.999017613128492e-07, + "loss": 0.0021, + "reward": 2.0071799755096436, + "reward_std": 0.2119959443807602, + "rewards/accuracy_reward": 0.842180073261261, + "rewards/format_reward": 1.0, + "step": 332, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 422.0500183105469, + "epoch": 0.006328993632994394, + "grad_norm": 1.620746774279616, + "kl": 0.0322265625, + "learning_rate": 9.99901168641791e-07, + "loss": 0.0013, + "reward": 1.4732033014297485, + "reward_std": 0.2191545069217682, + "rewards/accuracy_reward": 0.4119533598423004, + "rewards/format_reward": 1.0, + "step": 333, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 409.3000183105469, + "epoch": 0.006347999619880263, + "grad_norm": 2.103129597124734, + "kl": 0.05224609375, + "learning_rate": 9.999005741884988e-07, + "loss": 0.0021, + "reward": 2.04705548286438, + "reward_std": 0.03932321071624756, + "rewards/accuracy_reward": 0.8808054327964783, + "rewards/format_reward": 1.0, + "step": 334, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 420.8999938964844, + "epoch": 0.0063670056067661315, + "grad_norm": 1.7259102939137225, + "kl": 0.049072265625, + "learning_rate": 9.99899977952975e-07, + "loss": 0.002, + "reward": 1.906690001487732, + "reward_std": 0.15094737708568573, + "rewards/accuracy_reward": 0.759190022945404, + "rewards/format_reward": 1.0, + "step": 335, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 468.7749938964844, + "epoch": 0.006386011593652, + "grad_norm": 1.6827581107423368, + "kl": 0.0458984375, + "learning_rate": 9.99899379935221e-07, + "loss": 0.0018, + "reward": 1.3009380102157593, + "reward_std": 0.27229711413383484, + "rewards/accuracy_reward": 0.429688036441803, + "rewards/format_reward": 0.8500000238418579, + "step": 336, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 436.3500061035156, + "epoch": 0.006405017580537869, + "grad_norm": 1.221869542475631, + "kl": 0.05859375, + "learning_rate": 9.998987801352394e-07, + "loss": 0.0023, + "reward": 1.6899999380111694, + "reward_std": 0.06650522351264954, + "rewards/accuracy_reward": 0.5512499809265137, + "rewards/format_reward": 1.0, + "step": 337, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 421.8999938964844, + "epoch": 0.006424023567423738, + "grad_norm": 2.024033674147605, + "kl": 0.05126953125, + "learning_rate": 9.998981785530324e-07, + "loss": 0.002, + "reward": 1.6939667463302612, + "reward_std": 0.10195688158273697, + "rewards/accuracy_reward": 0.6152166128158569, + "rewards/format_reward": 1.0, + "step": 338, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 416.9750061035156, + "epoch": 0.006443029554309607, + "grad_norm": 1.6706206833094994, + "kl": 0.064453125, + "learning_rate": 9.998975751886016e-07, + "loss": 0.0026, + "reward": 2.112499952316284, + "reward_std": 0.18197335302829742, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 339, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 417.4250183105469, + "epoch": 0.006462035541195477, + "grad_norm": 1.5353881044695634, + "kl": 0.0712890625, + "learning_rate": 9.998969700419497e-07, + "loss": 0.0029, + "reward": 1.8312500715255737, + "reward_std": 0.10216512531042099, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 340, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 403.4750061035156, + "epoch": 0.006481041528081346, + "grad_norm": 1.5509881900311082, + "kl": 0.039306640625, + "learning_rate": 9.998963631130785e-07, + "loss": 0.0016, + "reward": 1.501399278640747, + "reward_std": 0.34407320618629456, + "rewards/accuracy_reward": 0.5813993215560913, + "rewards/format_reward": 0.875, + "step": 341, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 428.375, + "epoch": 0.006500047514967215, + "grad_norm": 3.207707638276993, + "kl": 0.06787109375, + "learning_rate": 9.998957544019906e-07, + "loss": 0.0027, + "reward": 1.7646111249923706, + "reward_std": 0.14869549870491028, + "rewards/accuracy_reward": 0.6221112608909607, + "rewards/format_reward": 1.0, + "step": 342, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 414.6499938964844, + "epoch": 0.006519053501853084, + "grad_norm": 2.228689409588524, + "kl": 0.0400390625, + "learning_rate": 9.998951439086879e-07, + "loss": 0.0016, + "reward": 1.3966782093048096, + "reward_std": 0.38540342450141907, + "rewards/accuracy_reward": 0.3366781771183014, + "rewards/format_reward": 0.9750000238418579, + "step": 343, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 480.20001220703125, + "epoch": 0.006538059488738953, + "grad_norm": 1.5924744575492689, + "kl": 0.04296875, + "learning_rate": 9.998945316331725e-07, + "loss": 0.0017, + "reward": 1.6147444248199463, + "reward_std": 0.2346866875886917, + "rewards/accuracy_reward": 0.6559944152832031, + "rewards/format_reward": 0.8500000238418579, + "step": 344, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 391.5, + "epoch": 0.006557065475624822, + "grad_norm": 1.845479006981145, + "kl": 0.0517578125, + "learning_rate": 9.998939175754465e-07, + "loss": 0.0021, + "reward": 1.8144477605819702, + "reward_std": 0.1335175484418869, + "rewards/accuracy_reward": 0.6406978964805603, + "rewards/format_reward": 1.0, + "step": 345, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 413.0249938964844, + "epoch": 0.0065760714625106905, + "grad_norm": 2.1479510426702806, + "kl": 0.0625, + "learning_rate": 9.998933017355125e-07, + "loss": 0.0025, + "reward": 1.7408040761947632, + "reward_std": 0.27547261118888855, + "rewards/accuracy_reward": 0.5920543074607849, + "rewards/format_reward": 1.0, + "step": 346, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 423.5249938964844, + "epoch": 0.00659507744939656, + "grad_norm": 1.9013688449713897, + "kl": 0.07763671875, + "learning_rate": 9.998926841133723e-07, + "loss": 0.0031, + "reward": 2.0399999618530273, + "reward_std": 0.10641022026538849, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 347, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 436.6000061035156, + "epoch": 0.006614083436282429, + "grad_norm": 1.6456004700430817, + "kl": 0.0458984375, + "learning_rate": 9.998920647090284e-07, + "loss": 0.0018, + "reward": 2.046250104904175, + "reward_std": 0.2025504857301712, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 348, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 450.95001220703125, + "epoch": 0.006633089423168298, + "grad_norm": 1.6585390593393898, + "kl": 0.05029296875, + "learning_rate": 9.998914435224825e-07, + "loss": 0.002, + "reward": 1.7212451696395874, + "reward_std": 0.1151413694024086, + "rewards/accuracy_reward": 0.6524952054023743, + "rewards/format_reward": 1.0, + "step": 349, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 411.8000183105469, + "epoch": 0.006652095410054167, + "grad_norm": 1.7439355336313964, + "kl": 0.04248046875, + "learning_rate": 9.998908205537375e-07, + "loss": 0.0017, + "reward": 1.680641770362854, + "reward_std": 0.06813951581716537, + "rewards/accuracy_reward": 0.6068916916847229, + "rewards/format_reward": 1.0, + "step": 350, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 451.8999938964844, + "epoch": 0.006671101396940036, + "grad_norm": 1.8796301429933802, + "kl": 0.053955078125, + "learning_rate": 9.998901958027952e-07, + "loss": 0.0022, + "reward": 1.6309250593185425, + "reward_std": 0.37135452032089233, + "rewards/accuracy_reward": 0.6484249830245972, + "rewards/format_reward": 0.949999988079071, + "step": 351, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 380.625, + "epoch": 0.006690107383825905, + "grad_norm": 1.6554985002376068, + "kl": 0.05419921875, + "learning_rate": 9.998895692696581e-07, + "loss": 0.0022, + "reward": 1.5699999332427979, + "reward_std": 0.21335946023464203, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 352, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 440.7749938964844, + "epoch": 0.006709113370711774, + "grad_norm": 2.6934214757735915, + "kl": 0.06005859375, + "learning_rate": 9.998889409543282e-07, + "loss": 0.0024, + "reward": 1.96999990940094, + "reward_std": 0.11868967115879059, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 353, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 450.6750183105469, + "epoch": 0.006728119357597644, + "grad_norm": 1.8864143064385501, + "kl": 0.03173828125, + "learning_rate": 9.998883108568076e-07, + "loss": 0.0013, + "reward": 1.2243750095367432, + "reward_std": 0.28353166580200195, + "rewards/accuracy_reward": 0.2593750059604645, + "rewards/format_reward": 0.925000011920929, + "step": 354, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 472.8500061035156, + "epoch": 0.0067471253444835125, + "grad_norm": 1.608030280986472, + "kl": 0.041015625, + "learning_rate": 9.998876789770988e-07, + "loss": 0.0016, + "reward": 1.8496843576431274, + "reward_std": 0.06093672662973404, + "rewards/accuracy_reward": 0.7346843481063843, + "rewards/format_reward": 1.0, + "step": 355, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 466.57501220703125, + "epoch": 0.0067661313313693815, + "grad_norm": 1.2996917671241226, + "kl": 0.043701171875, + "learning_rate": 9.998870453152041e-07, + "loss": 0.0018, + "reward": 1.4723917245864868, + "reward_std": 0.31451860070228577, + "rewards/accuracy_reward": 0.45239168405532837, + "rewards/format_reward": 0.9750000238418579, + "step": 356, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 405.6000061035156, + "epoch": 0.00678513731825525, + "grad_norm": 1.6829502773588834, + "kl": 0.043701171875, + "learning_rate": 9.998864098711256e-07, + "loss": 0.0018, + "reward": 1.8915386199951172, + "reward_std": 0.031106257811188698, + "rewards/accuracy_reward": 0.7865384817123413, + "rewards/format_reward": 1.0, + "step": 357, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 473.45001220703125, + "epoch": 0.006804143305141119, + "grad_norm": 1.8442333951772039, + "kl": 0.0361328125, + "learning_rate": 9.998857726448657e-07, + "loss": 0.0014, + "reward": 1.6287158727645874, + "reward_std": 0.24379150569438934, + "rewards/accuracy_reward": 0.6162160038948059, + "rewards/format_reward": 0.949999988079071, + "step": 358, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 400.625, + "epoch": 0.006823149292026988, + "grad_norm": 2.089704128109271, + "kl": 0.06494140625, + "learning_rate": 9.998851336364266e-07, + "loss": 0.0026, + "reward": 1.9037498235702515, + "reward_std": 0.2772168815135956, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 359, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 466.4750061035156, + "epoch": 0.006842155278912857, + "grad_norm": 1.5966099844671438, + "kl": 0.03955078125, + "learning_rate": 9.998844928458105e-07, + "loss": 0.0016, + "reward": 1.6224985122680664, + "reward_std": 0.19626890122890472, + "rewards/accuracy_reward": 0.6537485122680664, + "rewards/format_reward": 0.8500000238418579, + "step": 360, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 467.2749938964844, + "epoch": 0.006861161265798727, + "grad_norm": 1.2285476563398856, + "kl": 0.0654296875, + "learning_rate": 9.998838502730197e-07, + "loss": 0.0026, + "reward": 1.738376498222351, + "reward_std": 0.18706470727920532, + "rewards/accuracy_reward": 0.6521264314651489, + "rewards/format_reward": 0.925000011920929, + "step": 361, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 452.4750061035156, + "epoch": 0.006880167252684596, + "grad_norm": 2.8371789308331703, + "kl": 0.052978515625, + "learning_rate": 9.998832059180566e-07, + "loss": 0.0021, + "reward": 1.8680633306503296, + "reward_std": 0.15290091931819916, + "rewards/accuracy_reward": 0.7280632853507996, + "rewards/format_reward": 1.0, + "step": 362, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 426.5500183105469, + "epoch": 0.006899173239570465, + "grad_norm": 1.8890417363934238, + "kl": 0.068359375, + "learning_rate": 9.998825597809236e-07, + "loss": 0.0027, + "reward": 2.134999990463257, + "reward_std": 0.10920830816030502, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 363, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 426.70001220703125, + "epoch": 0.006918179226456334, + "grad_norm": 2.087265422991059, + "kl": 0.0517578125, + "learning_rate": 9.998819118616225e-07, + "loss": 0.0021, + "reward": 1.8948386907577515, + "reward_std": 0.1319272369146347, + "rewards/accuracy_reward": 0.7548387050628662, + "rewards/format_reward": 1.0, + "step": 364, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 436.25, + "epoch": 0.006937185213342203, + "grad_norm": 2.655390747868118, + "kl": 0.04150390625, + "learning_rate": 9.998812621601563e-07, + "loss": 0.0017, + "reward": 1.2124675512313843, + "reward_std": 0.365411639213562, + "rewards/accuracy_reward": 0.2399676889181137, + "rewards/format_reward": 0.949999988079071, + "step": 365, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 462.8500061035156, + "epoch": 0.0069561912002280716, + "grad_norm": 2.1552617311384537, + "kl": 0.058837890625, + "learning_rate": 9.998806106765268e-07, + "loss": 0.0023, + "reward": 1.9191792011260986, + "reward_std": 0.2516794204711914, + "rewards/accuracy_reward": 0.7091791033744812, + "rewards/format_reward": 0.9750000238418579, + "step": 366, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 427.2250061035156, + "epoch": 0.0069751971871139405, + "grad_norm": 1.680821189014191, + "kl": 0.046875, + "learning_rate": 9.998799574107366e-07, + "loss": 0.0019, + "reward": 1.8996073007583618, + "reward_std": 0.15216723084449768, + "rewards/accuracy_reward": 0.749607264995575, + "rewards/format_reward": 1.0, + "step": 367, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 359.3999938964844, + "epoch": 0.00699420317399981, + "grad_norm": 1.7035912917544618, + "kl": 0.046630859375, + "learning_rate": 9.998793023627879e-07, + "loss": 0.0019, + "reward": 1.6501998901367188, + "reward_std": 0.1932554692029953, + "rewards/accuracy_reward": 0.6126998066902161, + "rewards/format_reward": 1.0, + "step": 368, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 457.57501220703125, + "epoch": 0.007013209160885679, + "grad_norm": 1.8126025432320474, + "kl": 0.043212890625, + "learning_rate": 9.998786455326828e-07, + "loss": 0.0017, + "reward": 1.6159191131591797, + "reward_std": 0.4298867881298065, + "rewards/accuracy_reward": 0.5759193301200867, + "rewards/format_reward": 0.949999988079071, + "step": 369, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 430.4750061035156, + "epoch": 0.007032215147771548, + "grad_norm": 1.8250904494274842, + "kl": 0.068359375, + "learning_rate": 9.998779869204243e-07, + "loss": 0.0027, + "reward": 2.0712497234344482, + "reward_std": 0.18371863663196564, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 370, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 442.07501220703125, + "epoch": 0.007051221134657417, + "grad_norm": 3.1025661461638943, + "kl": 0.051025390625, + "learning_rate": 9.99877326526014e-07, + "loss": 0.002, + "reward": 1.9512499570846558, + "reward_std": 0.16346576809883118, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 371, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 386.20001220703125, + "epoch": 0.007070227121543286, + "grad_norm": 2.1965569940976493, + "kl": 0.05419921875, + "learning_rate": 9.998766643494549e-07, + "loss": 0.0022, + "reward": 1.8899999856948853, + "reward_std": 0.16408979892730713, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 372, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 424.125, + "epoch": 0.007089233108429155, + "grad_norm": 3.503327292169236, + "kl": 0.0439453125, + "learning_rate": 9.99876000390749e-07, + "loss": 0.0018, + "reward": 1.4891948699951172, + "reward_std": 0.17772071063518524, + "rewards/accuracy_reward": 0.5254448652267456, + "rewards/format_reward": 0.9000000357627869, + "step": 373, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 441.7749938964844, + "epoch": 0.007108239095315024, + "grad_norm": 1.8971048143789693, + "kl": 0.0277099609375, + "learning_rate": 9.998753346498988e-07, + "loss": 0.0011, + "reward": 1.3545420169830322, + "reward_std": 0.2112666368484497, + "rewards/accuracy_reward": 0.36204198002815247, + "rewards/format_reward": 1.0, + "step": 374, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 398.3500061035156, + "epoch": 0.007127245082200894, + "grad_norm": 1.6972808617420034, + "kl": 0.0439453125, + "learning_rate": 9.998746671269063e-07, + "loss": 0.0018, + "reward": 1.582077980041504, + "reward_std": 0.0559798963367939, + "rewards/accuracy_reward": 0.43707799911499023, + "rewards/format_reward": 1.0, + "step": 375, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 425.32501220703125, + "epoch": 0.0071462510690867625, + "grad_norm": 1.6154083998539943, + "kl": 0.046142578125, + "learning_rate": 9.998739978217744e-07, + "loss": 0.0018, + "reward": 1.5512499809265137, + "reward_std": 0.19012121856212616, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 1.0, + "step": 376, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 444.70001220703125, + "epoch": 0.0071652570559726314, + "grad_norm": 1.790521644083571, + "kl": 0.05322265625, + "learning_rate": 9.998733267345052e-07, + "loss": 0.0021, + "reward": 1.4208310842514038, + "reward_std": 0.3444187343120575, + "rewards/accuracy_reward": 0.45208102464675903, + "rewards/format_reward": 0.9750000238418579, + "step": 377, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 387.3999938964844, + "epoch": 0.0071842630428585, + "grad_norm": 2.0289734289878614, + "kl": 0.0556640625, + "learning_rate": 9.998726538651012e-07, + "loss": 0.0022, + "reward": 1.8264998197555542, + "reward_std": 0.22795183956623077, + "rewards/accuracy_reward": 0.727749764919281, + "rewards/format_reward": 1.0, + "step": 378, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 417.82501220703125, + "epoch": 0.007203269029744369, + "grad_norm": 2.437693174872833, + "kl": 0.042236328125, + "learning_rate": 9.998719792135648e-07, + "loss": 0.0017, + "reward": 1.6114012002944946, + "reward_std": 0.17477957904338837, + "rewards/accuracy_reward": 0.5289012789726257, + "rewards/format_reward": 1.0, + "step": 379, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 428.7250061035156, + "epoch": 0.007222275016630238, + "grad_norm": 1.8435416122800077, + "kl": 0.05615234375, + "learning_rate": 9.998713027798984e-07, + "loss": 0.0022, + "reward": 1.7316067218780518, + "reward_std": 0.032526444643735886, + "rewards/accuracy_reward": 0.5466069579124451, + "rewards/format_reward": 1.0, + "step": 380, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 430.375, + "epoch": 0.007241281003516108, + "grad_norm": 1.9998852169321388, + "kl": 0.047607421875, + "learning_rate": 9.998706245641044e-07, + "loss": 0.0019, + "reward": 1.8103069067001343, + "reward_std": 0.16898544132709503, + "rewards/accuracy_reward": 0.6915570497512817, + "rewards/format_reward": 0.9750000238418579, + "step": 381, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 398.2250061035156, + "epoch": 0.007260286990401977, + "grad_norm": 7.520919082948273, + "kl": 0.05322265625, + "learning_rate": 9.998699445661852e-07, + "loss": 0.0021, + "reward": 1.8115625381469727, + "reward_std": 0.30977484583854675, + "rewards/accuracy_reward": 0.714062511920929, + "rewards/format_reward": 1.0, + "step": 382, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 414.7250061035156, + "epoch": 0.007279292977287846, + "grad_norm": 1.562333371677345, + "kl": 0.05908203125, + "learning_rate": 9.99869262786143e-07, + "loss": 0.0024, + "reward": 1.7900002002716064, + "reward_std": 0.14036639034748077, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 383, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 433.2250061035156, + "epoch": 0.007298298964173715, + "grad_norm": 1.787448528538503, + "kl": 0.040771484375, + "learning_rate": 9.998685792239805e-07, + "loss": 0.0016, + "reward": 1.6576591730117798, + "reward_std": 0.266426146030426, + "rewards/accuracy_reward": 0.5114091038703918, + "rewards/format_reward": 1.0, + "step": 384, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 456.32501220703125, + "epoch": 0.007317304951059584, + "grad_norm": 1.893005359468317, + "kl": 0.0303955078125, + "learning_rate": 9.998678938797003e-07, + "loss": 0.0012, + "reward": 1.054965615272522, + "reward_std": 0.42117562890052795, + "rewards/accuracy_reward": 0.28371554613113403, + "rewards/format_reward": 0.800000011920929, + "step": 385, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 412.20001220703125, + "epoch": 0.007336310937945453, + "grad_norm": 3.547055383632895, + "kl": 0.043212890625, + "learning_rate": 9.998672067533045e-07, + "loss": 0.0017, + "reward": 1.8491076231002808, + "reward_std": 0.35944467782974243, + "rewards/accuracy_reward": 0.736607551574707, + "rewards/format_reward": 1.0, + "step": 386, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 427.9750061035156, + "epoch": 0.0073553169248313215, + "grad_norm": 1.8100462172422234, + "kl": 0.054931640625, + "learning_rate": 9.998665178447959e-07, + "loss": 0.0022, + "reward": 1.9475001096725464, + "reward_std": 0.21716085076332092, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 387, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 411.20001220703125, + "epoch": 0.007374322911717191, + "grad_norm": 1.9122638378072812, + "kl": 0.05859375, + "learning_rate": 9.998658271541765e-07, + "loss": 0.0023, + "reward": 2.0924999713897705, + "reward_std": 0.13992176949977875, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 388, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 442.3999938964844, + "epoch": 0.00739332889860306, + "grad_norm": 1.648036839441318, + "kl": 0.0390625, + "learning_rate": 9.998651346814491e-07, + "loss": 0.0016, + "reward": 1.4970048666000366, + "reward_std": 0.155747190117836, + "rewards/accuracy_reward": 0.4220047891139984, + "rewards/format_reward": 1.0, + "step": 389, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 398.8500061035156, + "epoch": 0.007412334885488929, + "grad_norm": 1.884855440401696, + "kl": 0.03515625, + "learning_rate": 9.998644404266159e-07, + "loss": 0.0014, + "reward": 2.0762500762939453, + "reward_std": 0.04517301544547081, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 390, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 397.875, + "epoch": 0.007431340872374798, + "grad_norm": 2.012870740898151, + "kl": 0.0732421875, + "learning_rate": 9.998637443896798e-07, + "loss": 0.0029, + "reward": 2.024012327194214, + "reward_std": 0.03822634741663933, + "rewards/accuracy_reward": 0.8040122985839844, + "rewards/format_reward": 1.0, + "step": 391, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 437.4750061035156, + "epoch": 0.007450346859260667, + "grad_norm": 2.136602140886897, + "kl": 0.0517578125, + "learning_rate": 9.998630465706429e-07, + "loss": 0.0021, + "reward": 1.887841820716858, + "reward_std": 0.29177364706993103, + "rewards/accuracy_reward": 0.7940918803215027, + "rewards/format_reward": 1.0, + "step": 392, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 439.45001220703125, + "epoch": 0.007469352846146536, + "grad_norm": 1.6753465748215923, + "kl": 0.047119140625, + "learning_rate": 9.998623469695077e-07, + "loss": 0.0019, + "reward": 1.6429789066314697, + "reward_std": 0.2791181206703186, + "rewards/accuracy_reward": 0.5617288947105408, + "rewards/format_reward": 0.949999988079071, + "step": 393, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 457.0, + "epoch": 0.007488358833032405, + "grad_norm": 3.4292571857131438, + "kl": 0.052978515625, + "learning_rate": 9.998616455862769e-07, + "loss": 0.0021, + "reward": 1.5460175275802612, + "reward_std": 0.19698475301265717, + "rewards/accuracy_reward": 0.5110175013542175, + "rewards/format_reward": 0.9000000357627869, + "step": 394, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 430.0500183105469, + "epoch": 0.007507364819918275, + "grad_norm": 1.8761488822131187, + "kl": 0.05859375, + "learning_rate": 9.99860942420953e-07, + "loss": 0.0023, + "reward": 1.9135173559188843, + "reward_std": 0.1456160545349121, + "rewards/accuracy_reward": 0.7097675204277039, + "rewards/format_reward": 1.0, + "step": 395, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 371.375, + "epoch": 0.007526370806804144, + "grad_norm": 2.4885891300655985, + "kl": 0.06689453125, + "learning_rate": 9.998602374735382e-07, + "loss": 0.0027, + "reward": 2.1424999237060547, + "reward_std": 0.12062933295965195, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 396, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 423.32501220703125, + "epoch": 0.0075453767936900125, + "grad_norm": 1.9929969264561822, + "kl": 0.0517578125, + "learning_rate": 9.998595307440355e-07, + "loss": 0.0021, + "reward": 1.6995066404342651, + "reward_std": 0.31410613656044006, + "rewards/accuracy_reward": 0.6020066142082214, + "rewards/format_reward": 1.0, + "step": 397, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 448.9250183105469, + "epoch": 0.007564382780575881, + "grad_norm": 1.6375051527548983, + "kl": 0.0478515625, + "learning_rate": 9.99858822232447e-07, + "loss": 0.0019, + "reward": 1.507171630859375, + "reward_std": 0.1280670464038849, + "rewards/accuracy_reward": 0.5134217143058777, + "rewards/format_reward": 1.0, + "step": 398, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 411.45001220703125, + "epoch": 0.00758338876746175, + "grad_norm": 2.1896361836524307, + "kl": 0.06396484375, + "learning_rate": 9.998581119387757e-07, + "loss": 0.0026, + "reward": 2.0359530448913574, + "reward_std": 0.21659286320209503, + "rewards/accuracy_reward": 0.8134528994560242, + "rewards/format_reward": 1.0, + "step": 399, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 367.7250061035156, + "epoch": 0.007602394754347619, + "grad_norm": 1.84768057076166, + "kl": 0.0458984375, + "learning_rate": 9.998573998630233e-07, + "loss": 0.0018, + "reward": 1.475223422050476, + "reward_std": 0.44805580377578735, + "rewards/accuracy_reward": 0.45022326707839966, + "rewards/format_reward": 1.0, + "step": 400, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 406.0249938964844, + "epoch": 0.007621400741233488, + "grad_norm": 1.7659669090674897, + "kl": 0.041259765625, + "learning_rate": 9.998566860051931e-07, + "loss": 0.0016, + "reward": 1.8125, + "reward_std": 0.34115922451019287, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 401, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 435.7749938964844, + "epoch": 0.007640406728119358, + "grad_norm": 2.811419109647443, + "kl": 0.04443359375, + "learning_rate": 9.998559703652875e-07, + "loss": 0.0018, + "reward": 1.9911285638809204, + "reward_std": 0.11318478733301163, + "rewards/accuracy_reward": 0.8373786211013794, + "rewards/format_reward": 1.0, + "step": 402, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 449.6499938964844, + "epoch": 0.007659412715005227, + "grad_norm": 1.783109772005397, + "kl": 0.051513671875, + "learning_rate": 9.998552529433087e-07, + "loss": 0.0021, + "reward": 1.385178565979004, + "reward_std": 0.351605087518692, + "rewards/accuracy_reward": 0.3514285683631897, + "rewards/format_reward": 0.9750000238418579, + "step": 403, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 436.0, + "epoch": 0.007678418701891096, + "grad_norm": 1.7785382428420244, + "kl": 0.04931640625, + "learning_rate": 9.998545337392597e-07, + "loss": 0.002, + "reward": 1.9119949340820312, + "reward_std": 0.15657897293567657, + "rewards/accuracy_reward": 0.7657451033592224, + "rewards/format_reward": 1.0, + "step": 404, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 451.0, + "epoch": 0.007697424688776965, + "grad_norm": 3.121557264536802, + "kl": 0.042724609375, + "learning_rate": 9.998538127531426e-07, + "loss": 0.0017, + "reward": 1.6889053583145142, + "reward_std": 0.2106253206729889, + "rewards/accuracy_reward": 0.5889055132865906, + "rewards/format_reward": 1.0, + "step": 405, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 446.2749938964844, + "epoch": 0.007716430675662834, + "grad_norm": 2.7189021736733188, + "kl": 0.04833984375, + "learning_rate": 9.998530899849608e-07, + "loss": 0.0019, + "reward": 1.7862499952316284, + "reward_std": 0.4135487675666809, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 0.9750000238418579, + "step": 406, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 433.1000061035156, + "epoch": 0.007735436662548703, + "grad_norm": 1.5164413881509486, + "kl": 0.060546875, + "learning_rate": 9.998523654347158e-07, + "loss": 0.0024, + "reward": 1.7937500476837158, + "reward_std": 0.04809223487973213, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 407, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 421.0249938964844, + "epoch": 0.0077544426494345715, + "grad_norm": 1.7936475114070736, + "kl": 0.06396484375, + "learning_rate": 9.998516391024107e-07, + "loss": 0.0026, + "reward": 2.2200000286102295, + "reward_std": 0.19642554223537445, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 408, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 436.5249938964844, + "epoch": 0.007773448636320441, + "grad_norm": 2.469095547587512, + "kl": 0.056396484375, + "learning_rate": 9.998509109880482e-07, + "loss": 0.0022, + "reward": 1.8845361471176147, + "reward_std": 0.3339006304740906, + "rewards/accuracy_reward": 0.6895362138748169, + "rewards/format_reward": 1.0, + "step": 409, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 396.1499938964844, + "epoch": 0.00779245462320631, + "grad_norm": 1.6910365414575763, + "kl": 0.038818359375, + "learning_rate": 9.99850181091631e-07, + "loss": 0.0016, + "reward": 1.8951218128204346, + "reward_std": 0.03609613701701164, + "rewards/accuracy_reward": 0.795121967792511, + "rewards/format_reward": 1.0, + "step": 410, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 385.6499938964844, + "epoch": 0.007811460610092179, + "grad_norm": 2.4764594373217266, + "kl": 0.050048828125, + "learning_rate": 9.998494494131611e-07, + "loss": 0.002, + "reward": 1.8788856267929077, + "reward_std": 0.07852822542190552, + "rewards/accuracy_reward": 0.7288856506347656, + "rewards/format_reward": 1.0, + "step": 411, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 377.875, + "epoch": 0.007830466596978048, + "grad_norm": 1.6739286220487823, + "kl": 0.047119140625, + "learning_rate": 9.998487159526416e-07, + "loss": 0.0019, + "reward": 1.8614593744277954, + "reward_std": 0.11115310341119766, + "rewards/accuracy_reward": 0.7039594650268555, + "rewards/format_reward": 1.0, + "step": 412, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 427.8999938964844, + "epoch": 0.007849472583863917, + "grad_norm": 1.8420906369117143, + "kl": 0.036376953125, + "learning_rate": 9.998479807100753e-07, + "loss": 0.0015, + "reward": 1.5246965885162354, + "reward_std": 0.22792772948741913, + "rewards/accuracy_reward": 0.5071965456008911, + "rewards/format_reward": 1.0, + "step": 413, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 416.45001220703125, + "epoch": 0.007868478570749786, + "grad_norm": 1.9987084326326474, + "kl": 0.064453125, + "learning_rate": 9.99847243685464e-07, + "loss": 0.0026, + "reward": 2.083038806915283, + "reward_std": 0.18066976964473724, + "rewards/accuracy_reward": 0.8742886781692505, + "rewards/format_reward": 1.0, + "step": 414, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 404.2250061035156, + "epoch": 0.007887484557635655, + "grad_norm": 1.6017550448750113, + "kl": 0.05712890625, + "learning_rate": 9.998465048788114e-07, + "loss": 0.0023, + "reward": 1.652500033378601, + "reward_std": 0.25705400109291077, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 415, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 421.9250183105469, + "epoch": 0.007906490544521524, + "grad_norm": 2.317179197968382, + "kl": 0.042236328125, + "learning_rate": 9.998457642901193e-07, + "loss": 0.0017, + "reward": 1.4829562902450562, + "reward_std": 0.3408910036087036, + "rewards/accuracy_reward": 0.5017063617706299, + "rewards/format_reward": 0.925000011920929, + "step": 416, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 420.625, + "epoch": 0.007925496531407393, + "grad_norm": 1.7210194138161985, + "kl": 0.04541015625, + "learning_rate": 9.998450219193906e-07, + "loss": 0.0018, + "reward": 1.8924999237060547, + "reward_std": 0.1583809107542038, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 417, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 404.6000061035156, + "epoch": 0.007944502518293262, + "grad_norm": 2.5968579347435194, + "kl": 0.03955078125, + "learning_rate": 9.99844277766628e-07, + "loss": 0.0016, + "reward": 1.7641693353652954, + "reward_std": 0.14600078761577606, + "rewards/accuracy_reward": 0.7091692686080933, + "rewards/format_reward": 0.9750000238418579, + "step": 418, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 418.0249938964844, + "epoch": 0.007963508505179132, + "grad_norm": 2.1050169840326096, + "kl": 0.0556640625, + "learning_rate": 9.998435318318344e-07, + "loss": 0.0022, + "reward": 2.0356645584106445, + "reward_std": 0.15891394019126892, + "rewards/accuracy_reward": 0.8581647276878357, + "rewards/format_reward": 0.9750000238418579, + "step": 419, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 387.32501220703125, + "epoch": 0.007982514492065001, + "grad_norm": 1.9107633229926246, + "kl": 0.080078125, + "learning_rate": 9.99842784115012e-07, + "loss": 0.0032, + "reward": 2.1649999618530273, + "reward_std": 0.13496284186840057, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 420, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 443.20001220703125, + "epoch": 0.00800152047895087, + "grad_norm": 1.3999056191462678, + "kl": 0.033203125, + "learning_rate": 9.998420346161635e-07, + "loss": 0.0013, + "reward": 1.5151971578598022, + "reward_std": 0.3034624755382538, + "rewards/accuracy_reward": 0.5676971673965454, + "rewards/format_reward": 0.9000000357627869, + "step": 421, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 414.25, + "epoch": 0.008020526465836739, + "grad_norm": 6.772662006510986, + "kl": 0.054931640625, + "learning_rate": 9.99841283335292e-07, + "loss": 0.0022, + "reward": 1.835519790649414, + "reward_std": 0.06594506651163101, + "rewards/accuracy_reward": 0.6855198740959167, + "rewards/format_reward": 1.0, + "step": 422, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 433.95001220703125, + "epoch": 0.008039532452722608, + "grad_norm": 3.103936355535458, + "kl": 0.050048828125, + "learning_rate": 9.998405302723999e-07, + "loss": 0.002, + "reward": 1.5305408239364624, + "reward_std": 0.14203767478466034, + "rewards/accuracy_reward": 0.5067909955978394, + "rewards/format_reward": 1.0, + "step": 423, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 437.75, + "epoch": 0.008058538439608477, + "grad_norm": 1.751885930886735, + "kl": 0.045654296875, + "learning_rate": 9.998397754274896e-07, + "loss": 0.0018, + "reward": 1.7733423709869385, + "reward_std": 0.25026729702949524, + "rewards/accuracy_reward": 0.6433423161506653, + "rewards/format_reward": 0.9750000238418579, + "step": 424, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 455.2749938964844, + "epoch": 0.008077544426494346, + "grad_norm": 1.594292295318808, + "kl": 0.040283203125, + "learning_rate": 9.998390188005643e-07, + "loss": 0.0016, + "reward": 1.3378807306289673, + "reward_std": 0.2880254089832306, + "rewards/accuracy_reward": 0.36163076758384705, + "rewards/format_reward": 0.925000011920929, + "step": 425, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 456.1750183105469, + "epoch": 0.008096550413380215, + "grad_norm": 1.5825411019513485, + "kl": 0.029296875, + "learning_rate": 9.998382603916263e-07, + "loss": 0.0012, + "reward": 1.5910849571228027, + "reward_std": 0.3767656087875366, + "rewards/accuracy_reward": 0.5748350620269775, + "rewards/format_reward": 0.949999988079071, + "step": 426, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 490.4250183105469, + "epoch": 0.008115556400266084, + "grad_norm": 9.285023650541742, + "kl": 0.0546875, + "learning_rate": 9.998375002006786e-07, + "loss": 0.0022, + "reward": 1.625339150428772, + "reward_std": 0.37460535764694214, + "rewards/accuracy_reward": 0.6265891194343567, + "rewards/format_reward": 0.824999988079071, + "step": 427, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 424.8999938964844, + "epoch": 0.008134562387151953, + "grad_norm": 1.7176171170008445, + "kl": 0.05908203125, + "learning_rate": 9.998367382277238e-07, + "loss": 0.0024, + "reward": 1.8588972091674805, + "reward_std": 0.12745192646980286, + "rewards/accuracy_reward": 0.7801470756530762, + "rewards/format_reward": 1.0, + "step": 428, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 445.20001220703125, + "epoch": 0.008153568374037821, + "grad_norm": 3.7194328086291146, + "kl": 0.061279296875, + "learning_rate": 9.998359744727647e-07, + "loss": 0.0025, + "reward": 1.848841905593872, + "reward_std": 0.16556741297245026, + "rewards/accuracy_reward": 0.7400919198989868, + "rewards/format_reward": 1.0, + "step": 429, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 463.82501220703125, + "epoch": 0.00817257436092369, + "grad_norm": 1.7967336621898506, + "kl": 0.043212890625, + "learning_rate": 9.998352089358037e-07, + "loss": 0.0017, + "reward": 1.481200098991394, + "reward_std": 0.19158399105072021, + "rewards/accuracy_reward": 0.4224500358104706, + "rewards/format_reward": 1.0, + "step": 430, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 380.9750061035156, + "epoch": 0.00819158034780956, + "grad_norm": 8.46157327695237, + "kl": 0.06396484375, + "learning_rate": 9.99834441616844e-07, + "loss": 0.0026, + "reward": 2.0628411769866943, + "reward_std": 0.053140003234148026, + "rewards/accuracy_reward": 0.8340908885002136, + "rewards/format_reward": 1.0, + "step": 431, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 452.7250061035156, + "epoch": 0.008210586334695428, + "grad_norm": 3.2258464214695555, + "kl": 0.0859375, + "learning_rate": 9.99833672515888e-07, + "loss": 0.0034, + "reward": 2.2587499618530273, + "reward_std": 0.16861510276794434, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 1.0, + "step": 432, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 482.5500183105469, + "epoch": 0.008229592321581299, + "grad_norm": 2.3400375203817796, + "kl": 0.04150390625, + "learning_rate": 9.998329016329386e-07, + "loss": 0.0017, + "reward": 1.6458524465560913, + "reward_std": 0.4171806275844574, + "rewards/accuracy_reward": 0.6333524584770203, + "rewards/format_reward": 0.925000011920929, + "step": 433, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 464.8999938964844, + "epoch": 0.008248598308467168, + "grad_norm": 1.996777851806519, + "kl": 0.061279296875, + "learning_rate": 9.998321289679984e-07, + "loss": 0.0025, + "reward": 1.9785350561141968, + "reward_std": 0.13937710225582123, + "rewards/accuracy_reward": 0.8335351943969727, + "rewards/format_reward": 0.9750000238418579, + "step": 434, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 482.2749938964844, + "epoch": 0.008267604295353037, + "grad_norm": 6.700843684552652, + "kl": 0.052978515625, + "learning_rate": 9.998313545210703e-07, + "loss": 0.0021, + "reward": 1.4458342790603638, + "reward_std": 0.40489354729652405, + "rewards/accuracy_reward": 0.40333423018455505, + "rewards/format_reward": 0.949999988079071, + "step": 435, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 451.1499938964844, + "epoch": 0.008286610282238906, + "grad_norm": 1.3066923042092622, + "kl": 0.0576171875, + "learning_rate": 9.99830578292157e-07, + "loss": 0.0023, + "reward": 1.6087499856948853, + "reward_std": 0.049174584448337555, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 436, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 444.70001220703125, + "epoch": 0.008305616269124775, + "grad_norm": 2.2048405462097587, + "kl": 0.04345703125, + "learning_rate": 9.998298002812611e-07, + "loss": 0.0017, + "reward": 1.5705558061599731, + "reward_std": 0.2630314528942108, + "rewards/accuracy_reward": 0.5193058848381042, + "rewards/format_reward": 0.9750000238418579, + "step": 437, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 445.8000183105469, + "epoch": 0.008324622256010644, + "grad_norm": 2.556293043683744, + "kl": 0.06494140625, + "learning_rate": 9.998290204883858e-07, + "loss": 0.0026, + "reward": 2.2065789699554443, + "reward_std": 0.09058817476034164, + "rewards/accuracy_reward": 0.9253290295600891, + "rewards/format_reward": 1.0, + "step": 438, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 451.1000061035156, + "epoch": 0.008343628242896512, + "grad_norm": 1.6765442463037867, + "kl": 0.0654296875, + "learning_rate": 9.998282389135336e-07, + "loss": 0.0026, + "reward": 2.0200002193450928, + "reward_std": 0.10454743355512619, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 439, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 450.875, + "epoch": 0.008362634229782381, + "grad_norm": 1.7624051832032692, + "kl": 0.052734375, + "learning_rate": 9.998274555567072e-07, + "loss": 0.0021, + "reward": 1.876250147819519, + "reward_std": 0.20211093127727509, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 440, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 409.375, + "epoch": 0.00838164021666825, + "grad_norm": 1.502506444784469, + "kl": 0.047607421875, + "learning_rate": 9.998266704179095e-07, + "loss": 0.0019, + "reward": 1.6598774194717407, + "reward_std": 0.20364916324615479, + "rewards/accuracy_reward": 0.564877450466156, + "rewards/format_reward": 1.0, + "step": 441, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 412.70001220703125, + "epoch": 0.00840064620355412, + "grad_norm": 1.2978180991720345, + "kl": 0.050537109375, + "learning_rate": 9.998258834971435e-07, + "loss": 0.002, + "reward": 1.5098358392715454, + "reward_std": 0.18291102349758148, + "rewards/accuracy_reward": 0.5410858392715454, + "rewards/format_reward": 0.949999988079071, + "step": 442, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 379.5, + "epoch": 0.008419652190439988, + "grad_norm": 1.598554996381953, + "kl": 0.078125, + "learning_rate": 9.998250947944114e-07, + "loss": 0.0031, + "reward": 1.5625, + "reward_std": 0.12271541357040405, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 1.0, + "step": 443, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 444.0249938964844, + "epoch": 0.008438658177325857, + "grad_norm": 1.7640028260705563, + "kl": 0.07373046875, + "learning_rate": 9.998243043097168e-07, + "loss": 0.003, + "reward": 1.9037498235702515, + "reward_std": 0.12421140819787979, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 444, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 439.125, + "epoch": 0.008457664164211726, + "grad_norm": 2.1733653590761377, + "kl": 0.06787109375, + "learning_rate": 9.99823512043062e-07, + "loss": 0.0027, + "reward": 2.233750104904175, + "reward_std": 0.13382382690906525, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 1.0, + "step": 445, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 418.82501220703125, + "epoch": 0.008476670151097595, + "grad_norm": 1.769665720019225, + "kl": 0.050537109375, + "learning_rate": 9.9982271799445e-07, + "loss": 0.002, + "reward": 1.7072620391845703, + "reward_std": 0.2012491673231125, + "rewards/accuracy_reward": 0.6085121035575867, + "rewards/format_reward": 1.0, + "step": 446, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 402.5, + "epoch": 0.008495676137983466, + "grad_norm": 2.008099728650602, + "kl": 0.06201171875, + "learning_rate": 9.998219221638836e-07, + "loss": 0.0025, + "reward": 2.0126137733459473, + "reward_std": 0.1306818574666977, + "rewards/accuracy_reward": 0.8488636016845703, + "rewards/format_reward": 1.0, + "step": 447, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 391.4750061035156, + "epoch": 0.008514682124869334, + "grad_norm": 3.8300353028265888, + "kl": 0.06689453125, + "learning_rate": 9.998211245513654e-07, + "loss": 0.0027, + "reward": 2.1112499237060547, + "reward_std": 0.2548518776893616, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 1.0, + "step": 448, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 436.70001220703125, + "epoch": 0.008533688111755203, + "grad_norm": 2.2214874490514793, + "kl": 0.06689453125, + "learning_rate": 9.998203251568987e-07, + "loss": 0.0027, + "reward": 1.5724999904632568, + "reward_std": 0.29077717661857605, + "rewards/accuracy_reward": 0.45000001788139343, + "rewards/format_reward": 1.0, + "step": 449, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 443.7749938964844, + "epoch": 0.008552694098641072, + "grad_norm": 3.765816352045816, + "kl": 0.0439453125, + "learning_rate": 9.99819523980486e-07, + "loss": 0.0018, + "reward": 1.750025749206543, + "reward_std": 0.1611539125442505, + "rewards/accuracy_reward": 0.6850257515907288, + "rewards/format_reward": 0.925000011920929, + "step": 450, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 379.3999938964844, + "epoch": 0.008571700085526941, + "grad_norm": 1.6304799044093234, + "kl": 0.07177734375, + "learning_rate": 9.998187210221304e-07, + "loss": 0.0029, + "reward": 1.6189264059066772, + "reward_std": 0.10391286760568619, + "rewards/accuracy_reward": 0.5301764607429504, + "rewards/format_reward": 1.0, + "step": 451, + "temporal_rewards": 0.5 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 431.1499938964844, + "epoch": 0.00859070607241281, + "grad_norm": 6.060644775858865, + "kl": 0.06787109375, + "learning_rate": 9.998179162818347e-07, + "loss": 0.0027, + "reward": 2.291249990463257, + "reward_std": 0.03622095659375191, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 452, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 416.25, + "epoch": 0.008609712059298679, + "grad_norm": 2.5573521251097415, + "kl": 0.046142578125, + "learning_rate": 9.998171097596018e-07, + "loss": 0.0018, + "reward": 1.8114269971847534, + "reward_std": 0.1684807389974594, + "rewards/accuracy_reward": 0.6476770639419556, + "rewards/format_reward": 1.0, + "step": 453, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 414.5500183105469, + "epoch": 0.008628718046184548, + "grad_norm": 44.9747760791854, + "kl": 0.05322265625, + "learning_rate": 9.998163014554343e-07, + "loss": 0.0021, + "reward": 1.611638069152832, + "reward_std": 0.28337562084198, + "rewards/accuracy_reward": 0.5853880047798157, + "rewards/format_reward": 0.949999988079071, + "step": 454, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 395.32501220703125, + "epoch": 0.008647724033070417, + "grad_norm": 1.4439523409893464, + "kl": 0.048095703125, + "learning_rate": 9.998154913693353e-07, + "loss": 0.0019, + "reward": 1.662500023841858, + "reward_std": 0.3478153645992279, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 455, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 394.6750183105469, + "epoch": 0.008666730019956286, + "grad_norm": 1.8460985799610978, + "kl": 0.044677734375, + "learning_rate": 9.998146795013077e-07, + "loss": 0.0018, + "reward": 1.8881248235702515, + "reward_std": 0.2242121696472168, + "rewards/accuracy_reward": 0.7906250357627869, + "rewards/format_reward": 1.0, + "step": 456, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 442.57501220703125, + "epoch": 0.008685736006842155, + "grad_norm": 1.5509793941472338, + "kl": 0.047607421875, + "learning_rate": 9.998138658513542e-07, + "loss": 0.0019, + "reward": 2.0087499618530273, + "reward_std": 0.2518826425075531, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 0.9750000238418579, + "step": 457, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 391.1750183105469, + "epoch": 0.008704741993728024, + "grad_norm": 1.8502122141626356, + "kl": 0.0546875, + "learning_rate": 9.998130504194779e-07, + "loss": 0.0022, + "reward": 1.9807662963867188, + "reward_std": 0.24359726905822754, + "rewards/accuracy_reward": 0.8107662200927734, + "rewards/format_reward": 1.0, + "step": 458, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 413.0, + "epoch": 0.008723747980613893, + "grad_norm": 1.989755215059745, + "kl": 0.051025390625, + "learning_rate": 9.998122332056818e-07, + "loss": 0.002, + "reward": 1.8674999475479126, + "reward_std": 0.11793271452188492, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 459, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 405.0249938964844, + "epoch": 0.008742753967499763, + "grad_norm": 2.0166022130076526, + "kl": 0.0703125, + "learning_rate": 9.998114142099685e-07, + "loss": 0.0028, + "reward": 2.0862159729003906, + "reward_std": 0.04510483145713806, + "rewards/accuracy_reward": 0.8499659895896912, + "rewards/format_reward": 1.0, + "step": 460, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 450.0, + "epoch": 0.008761759954385632, + "grad_norm": 2.151680256921219, + "kl": 0.0517578125, + "learning_rate": 9.998105934323412e-07, + "loss": 0.0021, + "reward": 1.7433960437774658, + "reward_std": 0.2005983144044876, + "rewards/accuracy_reward": 0.68464595079422, + "rewards/format_reward": 0.925000011920929, + "step": 461, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 385.875, + "epoch": 0.008780765941271501, + "grad_norm": 2.6575912642055526, + "kl": 0.044189453125, + "learning_rate": 9.998097708728029e-07, + "loss": 0.0018, + "reward": 1.9312794208526611, + "reward_std": 0.0516841895878315, + "rewards/accuracy_reward": 0.8750292658805847, + "rewards/format_reward": 1.0, + "step": 462, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 388.625, + "epoch": 0.00879977192815737, + "grad_norm": 1.7631730380466322, + "kl": 0.06494140625, + "learning_rate": 9.99808946531356e-07, + "loss": 0.0026, + "reward": 2.0160138607025146, + "reward_std": 0.058510977774858475, + "rewards/accuracy_reward": 0.9047636389732361, + "rewards/format_reward": 1.0, + "step": 463, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 439.20001220703125, + "epoch": 0.008818777915043239, + "grad_norm": 2.5066464555266075, + "kl": 0.054443359375, + "learning_rate": 9.99808120408004e-07, + "loss": 0.0022, + "reward": 1.8968384265899658, + "reward_std": 0.04898768290877342, + "rewards/accuracy_reward": 0.740588366985321, + "rewards/format_reward": 1.0, + "step": 464, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 406.1000061035156, + "epoch": 0.008837783901929108, + "grad_norm": 2.2478606074374317, + "kl": 0.0732421875, + "learning_rate": 9.998072925027496e-07, + "loss": 0.0029, + "reward": 1.5600000619888306, + "reward_std": 0.42751264572143555, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 1.0, + "step": 465, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 416.32501220703125, + "epoch": 0.008856789888814977, + "grad_norm": 1.859802253341565, + "kl": 0.06689453125, + "learning_rate": 9.998064628155958e-07, + "loss": 0.0027, + "reward": 1.962499976158142, + "reward_std": 0.1362399160861969, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 466, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 402.2250061035156, + "epoch": 0.008875795875700846, + "grad_norm": 1.5858260461104032, + "kl": 0.050537109375, + "learning_rate": 9.998056313465455e-07, + "loss": 0.002, + "reward": 1.6306251287460327, + "reward_std": 0.1722022444009781, + "rewards/accuracy_reward": 0.590624988079071, + "rewards/format_reward": 1.0, + "step": 467, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 450.6499938964844, + "epoch": 0.008894801862586715, + "grad_norm": 1.702762047523296, + "kl": 0.05908203125, + "learning_rate": 9.998047980956018e-07, + "loss": 0.0024, + "reward": 1.7095654010772705, + "reward_std": 0.3553208112716675, + "rewards/accuracy_reward": 0.7445651888847351, + "rewards/format_reward": 0.9000000357627869, + "step": 468, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 417.0249938964844, + "epoch": 0.008913807849472584, + "grad_norm": 1.7034703754068596, + "kl": 0.0478515625, + "learning_rate": 9.998039630627675e-07, + "loss": 0.0019, + "reward": 1.692050576210022, + "reward_std": 0.17455986142158508, + "rewards/accuracy_reward": 0.6433005332946777, + "rewards/format_reward": 1.0, + "step": 469, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 424.57501220703125, + "epoch": 0.008932813836358452, + "grad_norm": 1.657649325279252, + "kl": 0.0634765625, + "learning_rate": 9.998031262480458e-07, + "loss": 0.0025, + "reward": 1.9054111242294312, + "reward_std": 0.2784424424171448, + "rewards/accuracy_reward": 0.7979111671447754, + "rewards/format_reward": 1.0, + "step": 470, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 442.8999938964844, + "epoch": 0.008951819823244321, + "grad_norm": 1.7373903623286233, + "kl": 0.0625, + "learning_rate": 9.998022876514394e-07, + "loss": 0.0025, + "reward": 1.7674999237060547, + "reward_std": 0.30718716979026794, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 0.925000011920929, + "step": 471, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 455.625, + "epoch": 0.00897082581013019, + "grad_norm": 4.952608861638442, + "kl": 0.052001953125, + "learning_rate": 9.998014472729515e-07, + "loss": 0.0021, + "reward": 1.712471604347229, + "reward_std": 0.17913781106472015, + "rewards/accuracy_reward": 0.5599716305732727, + "rewards/format_reward": 1.0, + "step": 472, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 447.32501220703125, + "epoch": 0.00898983179701606, + "grad_norm": 4.860283678424628, + "kl": 0.059326171875, + "learning_rate": 9.99800605112585e-07, + "loss": 0.0024, + "reward": 2.067499876022339, + "reward_std": 0.15350113809108734, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 473, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 472.2749938964844, + "epoch": 0.00900883778390193, + "grad_norm": 1.6051640826240865, + "kl": 0.072265625, + "learning_rate": 9.997997611703428e-07, + "loss": 0.0029, + "reward": 2.3062500953674316, + "reward_std": 0.0686725452542305, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 474, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 434.3500061035156, + "epoch": 0.009027843770787799, + "grad_norm": 1.8164006005138298, + "kl": 0.058349609375, + "learning_rate": 9.99798915446228e-07, + "loss": 0.0023, + "reward": 1.9271538257598877, + "reward_std": 0.0653354674577713, + "rewards/accuracy_reward": 0.7109037637710571, + "rewards/format_reward": 1.0, + "step": 475, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 454.8500061035156, + "epoch": 0.009046849757673668, + "grad_norm": 1.504528415551405, + "kl": 0.05419921875, + "learning_rate": 9.99798067940244e-07, + "loss": 0.0022, + "reward": 2.09375, + "reward_std": 0.15219198167324066, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 1.0, + "step": 476, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 395.95001220703125, + "epoch": 0.009065855744559537, + "grad_norm": 2.781745702470083, + "kl": 0.0712890625, + "learning_rate": 9.997972186523933e-07, + "loss": 0.0028, + "reward": 1.662500023841858, + "reward_std": 0.4719924032688141, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 1.0, + "step": 477, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 431.82501220703125, + "epoch": 0.009084861731445406, + "grad_norm": 1.7579791943277092, + "kl": 0.0751953125, + "learning_rate": 9.99796367582679e-07, + "loss": 0.003, + "reward": 1.8686836957931519, + "reward_std": 0.03408441320061684, + "rewards/accuracy_reward": 0.6536837816238403, + "rewards/format_reward": 1.0, + "step": 478, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 427.70001220703125, + "epoch": 0.009103867718331275, + "grad_norm": 2.554115765918967, + "kl": 0.0654296875, + "learning_rate": 9.997955147311044e-07, + "loss": 0.0026, + "reward": 2.195833206176758, + "reward_std": 0.19695670902729034, + "rewards/accuracy_reward": 0.9333333373069763, + "rewards/format_reward": 1.0, + "step": 479, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 453.45001220703125, + "epoch": 0.009122873705217143, + "grad_norm": 1.5587892217213488, + "kl": 0.0615234375, + "learning_rate": 9.997946600976723e-07, + "loss": 0.0025, + "reward": 1.9755518436431885, + "reward_std": 0.15970341861248016, + "rewards/accuracy_reward": 0.7743017673492432, + "rewards/format_reward": 1.0, + "step": 480, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.6, + "completion_length": 417.5, + "epoch": 0.009141879692103012, + "grad_norm": 1.19688361762643, + "kl": 0.051513671875, + "learning_rate": 9.997938036823858e-07, + "loss": 0.0021, + "reward": 1.1349740028381348, + "reward_std": 0.11196901649236679, + "rewards/accuracy_reward": 0.08122406154870987, + "rewards/format_reward": 1.0, + "step": 481, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 429.07501220703125, + "epoch": 0.009160885678988881, + "grad_norm": 1.926439565443606, + "kl": 0.078125, + "learning_rate": 9.99792945485248e-07, + "loss": 0.0031, + "reward": 1.7087501287460327, + "reward_std": 0.09527556598186493, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 482, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 415.2749938964844, + "epoch": 0.00917989166587475, + "grad_norm": 1.9094138725860295, + "kl": 0.0732421875, + "learning_rate": 9.99792085506262e-07, + "loss": 0.0029, + "reward": 1.5049999952316284, + "reward_std": 0.4254470765590668, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 1.0, + "step": 483, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 454.5500183105469, + "epoch": 0.00919889765276062, + "grad_norm": 1.7604931063216653, + "kl": 0.03369140625, + "learning_rate": 9.997912237454309e-07, + "loss": 0.0013, + "reward": 1.5671659708023071, + "reward_std": 0.32028064131736755, + "rewards/accuracy_reward": 0.6371659636497498, + "rewards/format_reward": 0.949999988079071, + "step": 484, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 452.3000183105469, + "epoch": 0.009217903639646488, + "grad_norm": 1.714713292367152, + "kl": 0.03466796875, + "learning_rate": 9.997903602027574e-07, + "loss": 0.0014, + "reward": 1.1929138898849487, + "reward_std": 0.3162080645561218, + "rewards/accuracy_reward": 0.36041373014450073, + "rewards/format_reward": 0.8500000238418579, + "step": 485, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 410.125, + "epoch": 0.009236909626532357, + "grad_norm": 2.521064980361963, + "kl": 0.06201171875, + "learning_rate": 9.99789494878245e-07, + "loss": 0.0025, + "reward": 1.6861017942428589, + "reward_std": 0.15840964019298553, + "rewards/accuracy_reward": 0.5286018252372742, + "rewards/format_reward": 1.0, + "step": 486, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 378.95001220703125, + "epoch": 0.009255915613418226, + "grad_norm": 2.5559421663142023, + "kl": 0.04638671875, + "learning_rate": 9.997886277718965e-07, + "loss": 0.0019, + "reward": 1.625555396080017, + "reward_std": 0.3310706317424774, + "rewards/accuracy_reward": 0.5305555462837219, + "rewards/format_reward": 1.0, + "step": 487, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 387.07501220703125, + "epoch": 0.009274921600304097, + "grad_norm": 2.6883486653837565, + "kl": 0.07666015625, + "learning_rate": 9.997877588837154e-07, + "loss": 0.0031, + "reward": 1.517319917678833, + "reward_std": 0.18872372806072235, + "rewards/accuracy_reward": 0.4960698187351227, + "rewards/format_reward": 1.0, + "step": 488, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 370.375, + "epoch": 0.009293927587189965, + "grad_norm": 1.8962347858409532, + "kl": 0.064453125, + "learning_rate": 9.997868882137043e-07, + "loss": 0.0026, + "reward": 2.0177972316741943, + "reward_std": 0.32380202412605286, + "rewards/accuracy_reward": 0.8040472865104675, + "rewards/format_reward": 1.0, + "step": 489, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 434.2250061035156, + "epoch": 0.009312933574075834, + "grad_norm": 1.7829801566738295, + "kl": 0.0498046875, + "learning_rate": 9.997860157618667e-07, + "loss": 0.002, + "reward": 1.64120614528656, + "reward_std": 0.3396809995174408, + "rewards/accuracy_reward": 0.6849562525749207, + "rewards/format_reward": 0.9000000357627869, + "step": 490, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 407.1750183105469, + "epoch": 0.009331939560961703, + "grad_norm": 1.779304318494529, + "kl": 0.049072265625, + "learning_rate": 9.997851415282054e-07, + "loss": 0.002, + "reward": 1.8875000476837158, + "reward_std": 0.22840066254138947, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 491, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 379.0500183105469, + "epoch": 0.009350945547847572, + "grad_norm": 1.625905006496971, + "kl": 0.0654296875, + "learning_rate": 9.997842655127238e-07, + "loss": 0.0026, + "reward": 1.7537500858306885, + "reward_std": 0.018371161073446274, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 492, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 392.5500183105469, + "epoch": 0.009369951534733441, + "grad_norm": 2.3368460537892943, + "kl": 0.0615234375, + "learning_rate": 9.99783387715425e-07, + "loss": 0.0025, + "reward": 1.647323489189148, + "reward_std": 0.10424643009901047, + "rewards/accuracy_reward": 0.5535734295845032, + "rewards/format_reward": 1.0, + "step": 493, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 397.2250061035156, + "epoch": 0.00938895752161931, + "grad_norm": 2.0508207232179667, + "kl": 0.038818359375, + "learning_rate": 9.997825081363118e-07, + "loss": 0.0015, + "reward": 1.7971175909042358, + "reward_std": 0.042405229061841965, + "rewards/accuracy_reward": 0.680867612361908, + "rewards/format_reward": 1.0, + "step": 494, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 388.1750183105469, + "epoch": 0.009407963508505179, + "grad_norm": 1.4153900026035005, + "kl": 0.052734375, + "learning_rate": 9.997816267753875e-07, + "loss": 0.0021, + "reward": 1.71875, + "reward_std": 0.08908186107873917, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 495, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 447.9250183105469, + "epoch": 0.009426969495391048, + "grad_norm": 1.7138907316857963, + "kl": 0.057861328125, + "learning_rate": 9.997807436326553e-07, + "loss": 0.0023, + "reward": 1.59375, + "reward_std": 0.1288391649723053, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 0.800000011920929, + "step": 496, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 384.20001220703125, + "epoch": 0.009445975482276917, + "grad_norm": 1.8854658576800034, + "kl": 0.05908203125, + "learning_rate": 9.997798587081186e-07, + "loss": 0.0024, + "reward": 1.7962499856948853, + "reward_std": 0.278097003698349, + "rewards/accuracy_reward": 0.6975000500679016, + "rewards/format_reward": 1.0, + "step": 497, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 411.9750061035156, + "epoch": 0.009464981469162786, + "grad_norm": 3.4054225772597775, + "kl": 0.053466796875, + "learning_rate": 9.9977897200178e-07, + "loss": 0.0021, + "reward": 2.0822036266326904, + "reward_std": 0.040723223239183426, + "rewards/accuracy_reward": 0.8659538626670837, + "rewards/format_reward": 1.0, + "step": 498, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 424.6499938964844, + "epoch": 0.009483987456048655, + "grad_norm": 2.2012332646042796, + "kl": 0.06982421875, + "learning_rate": 9.99778083513643e-07, + "loss": 0.0028, + "reward": 2.231250047683716, + "reward_std": 0.03622094541788101, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 499, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 427.6000061035156, + "epoch": 0.009502993442934524, + "grad_norm": 1.4652239261353879, + "kl": 0.035400390625, + "learning_rate": 9.99777193243711e-07, + "loss": 0.0014, + "reward": 1.4185357093811035, + "reward_std": 0.2842453420162201, + "rewards/accuracy_reward": 0.40603572130203247, + "rewards/format_reward": 0.949999988079071, + "step": 500, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 442.32501220703125, + "epoch": 0.009521999429820393, + "grad_norm": 1.8280873574114165, + "kl": 0.05859375, + "learning_rate": 9.997763011919864e-07, + "loss": 0.0023, + "reward": 1.863806128501892, + "reward_std": 0.196150004863739, + "rewards/accuracy_reward": 0.7075561881065369, + "rewards/format_reward": 1.0, + "step": 501, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 415.5, + "epoch": 0.009541005416706263, + "grad_norm": 1.8075627828806675, + "kl": 0.07470703125, + "learning_rate": 9.997754073584732e-07, + "loss": 0.003, + "reward": 2.078749895095825, + "reward_std": 0.2691101133823395, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 1.0, + "step": 502, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 425.0, + "epoch": 0.009560011403592132, + "grad_norm": 2.5875098453575585, + "kl": 0.049560546875, + "learning_rate": 9.997745117431743e-07, + "loss": 0.002, + "reward": 1.2162787914276123, + "reward_std": 0.05266318470239639, + "rewards/accuracy_reward": 0.23002873361110687, + "rewards/format_reward": 1.0, + "step": 503, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 444.5, + "epoch": 0.009579017390478001, + "grad_norm": 6.085291880837437, + "kl": 0.0703125, + "learning_rate": 9.99773614346093e-07, + "loss": 0.0028, + "reward": 1.8537498712539673, + "reward_std": 0.2420050948858261, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 504, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 471.95001220703125, + "epoch": 0.00959802337736387, + "grad_norm": 1.648046194070453, + "kl": 0.06005859375, + "learning_rate": 9.99772715167232e-07, + "loss": 0.0024, + "reward": 1.6337499618530273, + "reward_std": 0.3327481746673584, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 505, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 430.0249938964844, + "epoch": 0.009617029364249739, + "grad_norm": 1.5849127897738289, + "kl": 0.048828125, + "learning_rate": 9.99771814206595e-07, + "loss": 0.002, + "reward": 1.763901948928833, + "reward_std": 0.18851494789123535, + "rewards/accuracy_reward": 0.6864018440246582, + "rewards/format_reward": 1.0, + "step": 506, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 441.32501220703125, + "epoch": 0.009636035351135608, + "grad_norm": 1.5959921680901905, + "kl": 0.0615234375, + "learning_rate": 9.99770911464185e-07, + "loss": 0.0025, + "reward": 2.2049999237060547, + "reward_std": 0.09811828285455704, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 507, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 478.5249938964844, + "epoch": 0.009655041338021477, + "grad_norm": 1.4051461165424177, + "kl": 0.03955078125, + "learning_rate": 9.997700069400054e-07, + "loss": 0.0016, + "reward": 1.3736310005187988, + "reward_std": 0.38330039381980896, + "rewards/accuracy_reward": 0.4586309492588043, + "rewards/format_reward": 0.925000011920929, + "step": 508, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 439.0, + "epoch": 0.009674047324907346, + "grad_norm": 2.0685341563787314, + "kl": 0.05322265625, + "learning_rate": 9.997691006340593e-07, + "loss": 0.0021, + "reward": 1.561039686203003, + "reward_std": 0.12414371222257614, + "rewards/accuracy_reward": 0.5460395812988281, + "rewards/format_reward": 1.0, + "step": 509, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 482.9750061035156, + "epoch": 0.009693053311793215, + "grad_norm": 1.421114213059028, + "kl": 0.044189453125, + "learning_rate": 9.9976819254635e-07, + "loss": 0.0018, + "reward": 1.9362499713897705, + "reward_std": 0.23603010177612305, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 510, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 437.95001220703125, + "epoch": 0.009712059298679084, + "grad_norm": 1.4247342133916885, + "kl": 0.062255859375, + "learning_rate": 9.997672826768806e-07, + "loss": 0.0025, + "reward": 1.8970905542373657, + "reward_std": 0.08062443137168884, + "rewards/accuracy_reward": 0.7470905780792236, + "rewards/format_reward": 1.0, + "step": 511, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 452.6750183105469, + "epoch": 0.009731065285564952, + "grad_norm": 1.6229637859363697, + "kl": 0.047119140625, + "learning_rate": 9.997663710256545e-07, + "loss": 0.0019, + "reward": 1.6080732345581055, + "reward_std": 0.1674114614725113, + "rewards/accuracy_reward": 0.5068233013153076, + "rewards/format_reward": 0.9750000238418579, + "step": 512, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 442.82501220703125, + "epoch": 0.009750071272450821, + "grad_norm": 1.5770019601527168, + "kl": 0.0634765625, + "learning_rate": 9.997654575926748e-07, + "loss": 0.0025, + "reward": 1.933750033378601, + "reward_std": 0.18966975808143616, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 513, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 483.2749938964844, + "epoch": 0.00976907725933669, + "grad_norm": 1.5718725128573952, + "kl": 0.04931640625, + "learning_rate": 9.99764542377945e-07, + "loss": 0.002, + "reward": 1.7304449081420898, + "reward_std": 0.20716892182826996, + "rewards/accuracy_reward": 0.676694929599762, + "rewards/format_reward": 0.9750000238418579, + "step": 514, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 411.2250061035156, + "epoch": 0.009788083246222561, + "grad_norm": 1.4474529353934316, + "kl": 0.0673828125, + "learning_rate": 9.99763625381468e-07, + "loss": 0.0027, + "reward": 1.3837499618530273, + "reward_std": 0.30521318316459656, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 1.0, + "step": 515, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 466.0500183105469, + "epoch": 0.00980708923310843, + "grad_norm": 2.0818827919319185, + "kl": 0.060791015625, + "learning_rate": 9.997627066032475e-07, + "loss": 0.0024, + "reward": 1.9562500715255737, + "reward_std": 0.28784844279289246, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 516, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 435.0, + "epoch": 0.009826095219994299, + "grad_norm": 1.924609376887236, + "kl": 0.06494140625, + "learning_rate": 9.997617860432864e-07, + "loss": 0.0026, + "reward": 1.8871879577636719, + "reward_std": 0.13538892567157745, + "rewards/accuracy_reward": 0.7396878600120544, + "rewards/format_reward": 1.0, + "step": 517, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 428.5249938964844, + "epoch": 0.009845101206880168, + "grad_norm": 1.840152705053356, + "kl": 0.0576171875, + "learning_rate": 9.99760863701588e-07, + "loss": 0.0023, + "reward": 1.9327281713485718, + "reward_std": 0.06402174383401871, + "rewards/accuracy_reward": 0.776478111743927, + "rewards/format_reward": 1.0, + "step": 518, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 529.875, + "epoch": 0.009864107193766037, + "grad_norm": 1.457960696500777, + "kl": 0.031005859375, + "learning_rate": 9.997599395781559e-07, + "loss": 0.0012, + "reward": 1.0476676225662231, + "reward_std": 0.3968335688114166, + "rewards/accuracy_reward": 0.4464176297187805, + "rewards/format_reward": 0.675000011920929, + "step": 519, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 488.3999938964844, + "epoch": 0.009883113180651906, + "grad_norm": 2.0649997556200983, + "kl": 0.037109375, + "learning_rate": 9.997590136729931e-07, + "loss": 0.0015, + "reward": 1.631659746170044, + "reward_std": 0.22774966061115265, + "rewards/accuracy_reward": 0.6929095387458801, + "rewards/format_reward": 0.949999988079071, + "step": 520, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 447.5, + "epoch": 0.009902119167537774, + "grad_norm": 1.8737860942870295, + "kl": 0.08349609375, + "learning_rate": 9.99758085986103e-07, + "loss": 0.0033, + "reward": 1.8599998950958252, + "reward_std": 0.19150128960609436, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 521, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 450.3500061035156, + "epoch": 0.009921125154423643, + "grad_norm": 1.647439142953048, + "kl": 0.06201171875, + "learning_rate": 9.997571565174892e-07, + "loss": 0.0025, + "reward": 1.7257064580917358, + "reward_std": 0.31141045689582825, + "rewards/accuracy_reward": 0.686956524848938, + "rewards/format_reward": 0.9750000238418579, + "step": 522, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 401.3000183105469, + "epoch": 0.009940131141309512, + "grad_norm": 2.161017413640146, + "kl": 0.09716796875, + "learning_rate": 9.997562252671545e-07, + "loss": 0.0039, + "reward": 2.2019996643066406, + "reward_std": 0.14086316525936127, + "rewards/accuracy_reward": 0.9194995760917664, + "rewards/format_reward": 1.0, + "step": 523, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 452.1000061035156, + "epoch": 0.009959137128195381, + "grad_norm": 3.977499471645308, + "kl": 0.060546875, + "learning_rate": 9.997552922351024e-07, + "loss": 0.0024, + "reward": 1.5689438581466675, + "reward_std": 0.06857716292142868, + "rewards/accuracy_reward": 0.5039438605308533, + "rewards/format_reward": 1.0, + "step": 524, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 469.5249938964844, + "epoch": 0.00997814311508125, + "grad_norm": 1.25780381342993, + "kl": 0.059326171875, + "learning_rate": 9.997543574213363e-07, + "loss": 0.0024, + "reward": 1.743749976158142, + "reward_std": 0.1468115597963333, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 525, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 442.7749938964844, + "epoch": 0.009997149101967119, + "grad_norm": 1.5561277581708837, + "kl": 0.05029296875, + "learning_rate": 9.997534208258596e-07, + "loss": 0.002, + "reward": 1.5413535833358765, + "reward_std": 0.1735600233078003, + "rewards/accuracy_reward": 0.4588536322116852, + "rewards/format_reward": 1.0, + "step": 526, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 448.57501220703125, + "epoch": 0.010016155088852988, + "grad_norm": 1.8192200758534172, + "kl": 0.06103515625, + "learning_rate": 9.997524824486754e-07, + "loss": 0.0024, + "reward": 2.0274999141693115, + "reward_std": 0.20530882477760315, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 527, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 416.45001220703125, + "epoch": 0.010035161075738857, + "grad_norm": 1.7830384198425893, + "kl": 0.068359375, + "learning_rate": 9.997515422897875e-07, + "loss": 0.0027, + "reward": 1.969956636428833, + "reward_std": 0.12310776859521866, + "rewards/accuracy_reward": 0.8174566626548767, + "rewards/format_reward": 1.0, + "step": 528, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 418.45001220703125, + "epoch": 0.010054167062624728, + "grad_norm": 1.29074227610966, + "kl": 0.062255859375, + "learning_rate": 9.997506003491988e-07, + "loss": 0.0025, + "reward": 1.8600000143051147, + "reward_std": 0.20849597454071045, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 529, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 416.3000183105469, + "epoch": 0.010073173049510597, + "grad_norm": 2.2381431785356507, + "kl": 0.057373046875, + "learning_rate": 9.997496566269127e-07, + "loss": 0.0023, + "reward": 1.5626453161239624, + "reward_std": 0.3552330434322357, + "rewards/accuracy_reward": 0.5088953375816345, + "rewards/format_reward": 0.925000011920929, + "step": 530, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 421.9750061035156, + "epoch": 0.010092179036396465, + "grad_norm": 24.485746925275997, + "kl": 0.03857421875, + "learning_rate": 9.997487111229328e-07, + "loss": 0.0015, + "reward": 1.503008484840393, + "reward_std": 0.04300389438867569, + "rewards/accuracy_reward": 0.40800848603248596, + "rewards/format_reward": 1.0, + "step": 531, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 410.3000183105469, + "epoch": 0.010111185023282334, + "grad_norm": 1.868644726796586, + "kl": 0.059326171875, + "learning_rate": 9.997477638372623e-07, + "loss": 0.0024, + "reward": 1.8612297773361206, + "reward_std": 0.2363087683916092, + "rewards/accuracy_reward": 0.7599797248840332, + "rewards/format_reward": 1.0, + "step": 532, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 401.6750183105469, + "epoch": 0.010130191010168203, + "grad_norm": 2.1563928038223703, + "kl": 0.060302734375, + "learning_rate": 9.997468147699044e-07, + "loss": 0.0024, + "reward": 2.0249557495117188, + "reward_std": 0.04264111444354057, + "rewards/accuracy_reward": 0.8487057089805603, + "rewards/format_reward": 1.0, + "step": 533, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 421.6000061035156, + "epoch": 0.010149196997054072, + "grad_norm": 1.6260224977457973, + "kl": 0.034912109375, + "learning_rate": 9.997458639208628e-07, + "loss": 0.0014, + "reward": 1.5147907733917236, + "reward_std": 0.32387611269950867, + "rewards/accuracy_reward": 0.48979073762893677, + "rewards/format_reward": 1.0, + "step": 534, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 452.4750061035156, + "epoch": 0.010168202983939941, + "grad_norm": 1.6739947191706201, + "kl": 0.059814453125, + "learning_rate": 9.99744911290141e-07, + "loss": 0.0024, + "reward": 1.777500033378601, + "reward_std": 0.3292026221752167, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 535, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 426.6750183105469, + "epoch": 0.01018720897082581, + "grad_norm": 2.191671938308414, + "kl": 0.053955078125, + "learning_rate": 9.99743956877742e-07, + "loss": 0.0022, + "reward": 1.7059297561645508, + "reward_std": 0.31015655398368835, + "rewards/accuracy_reward": 0.6221798062324524, + "rewards/format_reward": 0.949999988079071, + "step": 536, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 410.0, + "epoch": 0.010206214957711679, + "grad_norm": 2.1192463675187327, + "kl": 0.07470703125, + "learning_rate": 9.997430006836696e-07, + "loss": 0.003, + "reward": 2.161249876022339, + "reward_std": 0.2553601861000061, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 537, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 410.875, + "epoch": 0.010225220944597548, + "grad_norm": 1.7509901082763857, + "kl": 0.0478515625, + "learning_rate": 9.997420427079268e-07, + "loss": 0.0019, + "reward": 1.5246129035949707, + "reward_std": 0.1447751522064209, + "rewards/accuracy_reward": 0.4433629512786865, + "rewards/format_reward": 1.0, + "step": 538, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 427.32501220703125, + "epoch": 0.010244226931483417, + "grad_norm": 1.9569781491781264, + "kl": 0.05810546875, + "learning_rate": 9.997410829505174e-07, + "loss": 0.0023, + "reward": 1.6866406202316284, + "reward_std": 0.2554178535938263, + "rewards/accuracy_reward": 0.6703906059265137, + "rewards/format_reward": 1.0, + "step": 539, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 396.1499938964844, + "epoch": 0.010263232918369286, + "grad_norm": 1.9090177382978537, + "kl": 0.0703125, + "learning_rate": 9.997401214114444e-07, + "loss": 0.0028, + "reward": 1.9938347339630127, + "reward_std": 0.21185918152332306, + "rewards/accuracy_reward": 0.8288349509239197, + "rewards/format_reward": 1.0, + "step": 540, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 425.4250183105469, + "epoch": 0.010282238905255155, + "grad_norm": 2.949924736464448, + "kl": 0.0693359375, + "learning_rate": 9.997391580907118e-07, + "loss": 0.0028, + "reward": 1.9996589422225952, + "reward_std": 0.12238943576812744, + "rewards/accuracy_reward": 0.7909091114997864, + "rewards/format_reward": 1.0, + "step": 541, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 453.6499938964844, + "epoch": 0.010301244892141024, + "grad_norm": 1.59182327289358, + "kl": 0.044921875, + "learning_rate": 9.997381929883225e-07, + "loss": 0.0018, + "reward": 1.8970184326171875, + "reward_std": 0.24323752522468567, + "rewards/accuracy_reward": 0.7957685589790344, + "rewards/format_reward": 0.9750000238418579, + "step": 542, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 428.625, + "epoch": 0.010320250879026894, + "grad_norm": 2.1514497795864083, + "kl": 0.058349609375, + "learning_rate": 9.9973722610428e-07, + "loss": 0.0023, + "reward": 1.7581989765167236, + "reward_std": 0.21770977973937988, + "rewards/accuracy_reward": 0.7731990218162537, + "rewards/format_reward": 0.9750000238418579, + "step": 543, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 429.6499938964844, + "epoch": 0.010339256865912763, + "grad_norm": 3.768053852370147, + "kl": 0.083984375, + "learning_rate": 9.99736257438588e-07, + "loss": 0.0033, + "reward": 2.1200387477874756, + "reward_std": 0.038123708218336105, + "rewards/accuracy_reward": 0.8837887644767761, + "rewards/format_reward": 1.0, + "step": 544, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 412.3500061035156, + "epoch": 0.010358262852798632, + "grad_norm": 1.8367108151585614, + "kl": 0.080078125, + "learning_rate": 9.997352869912499e-07, + "loss": 0.0032, + "reward": 1.7112499475479126, + "reward_std": 0.12422802299261093, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 545, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 440.3999938964844, + "epoch": 0.010377268839684501, + "grad_norm": 2.2196999092149823, + "kl": 0.078125, + "learning_rate": 9.99734314762269e-07, + "loss": 0.0031, + "reward": 1.7476776838302612, + "reward_std": 0.25022193789482117, + "rewards/accuracy_reward": 0.5614275932312012, + "rewards/format_reward": 1.0, + "step": 546, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 445.7749938964844, + "epoch": 0.01039627482657037, + "grad_norm": 1.6856817799618924, + "kl": 0.052490234375, + "learning_rate": 9.99733340751649e-07, + "loss": 0.0021, + "reward": 1.5888121128082275, + "reward_std": 0.27665263414382935, + "rewards/accuracy_reward": 0.5363120436668396, + "rewards/format_reward": 0.9750000238418579, + "step": 547, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 440.45001220703125, + "epoch": 0.010415280813456239, + "grad_norm": 1.9253001434590515, + "kl": 0.055908203125, + "learning_rate": 9.997323649593932e-07, + "loss": 0.0022, + "reward": 1.8682085275650024, + "reward_std": 0.16223637759685516, + "rewards/accuracy_reward": 0.7207085490226746, + "rewards/format_reward": 1.0, + "step": 548, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 435.6750183105469, + "epoch": 0.010434286800342108, + "grad_norm": 2.0731845286020967, + "kl": 0.08154296875, + "learning_rate": 9.997313873855052e-07, + "loss": 0.0033, + "reward": 1.549357295036316, + "reward_std": 0.30646300315856934, + "rewards/accuracy_reward": 0.3893572986125946, + "rewards/format_reward": 1.0, + "step": 549, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 421.4250183105469, + "epoch": 0.010453292787227977, + "grad_norm": 3.059206823205699, + "kl": 0.08740234375, + "learning_rate": 9.997304080299883e-07, + "loss": 0.0035, + "reward": 1.9794644117355347, + "reward_std": 0.18326017260551453, + "rewards/accuracy_reward": 0.7357142567634583, + "rewards/format_reward": 1.0, + "step": 550, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 439.8999938964844, + "epoch": 0.010472298774113846, + "grad_norm": 1.8211971498851012, + "kl": 0.068359375, + "learning_rate": 9.99729426892846e-07, + "loss": 0.0027, + "reward": 1.962499976158142, + "reward_std": 0.1277000904083252, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 551, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 435.57501220703125, + "epoch": 0.010491304760999715, + "grad_norm": 1.930959635006113, + "kl": 0.0673828125, + "learning_rate": 9.997284439740818e-07, + "loss": 0.0027, + "reward": 2.1675000190734863, + "reward_std": 0.14230495691299438, + "rewards/accuracy_reward": 0.9524999856948853, + "rewards/format_reward": 1.0, + "step": 552, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 429.1750183105469, + "epoch": 0.010510310747885583, + "grad_norm": 1.5425747829899408, + "kl": 0.052001953125, + "learning_rate": 9.997274592736995e-07, + "loss": 0.0021, + "reward": 1.7487499713897705, + "reward_std": 0.03251330554485321, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 553, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 419.75, + "epoch": 0.010529316734771452, + "grad_norm": 3.186369692711284, + "kl": 0.07666015625, + "learning_rate": 9.997264727917025e-07, + "loss": 0.0031, + "reward": 1.5479166507720947, + "reward_std": 0.3285001814365387, + "rewards/accuracy_reward": 0.44166669249534607, + "rewards/format_reward": 0.9750000238418579, + "step": 554, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 424.7250061035156, + "epoch": 0.010548322721657321, + "grad_norm": 2.0793191587907582, + "kl": 0.0673828125, + "learning_rate": 9.99725484528094e-07, + "loss": 0.0027, + "reward": 1.929342269897461, + "reward_std": 0.23153726756572723, + "rewards/accuracy_reward": 0.7868421077728271, + "rewards/format_reward": 1.0, + "step": 555, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 429.7749938964844, + "epoch": 0.010567328708543192, + "grad_norm": 2.382159792702014, + "kl": 0.055908203125, + "learning_rate": 9.99724494482878e-07, + "loss": 0.0022, + "reward": 1.8536278009414673, + "reward_std": 0.16179294884204865, + "rewards/accuracy_reward": 0.7536277770996094, + "rewards/format_reward": 1.0, + "step": 556, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 430.9250183105469, + "epoch": 0.010586334695429061, + "grad_norm": 2.1033740533698104, + "kl": 0.0712890625, + "learning_rate": 9.997235026560576e-07, + "loss": 0.0028, + "reward": 2.291249990463257, + "reward_std": 0.036220937967300415, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 557, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 435.8500061035156, + "epoch": 0.01060534068231493, + "grad_norm": 5.011974977628457, + "kl": 0.0712890625, + "learning_rate": 9.997225090476364e-07, + "loss": 0.0028, + "reward": 2.0250000953674316, + "reward_std": 0.3077133297920227, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.949999988079071, + "step": 558, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 446.0249938964844, + "epoch": 0.010624346669200799, + "grad_norm": 1.6251668542112612, + "kl": 0.07861328125, + "learning_rate": 9.997215136576183e-07, + "loss": 0.0031, + "reward": 1.9399999380111694, + "reward_std": 0.14692406356334686, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 0.9750000238418579, + "step": 559, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 461.125, + "epoch": 0.010643352656086668, + "grad_norm": 1.4719581109155024, + "kl": 0.0380859375, + "learning_rate": 9.997205164860066e-07, + "loss": 0.0015, + "reward": 1.6075143814086914, + "reward_std": 0.30150026082992554, + "rewards/accuracy_reward": 0.6612643599510193, + "rewards/format_reward": 0.949999988079071, + "step": 560, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 447.82501220703125, + "epoch": 0.010662358642972537, + "grad_norm": 1.8722311584053601, + "kl": 0.07470703125, + "learning_rate": 9.997195175328048e-07, + "loss": 0.003, + "reward": 1.6799999475479126, + "reward_std": 0.2983327805995941, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 561, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 493.375, + "epoch": 0.010681364629858405, + "grad_norm": 2.5175396066040867, + "kl": 0.04345703125, + "learning_rate": 9.997185167980164e-07, + "loss": 0.0017, + "reward": 1.2380895614624023, + "reward_std": 0.5283424258232117, + "rewards/accuracy_reward": 0.3505896329879761, + "rewards/format_reward": 0.925000011920929, + "step": 562, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 421.6000061035156, + "epoch": 0.010700370616744274, + "grad_norm": 1.2818517472753972, + "kl": 0.03955078125, + "learning_rate": 9.997175142816452e-07, + "loss": 0.0016, + "reward": 1.6775001287460327, + "reward_std": 0.12553423643112183, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 563, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 404.625, + "epoch": 0.010719376603630143, + "grad_norm": 1.7596950112705625, + "kl": 0.07275390625, + "learning_rate": 9.997165099836945e-07, + "loss": 0.0029, + "reward": 1.8583303689956665, + "reward_std": 0.12296537309885025, + "rewards/accuracy_reward": 0.7608304023742676, + "rewards/format_reward": 1.0, + "step": 564, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 449.70001220703125, + "epoch": 0.010738382590516012, + "grad_norm": 2.0564729193203313, + "kl": 0.06640625, + "learning_rate": 9.997155039041684e-07, + "loss": 0.0027, + "reward": 1.8003486394882202, + "reward_std": 0.20200787484645844, + "rewards/accuracy_reward": 0.6553487777709961, + "rewards/format_reward": 1.0, + "step": 565, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 450.95001220703125, + "epoch": 0.010757388577401881, + "grad_norm": 1.8121228815285055, + "kl": 0.08251953125, + "learning_rate": 9.9971449604307e-07, + "loss": 0.0033, + "reward": 1.868749976158142, + "reward_std": 0.18496833741664886, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 566, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 432.875, + "epoch": 0.01077639456428775, + "grad_norm": 3.497682098835307, + "kl": 0.0625, + "learning_rate": 9.997134864004028e-07, + "loss": 0.0025, + "reward": 1.529032588005066, + "reward_std": 0.06469090282917023, + "rewards/accuracy_reward": 0.45028257369995117, + "rewards/format_reward": 1.0, + "step": 567, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 436.70001220703125, + "epoch": 0.010795400551173619, + "grad_norm": 2.4668971310469705, + "kl": 0.08154296875, + "learning_rate": 9.997124749761708e-07, + "loss": 0.0033, + "reward": 1.6386276483535767, + "reward_std": 0.15997079014778137, + "rewards/accuracy_reward": 0.5061275362968445, + "rewards/format_reward": 1.0, + "step": 568, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 455.375, + "epoch": 0.010814406538059488, + "grad_norm": 1.7766542047831666, + "kl": 0.06689453125, + "learning_rate": 9.997114617703774e-07, + "loss": 0.0027, + "reward": 1.590166687965393, + "reward_std": 0.3435172140598297, + "rewards/accuracy_reward": 0.515166699886322, + "rewards/format_reward": 0.9750000238418579, + "step": 569, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 451.7250061035156, + "epoch": 0.010833412524945359, + "grad_norm": 2.0663020555330553, + "kl": 0.060302734375, + "learning_rate": 9.997104467830264e-07, + "loss": 0.0024, + "reward": 1.84375, + "reward_std": 0.2997171878814697, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 0.925000011920929, + "step": 570, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 427.0, + "epoch": 0.010852418511831228, + "grad_norm": 1.9441534396138849, + "kl": 0.061279296875, + "learning_rate": 9.99709430014121e-07, + "loss": 0.0024, + "reward": 1.9108333587646484, + "reward_std": 0.1315259039402008, + "rewards/accuracy_reward": 0.753333330154419, + "rewards/format_reward": 1.0, + "step": 571, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 461.0, + "epoch": 0.010871424498717096, + "grad_norm": 5.2720125799568445, + "kl": 0.06640625, + "learning_rate": 9.997084114636653e-07, + "loss": 0.0027, + "reward": 1.712392807006836, + "reward_std": 0.18492774665355682, + "rewards/accuracy_reward": 0.5636427998542786, + "rewards/format_reward": 0.949999988079071, + "step": 572, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 454.2749938964844, + "epoch": 0.010890430485602965, + "grad_norm": 3.7820635582263407, + "kl": 0.060791015625, + "learning_rate": 9.997073911316625e-07, + "loss": 0.0024, + "reward": 1.8587497472763062, + "reward_std": 0.11446737498044968, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 573, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 394.0, + "epoch": 0.010909436472488834, + "grad_norm": 2.9854145728478922, + "kl": 0.072265625, + "learning_rate": 9.997063690181166e-07, + "loss": 0.0029, + "reward": 1.9924999475479126, + "reward_std": 0.12202110141515732, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 574, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 429.1000061035156, + "epoch": 0.010928442459374703, + "grad_norm": 7.321171036237711, + "kl": 0.048583984375, + "learning_rate": 9.99705345123031e-07, + "loss": 0.0019, + "reward": 1.4943394660949707, + "reward_std": 0.23887300491333008, + "rewards/accuracy_reward": 0.42433950304985046, + "rewards/format_reward": 1.0, + "step": 575, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 447.7749938964844, + "epoch": 0.010947448446260572, + "grad_norm": 7.609787721551616, + "kl": 0.056884765625, + "learning_rate": 9.997043194464097e-07, + "loss": 0.0023, + "reward": 1.8877452611923218, + "reward_std": 0.08956362307071686, + "rewards/accuracy_reward": 0.7914954423904419, + "rewards/format_reward": 1.0, + "step": 576, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 420.6750183105469, + "epoch": 0.010966454433146441, + "grad_norm": 1.7424394249563804, + "kl": 0.0634765625, + "learning_rate": 9.997032919882557e-07, + "loss": 0.0025, + "reward": 2.137741804122925, + "reward_std": 0.10617410391569138, + "rewards/accuracy_reward": 0.9677419662475586, + "rewards/format_reward": 1.0, + "step": 577, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 438.1000061035156, + "epoch": 0.01098546042003231, + "grad_norm": 2.1309296570794722, + "kl": 0.07275390625, + "learning_rate": 9.997022627485733e-07, + "loss": 0.0029, + "reward": 1.8008911609649658, + "reward_std": 0.058240581303834915, + "rewards/accuracy_reward": 0.6608911752700806, + "rewards/format_reward": 1.0, + "step": 578, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 469.3500061035156, + "epoch": 0.011004466406918179, + "grad_norm": 3.070584243478517, + "kl": 0.029296875, + "learning_rate": 9.99701231727366e-07, + "loss": 0.0012, + "reward": 1.7209606170654297, + "reward_std": 0.3849009871482849, + "rewards/accuracy_reward": 0.7397105097770691, + "rewards/format_reward": 0.925000011920929, + "step": 579, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 444.1750183105469, + "epoch": 0.011023472393804048, + "grad_norm": 3.4046461154933247, + "kl": 0.0634765625, + "learning_rate": 9.997001989246375e-07, + "loss": 0.0025, + "reward": 1.8424999713897705, + "reward_std": 0.13863466680049896, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 580, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 451.4250183105469, + "epoch": 0.011042478380689917, + "grad_norm": 2.120796271990568, + "kl": 0.0634765625, + "learning_rate": 9.99699164340391e-07, + "loss": 0.0025, + "reward": 1.9725000858306885, + "reward_std": 0.19706712663173676, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 581, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 419.375, + "epoch": 0.011061484367575786, + "grad_norm": 1.5933497265646501, + "kl": 0.058837890625, + "learning_rate": 9.996981279746309e-07, + "loss": 0.0024, + "reward": 1.778942346572876, + "reward_std": 0.2655041813850403, + "rewards/accuracy_reward": 0.7076923251152039, + "rewards/format_reward": 0.9750000238418579, + "step": 582, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 463.95001220703125, + "epoch": 0.011080490354461655, + "grad_norm": 2.068419841779232, + "kl": 0.0576171875, + "learning_rate": 9.996970898273605e-07, + "loss": 0.0023, + "reward": 1.7924998998641968, + "reward_std": 0.29065975546836853, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 0.9000000357627869, + "step": 583, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 411.1499938964844, + "epoch": 0.011099496341347525, + "grad_norm": 1.9404332385370395, + "kl": 0.0849609375, + "learning_rate": 9.996960498985835e-07, + "loss": 0.0034, + "reward": 2.0962026119232178, + "reward_std": 0.042331378906965256, + "rewards/accuracy_reward": 0.7999525666236877, + "rewards/format_reward": 1.0, + "step": 584, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 455.1750183105469, + "epoch": 0.011118502328233394, + "grad_norm": 1.5837026435663861, + "kl": 0.047119140625, + "learning_rate": 9.99695008188304e-07, + "loss": 0.0019, + "reward": 1.7075976133346558, + "reward_std": 0.0701654776930809, + "rewards/accuracy_reward": 0.6163474917411804, + "rewards/format_reward": 1.0, + "step": 585, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 446.45001220703125, + "epoch": 0.011137508315119263, + "grad_norm": 2.0138325412935068, + "kl": 0.053955078125, + "learning_rate": 9.99693964696525e-07, + "loss": 0.0022, + "reward": 1.6336510181427002, + "reward_std": 0.18786856532096863, + "rewards/accuracy_reward": 0.4961509704589844, + "rewards/format_reward": 1.0, + "step": 586, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 398.8000183105469, + "epoch": 0.011156514302005132, + "grad_norm": 1.8654457872792787, + "kl": 0.0830078125, + "learning_rate": 9.99692919423251e-07, + "loss": 0.0033, + "reward": 1.6038554906845093, + "reward_std": 0.05064338445663452, + "rewards/accuracy_reward": 0.5063557028770447, + "rewards/format_reward": 1.0, + "step": 587, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 469.45001220703125, + "epoch": 0.011175520288891001, + "grad_norm": 1.739404524855777, + "kl": 0.056396484375, + "learning_rate": 9.99691872368485e-07, + "loss": 0.0023, + "reward": 1.8059364557266235, + "reward_std": 0.33440202474594116, + "rewards/accuracy_reward": 0.6834363341331482, + "rewards/format_reward": 1.0, + "step": 588, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 427.3500061035156, + "epoch": 0.01119452627577687, + "grad_norm": 2.541403588992428, + "kl": 0.04931640625, + "learning_rate": 9.996908235322312e-07, + "loss": 0.002, + "reward": 1.561975359916687, + "reward_std": 0.09330564737319946, + "rewards/accuracy_reward": 0.5307253003120422, + "rewards/format_reward": 1.0, + "step": 589, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 433.125, + "epoch": 0.011213532262662739, + "grad_norm": 1.7224078447156417, + "kl": 0.07421875, + "learning_rate": 9.996897729144933e-07, + "loss": 0.003, + "reward": 1.8650000095367432, + "reward_std": 0.3313520848751068, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 590, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 419.0, + "epoch": 0.011232538249548608, + "grad_norm": 1.804958188799846, + "kl": 0.047119140625, + "learning_rate": 9.996887205152748e-07, + "loss": 0.0019, + "reward": 1.6495860815048218, + "reward_std": 0.13574181497097015, + "rewards/accuracy_reward": 0.5445861220359802, + "rewards/format_reward": 0.9750000238418579, + "step": 591, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 451.1750183105469, + "epoch": 0.011251544236434477, + "grad_norm": 1.94029667751913, + "kl": 0.060791015625, + "learning_rate": 9.9968766633458e-07, + "loss": 0.0024, + "reward": 1.8676280975341797, + "reward_std": 0.0523289330303669, + "rewards/accuracy_reward": 0.7163779735565186, + "rewards/format_reward": 1.0, + "step": 592, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 460.82501220703125, + "epoch": 0.011270550223320346, + "grad_norm": 2.876341928937013, + "kl": 0.0673828125, + "learning_rate": 9.996866103724119e-07, + "loss": 0.0027, + "reward": 1.8318378925323486, + "reward_std": 0.13642318546772003, + "rewards/accuracy_reward": 0.6943378448486328, + "rewards/format_reward": 1.0, + "step": 593, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 448.95001220703125, + "epoch": 0.011289556210206214, + "grad_norm": 3.606542369974204, + "kl": 0.0751953125, + "learning_rate": 9.996855526287748e-07, + "loss": 0.003, + "reward": 1.8008333444595337, + "reward_std": 0.23959660530090332, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/format_reward": 0.9750000238418579, + "step": 594, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 455.6499938964844, + "epoch": 0.011308562197092083, + "grad_norm": 1.6976024232220916, + "kl": 0.050048828125, + "learning_rate": 9.996844931036723e-07, + "loss": 0.002, + "reward": 1.5475986003875732, + "reward_std": 0.24376149475574493, + "rewards/accuracy_reward": 0.5800986289978027, + "rewards/format_reward": 0.8500000238418579, + "step": 595, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 485.57501220703125, + "epoch": 0.011327568183977952, + "grad_norm": 1.3563633391799217, + "kl": 0.0693359375, + "learning_rate": 9.99683431797108e-07, + "loss": 0.0028, + "reward": 1.1493602991104126, + "reward_std": 0.09293808788061142, + "rewards/accuracy_reward": 0.3168603479862213, + "rewards/format_reward": 0.800000011920929, + "step": 596, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 434.25, + "epoch": 0.011346574170863821, + "grad_norm": 1.6313913910403548, + "kl": 0.054443359375, + "learning_rate": 9.996823687090861e-07, + "loss": 0.0022, + "reward": 2.0971591472625732, + "reward_std": 0.06261853128671646, + "rewards/accuracy_reward": 0.9909090995788574, + "rewards/format_reward": 1.0, + "step": 597, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 452.75, + "epoch": 0.011365580157749692, + "grad_norm": 1.4532476788156559, + "kl": 0.0693359375, + "learning_rate": 9.996813038396102e-07, + "loss": 0.0028, + "reward": 1.875, + "reward_std": 0.18700535595417023, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 0.925000011920929, + "step": 598, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 446.1499938964844, + "epoch": 0.01138458614463556, + "grad_norm": 1.8266604983575412, + "kl": 0.06884765625, + "learning_rate": 9.99680237188684e-07, + "loss": 0.0028, + "reward": 1.57846999168396, + "reward_std": 0.18595215678215027, + "rewards/accuracy_reward": 0.5334700345993042, + "rewards/format_reward": 1.0, + "step": 599, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.6, + "completion_length": 420.6000061035156, + "epoch": 0.01140359213152143, + "grad_norm": 1.4683311607751286, + "kl": 0.0556640625, + "learning_rate": 9.99679168756311e-07, + "loss": 0.0022, + "reward": 1.1973066329956055, + "reward_std": 0.17622622847557068, + "rewards/accuracy_reward": 0.1673065721988678, + "rewards/format_reward": 1.0, + "step": 600, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 411.6750183105469, + "epoch": 0.011422598118407299, + "grad_norm": 4.324589723372153, + "kl": 0.0771484375, + "learning_rate": 9.99678098542496e-07, + "loss": 0.0031, + "reward": 2.0297563076019287, + "reward_std": 0.17133544385433197, + "rewards/accuracy_reward": 0.8635061383247375, + "rewards/format_reward": 1.0, + "step": 601, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 431.5249938964844, + "epoch": 0.011441604105293168, + "grad_norm": 2.2166544804561843, + "kl": 0.061767578125, + "learning_rate": 9.996770265472418e-07, + "loss": 0.0025, + "reward": 1.6730873584747314, + "reward_std": 0.08894231915473938, + "rewards/accuracy_reward": 0.5393373370170593, + "rewards/format_reward": 1.0, + "step": 602, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 463.875, + "epoch": 0.011460610092179037, + "grad_norm": 1.674720354156424, + "kl": 0.064453125, + "learning_rate": 9.996759527705526e-07, + "loss": 0.0026, + "reward": 1.9574998617172241, + "reward_std": 0.19369134306907654, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 603, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 445.32501220703125, + "epoch": 0.011479616079064905, + "grad_norm": 1.6980972175217657, + "kl": 0.06787109375, + "learning_rate": 9.996748772124324e-07, + "loss": 0.0027, + "reward": 2.0137500762939453, + "reward_std": 0.14856313169002533, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 604, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 436.25, + "epoch": 0.011498622065950774, + "grad_norm": 2.500974902897863, + "kl": 0.0908203125, + "learning_rate": 9.996737998728849e-07, + "loss": 0.0036, + "reward": 2.077589750289917, + "reward_std": 0.16722789406776428, + "rewards/accuracy_reward": 0.8088399767875671, + "rewards/format_reward": 1.0, + "step": 605, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 429.75, + "epoch": 0.011517628052836643, + "grad_norm": 1.947607128316437, + "kl": 0.083984375, + "learning_rate": 9.996727207519138e-07, + "loss": 0.0034, + "reward": 2.1837499141693115, + "reward_std": 0.1226191446185112, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 1.0, + "step": 606, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 432.9250183105469, + "epoch": 0.011536634039722512, + "grad_norm": 3.156271591850942, + "kl": 0.083984375, + "learning_rate": 9.996716398495229e-07, + "loss": 0.0034, + "reward": 1.9579235315322876, + "reward_std": 0.17999373376369476, + "rewards/accuracy_reward": 0.730423629283905, + "rewards/format_reward": 1.0, + "step": 607, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 433.3500061035156, + "epoch": 0.011555640026608381, + "grad_norm": 1.6373683781484616, + "kl": 0.0615234375, + "learning_rate": 9.996705571657165e-07, + "loss": 0.0025, + "reward": 1.7975000143051147, + "reward_std": 0.12626336514949799, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 608, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 420.95001220703125, + "epoch": 0.01157464601349425, + "grad_norm": 1.9431819870080271, + "kl": 0.062255859375, + "learning_rate": 9.996694727004979e-07, + "loss": 0.0025, + "reward": 1.6727215051651, + "reward_std": 0.2554248869419098, + "rewards/accuracy_reward": 0.5839714407920837, + "rewards/format_reward": 1.0, + "step": 609, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 415.3999938964844, + "epoch": 0.011593652000380119, + "grad_norm": 1.6287603168584877, + "kl": 0.0712890625, + "learning_rate": 9.996683864538716e-07, + "loss": 0.0028, + "reward": 2.0281901359558105, + "reward_std": 0.04945867881178856, + "rewards/accuracy_reward": 0.8719400763511658, + "rewards/format_reward": 1.0, + "step": 610, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 471.1499938964844, + "epoch": 0.01161265798726599, + "grad_norm": 1.925339316876235, + "kl": 0.07080078125, + "learning_rate": 9.996672984258408e-07, + "loss": 0.0028, + "reward": 2.042964220046997, + "reward_std": 0.1353432983160019, + "rewards/accuracy_reward": 0.8554641604423523, + "rewards/format_reward": 1.0, + "step": 611, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 467.45001220703125, + "epoch": 0.011631663974151859, + "grad_norm": 1.768305528014443, + "kl": 0.051025390625, + "learning_rate": 9.996662086164098e-07, + "loss": 0.002, + "reward": 1.9012501239776611, + "reward_std": 0.2875203490257263, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 612, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 400.375, + "epoch": 0.011650669961037727, + "grad_norm": 1.983511940413985, + "kl": 0.062255859375, + "learning_rate": 9.996651170255822e-07, + "loss": 0.0025, + "reward": 1.655466914176941, + "reward_std": 0.35578984022140503, + "rewards/accuracy_reward": 0.5692169070243835, + "rewards/format_reward": 1.0, + "step": 613, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 433.7749938964844, + "epoch": 0.011669675947923596, + "grad_norm": 1.4749190160715435, + "kl": 0.08056640625, + "learning_rate": 9.996640236533624e-07, + "loss": 0.0032, + "reward": 1.9987499713897705, + "reward_std": 0.2163955271244049, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 1.0, + "step": 614, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 443.95001220703125, + "epoch": 0.011688681934809465, + "grad_norm": 1.918478728157044, + "kl": 0.07470703125, + "learning_rate": 9.996629284997538e-07, + "loss": 0.003, + "reward": 2.009999990463257, + "reward_std": 0.12332119792699814, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 615, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 469.1000061035156, + "epoch": 0.011707687921695334, + "grad_norm": 2.346581040758844, + "kl": 0.05615234375, + "learning_rate": 9.996618315647606e-07, + "loss": 0.0023, + "reward": 1.4388395547866821, + "reward_std": 0.29419374465942383, + "rewards/accuracy_reward": 0.37758952379226685, + "rewards/format_reward": 1.0, + "step": 616, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 459.7749938964844, + "epoch": 0.011726693908581203, + "grad_norm": 2.5079159276012684, + "kl": 0.083984375, + "learning_rate": 9.996607328483863e-07, + "loss": 0.0034, + "reward": 2.0762500762939453, + "reward_std": 0.18828822672367096, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 617, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 483.2749938964844, + "epoch": 0.011745699895467072, + "grad_norm": 1.3953719435878342, + "kl": 0.0595703125, + "learning_rate": 9.996596323506355e-07, + "loss": 0.0024, + "reward": 1.5387500524520874, + "reward_std": 0.3371405005455017, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 618, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 462.3000183105469, + "epoch": 0.011764705882352941, + "grad_norm": 1.4392682872300198, + "kl": 0.0908203125, + "learning_rate": 9.996585300715115e-07, + "loss": 0.0036, + "reward": 1.6375000476837158, + "reward_std": 0.2294822484254837, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 619, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 457.2250061035156, + "epoch": 0.01178371186923881, + "grad_norm": 2.6246094174848236, + "kl": 0.0703125, + "learning_rate": 9.996574260110183e-07, + "loss": 0.0028, + "reward": 1.6786422729492188, + "reward_std": 0.30471470952033997, + "rewards/accuracy_reward": 0.6811421513557434, + "rewards/format_reward": 0.949999988079071, + "step": 620, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 461.1499938964844, + "epoch": 0.011802717856124679, + "grad_norm": 1.3656036894578758, + "kl": 0.08056640625, + "learning_rate": 9.996563201691602e-07, + "loss": 0.0032, + "reward": 1.6598033905029297, + "reward_std": 0.14326943457126617, + "rewards/accuracy_reward": 0.516053318977356, + "rewards/format_reward": 1.0, + "step": 621, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 456.8000183105469, + "epoch": 0.011821723843010548, + "grad_norm": 2.0122664631701905, + "kl": 0.076171875, + "learning_rate": 9.996552125459408e-07, + "loss": 0.003, + "reward": 1.9619076251983643, + "reward_std": 0.07952728122472763, + "rewards/accuracy_reward": 0.745657742023468, + "rewards/format_reward": 1.0, + "step": 622, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 467.5500183105469, + "epoch": 0.011840729829896417, + "grad_norm": 1.4686695012801096, + "kl": 0.037841796875, + "learning_rate": 9.996541031413643e-07, + "loss": 0.0015, + "reward": 1.6117315292358398, + "reward_std": 0.3073624074459076, + "rewards/accuracy_reward": 0.6267315149307251, + "rewards/format_reward": 0.925000011920929, + "step": 623, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 461.20001220703125, + "epoch": 0.011859735816782286, + "grad_norm": 3.0580021522718965, + "kl": 0.07470703125, + "learning_rate": 9.996529919554345e-07, + "loss": 0.003, + "reward": 1.564117670059204, + "reward_std": 0.255709707736969, + "rewards/accuracy_reward": 0.5378676652908325, + "rewards/format_reward": 0.949999988079071, + "step": 624, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 444.125, + "epoch": 0.011878741803668156, + "grad_norm": 1.9487338400428331, + "kl": 0.09521484375, + "learning_rate": 9.996518789881555e-07, + "loss": 0.0038, + "reward": 2.1648502349853516, + "reward_std": 0.04231090843677521, + "rewards/accuracy_reward": 0.9986003041267395, + "rewards/format_reward": 1.0, + "step": 625, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 406.375, + "epoch": 0.011897747790554025, + "grad_norm": 1.9128282239923182, + "kl": 0.0869140625, + "learning_rate": 9.996507642395308e-07, + "loss": 0.0035, + "reward": 1.518365502357483, + "reward_std": 0.27028393745422363, + "rewards/accuracy_reward": 0.3833654820919037, + "rewards/format_reward": 1.0, + "step": 626, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 458.45001220703125, + "epoch": 0.011916753777439894, + "grad_norm": 1.6452795615978173, + "kl": 0.0908203125, + "learning_rate": 9.996496477095651e-07, + "loss": 0.0036, + "reward": 1.7387501001358032, + "reward_std": 0.2975967824459076, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 627, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 444.0500183105469, + "epoch": 0.011935759764325763, + "grad_norm": 1.7520423278402224, + "kl": 0.10986328125, + "learning_rate": 9.996485293982619e-07, + "loss": 0.0044, + "reward": 2.1905357837677, + "reward_std": 0.0547470822930336, + "rewards/accuracy_reward": 0.9642857909202576, + "rewards/format_reward": 1.0, + "step": 628, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 432.8000183105469, + "epoch": 0.011954765751211632, + "grad_norm": 1.919723592207281, + "kl": 0.06787109375, + "learning_rate": 9.996474093056252e-07, + "loss": 0.0027, + "reward": 1.7476999759674072, + "reward_std": 0.16914092004299164, + "rewards/accuracy_reward": 0.7076999545097351, + "rewards/format_reward": 1.0, + "step": 629, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 483.125, + "epoch": 0.011973771738097501, + "grad_norm": 1.8779415515611733, + "kl": 0.09326171875, + "learning_rate": 9.996462874316594e-07, + "loss": 0.0037, + "reward": 1.7400000095367432, + "reward_std": 0.15443065762519836, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 1.0, + "step": 630, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 447.6499938964844, + "epoch": 0.01199277772498337, + "grad_norm": 1.8232683787485553, + "kl": 0.08056640625, + "learning_rate": 9.99645163776368e-07, + "loss": 0.0032, + "reward": 1.8219269514083862, + "reward_std": 0.31342241168022156, + "rewards/accuracy_reward": 0.6906768083572388, + "rewards/format_reward": 1.0, + "step": 631, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 424.57501220703125, + "epoch": 0.012011783711869239, + "grad_norm": 1.424304229543566, + "kl": 0.0859375, + "learning_rate": 9.99644038339755e-07, + "loss": 0.0034, + "reward": 1.597337245941162, + "reward_std": 0.0419875867664814, + "rewards/accuracy_reward": 0.453587144613266, + "rewards/format_reward": 1.0, + "step": 632, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 457.6750183105469, + "epoch": 0.012030789698755108, + "grad_norm": 1.8217729732219172, + "kl": 0.07666015625, + "learning_rate": 9.99642911121825e-07, + "loss": 0.0031, + "reward": 1.590000033378601, + "reward_std": 0.25514116883277893, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 1.0, + "step": 633, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 449.9250183105469, + "epoch": 0.012049795685640977, + "grad_norm": 1.7278090085604565, + "kl": 0.076171875, + "learning_rate": 9.996417821225816e-07, + "loss": 0.0031, + "reward": 2.0562500953674316, + "reward_std": 0.2243637591600418, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 634, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 445.4750061035156, + "epoch": 0.012068801672526846, + "grad_norm": 2.0969020755065686, + "kl": 0.08203125, + "learning_rate": 9.996406513420286e-07, + "loss": 0.0033, + "reward": 1.8986866474151611, + "reward_std": 0.16078971326351166, + "rewards/accuracy_reward": 0.6424368023872375, + "rewards/format_reward": 1.0, + "step": 635, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 417.125, + "epoch": 0.012087807659412714, + "grad_norm": 2.5056519956756445, + "kl": 0.0849609375, + "learning_rate": 9.996395187801704e-07, + "loss": 0.0034, + "reward": 1.6768711805343628, + "reward_std": 0.26196733117103577, + "rewards/accuracy_reward": 0.616871178150177, + "rewards/format_reward": 1.0, + "step": 636, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 438.1000061035156, + "epoch": 0.012106813646298583, + "grad_norm": 1.4188254461964447, + "kl": 0.087890625, + "learning_rate": 9.99638384437011e-07, + "loss": 0.0035, + "reward": 2.0, + "reward_std": 0.04963252693414688, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 637, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 425.57501220703125, + "epoch": 0.012125819633184452, + "grad_norm": 1.5734732496071313, + "kl": 0.091796875, + "learning_rate": 9.996372483125545e-07, + "loss": 0.0037, + "reward": 1.7487499713897705, + "reward_std": 0.02397349290549755, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 638, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 383.0500183105469, + "epoch": 0.012144825620070323, + "grad_norm": 2.1373953876716247, + "kl": 0.083984375, + "learning_rate": 9.996361104068046e-07, + "loss": 0.0034, + "reward": 2.0375001430511475, + "reward_std": 0.13431942462921143, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 639, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 454.7250061035156, + "epoch": 0.012163831606956192, + "grad_norm": 2.934219298783286, + "kl": 0.06201171875, + "learning_rate": 9.996349707197658e-07, + "loss": 0.0025, + "reward": 1.9974462985992432, + "reward_std": 0.04825620725750923, + "rewards/accuracy_reward": 0.8361961245536804, + "rewards/format_reward": 1.0, + "step": 640, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 455.0249938964844, + "epoch": 0.01218283759384206, + "grad_norm": 4.915724140240582, + "kl": 0.0703125, + "learning_rate": 9.996338292514417e-07, + "loss": 0.0028, + "reward": 1.905596375465393, + "reward_std": 0.28087708353996277, + "rewards/accuracy_reward": 0.8243463635444641, + "rewards/format_reward": 0.949999988079071, + "step": 641, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 436.125, + "epoch": 0.01220184358072793, + "grad_norm": 1.7731350972977515, + "kl": 0.09375, + "learning_rate": 9.996326860018367e-07, + "loss": 0.0037, + "reward": 1.9500000476837158, + "reward_std": 0.389931857585907, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 642, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 438.3500061035156, + "epoch": 0.012220849567613799, + "grad_norm": 2.0456006068500403, + "kl": 0.0654296875, + "learning_rate": 9.99631540970955e-07, + "loss": 0.0026, + "reward": 1.8209278583526611, + "reward_std": 0.04710244759917259, + "rewards/accuracy_reward": 0.6621779799461365, + "rewards/format_reward": 1.0, + "step": 643, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 437.0500183105469, + "epoch": 0.012239855554499668, + "grad_norm": 1.864468504138235, + "kl": 0.0869140625, + "learning_rate": 9.996303941588001e-07, + "loss": 0.0035, + "reward": 1.8938648700714111, + "reward_std": 0.04810573533177376, + "rewards/accuracy_reward": 0.6726149916648865, + "rewards/format_reward": 1.0, + "step": 644, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 460.2250061035156, + "epoch": 0.012258861541385536, + "grad_norm": 2.7097819212031315, + "kl": 0.0732421875, + "learning_rate": 9.996292455653765e-07, + "loss": 0.0029, + "reward": 1.5883558988571167, + "reward_std": 0.03470195457339287, + "rewards/accuracy_reward": 0.43960580229759216, + "rewards/format_reward": 1.0, + "step": 645, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 417.7749938964844, + "epoch": 0.012277867528271405, + "grad_norm": 2.0277230704655484, + "kl": 0.1044921875, + "learning_rate": 9.996280951906884e-07, + "loss": 0.0042, + "reward": 2.0450000762939453, + "reward_std": 0.10080788284540176, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 646, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 452.95001220703125, + "epoch": 0.012296873515157274, + "grad_norm": 1.8869256961406453, + "kl": 0.0693359375, + "learning_rate": 9.996269430347397e-07, + "loss": 0.0028, + "reward": 1.651249885559082, + "reward_std": 0.24141600728034973, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 1.0, + "step": 647, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 455.7250061035156, + "epoch": 0.012315879502043143, + "grad_norm": 5.339584858474898, + "kl": 0.06494140625, + "learning_rate": 9.996257890975348e-07, + "loss": 0.0026, + "reward": 1.7776453495025635, + "reward_std": 0.08328854292631149, + "rewards/accuracy_reward": 0.651395320892334, + "rewards/format_reward": 1.0, + "step": 648, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 444.5, + "epoch": 0.012334885488929012, + "grad_norm": 1.9100516400642031, + "kl": 0.10302734375, + "learning_rate": 9.996246333790773e-07, + "loss": 0.0041, + "reward": 1.7387501001358032, + "reward_std": 0.03230414167046547, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 649, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 453.6000061035156, + "epoch": 0.012353891475814881, + "grad_norm": 1.8411640564675906, + "kl": 0.08203125, + "learning_rate": 9.996234758793719e-07, + "loss": 0.0033, + "reward": 1.9427776336669922, + "reward_std": 0.13237065076828003, + "rewards/accuracy_reward": 0.7477777004241943, + "rewards/format_reward": 1.0, + "step": 650, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 470.7250061035156, + "epoch": 0.01237289746270075, + "grad_norm": 1.457761360473391, + "kl": 0.08837890625, + "learning_rate": 9.996223165984222e-07, + "loss": 0.0035, + "reward": 1.8125, + "reward_std": 0.28509339690208435, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 651, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 440.625, + "epoch": 0.012391903449586619, + "grad_norm": 2.048107960168898, + "kl": 0.087890625, + "learning_rate": 9.996211555362323e-07, + "loss": 0.0035, + "reward": 1.803942084312439, + "reward_std": 0.24984395503997803, + "rewards/accuracy_reward": 0.6914423108100891, + "rewards/format_reward": 1.0, + "step": 652, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 437.32501220703125, + "epoch": 0.01241090943647249, + "grad_norm": 2.2810613413674905, + "kl": 0.0849609375, + "learning_rate": 9.996199926928071e-07, + "loss": 0.0034, + "reward": 1.8767356872558594, + "reward_std": 0.1472785919904709, + "rewards/accuracy_reward": 0.8229856491088867, + "rewards/format_reward": 1.0, + "step": 653, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 450.1000061035156, + "epoch": 0.012429915423358359, + "grad_norm": 4.732046469898634, + "kl": 0.087890625, + "learning_rate": 9.9961882806815e-07, + "loss": 0.0035, + "reward": 1.9523528814315796, + "reward_std": 0.04254266619682312, + "rewards/accuracy_reward": 0.8073530197143555, + "rewards/format_reward": 1.0, + "step": 654, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 413.6000061035156, + "epoch": 0.012448921410244227, + "grad_norm": 1.6900681576111625, + "kl": 0.07958984375, + "learning_rate": 9.996176616622653e-07, + "loss": 0.0032, + "reward": 1.8049999475479126, + "reward_std": 0.3436048626899719, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 655, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 461.70001220703125, + "epoch": 0.012467927397130096, + "grad_norm": 2.5144251096286796, + "kl": 0.10009765625, + "learning_rate": 9.996164934751575e-07, + "loss": 0.004, + "reward": 2.023566961288452, + "reward_std": 0.13137094676494598, + "rewards/accuracy_reward": 0.8110671043395996, + "rewards/format_reward": 1.0, + "step": 656, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 469.1750183105469, + "epoch": 0.012486933384015965, + "grad_norm": 1.5470058910709186, + "kl": 0.048828125, + "learning_rate": 9.996153235068303e-07, + "loss": 0.0019, + "reward": 1.8545799255371094, + "reward_std": 0.12340261787176132, + "rewards/accuracy_reward": 0.8495798110961914, + "rewards/format_reward": 1.0, + "step": 657, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 467.1499938964844, + "epoch": 0.012505939370901834, + "grad_norm": 2.1737929252088724, + "kl": 0.111328125, + "learning_rate": 9.996141517572882e-07, + "loss": 0.0045, + "reward": 2.06499981880188, + "reward_std": 0.048381078988313675, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 658, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 418.3000183105469, + "epoch": 0.012524945357787703, + "grad_norm": 1.737305405881958, + "kl": 0.060791015625, + "learning_rate": 9.996129782265354e-07, + "loss": 0.0024, + "reward": 1.751460313796997, + "reward_std": 0.026737544685602188, + "rewards/accuracy_reward": 0.6464601755142212, + "rewards/format_reward": 1.0, + "step": 659, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 451.57501220703125, + "epoch": 0.012543951344673572, + "grad_norm": 2.0080142630810327, + "kl": 0.09130859375, + "learning_rate": 9.996118029145757e-07, + "loss": 0.0036, + "reward": 2.202500104904175, + "reward_std": 0.12062938511371613, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 660, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 445.95001220703125, + "epoch": 0.012562957331559441, + "grad_norm": 2.335174355328368, + "kl": 0.076171875, + "learning_rate": 9.996106258214138e-07, + "loss": 0.003, + "reward": 1.9196529388427734, + "reward_std": 0.14973393082618713, + "rewards/accuracy_reward": 0.713403046131134, + "rewards/format_reward": 1.0, + "step": 661, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 437.07501220703125, + "epoch": 0.01258196331844531, + "grad_norm": 1.7604136242117012, + "kl": 0.09521484375, + "learning_rate": 9.996094469470534e-07, + "loss": 0.0038, + "reward": 1.7487499713897705, + "reward_std": 0.0325133316218853, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 662, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 445.5249938964844, + "epoch": 0.012600969305331179, + "grad_norm": 1.9687321204089707, + "kl": 0.0869140625, + "learning_rate": 9.99608266291499e-07, + "loss": 0.0035, + "reward": 1.7225693464279175, + "reward_std": 0.11156398057937622, + "rewards/accuracy_reward": 0.5750694274902344, + "rewards/format_reward": 1.0, + "step": 663, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 455.25, + "epoch": 0.012619975292217048, + "grad_norm": 2.047854068258095, + "kl": 0.1005859375, + "learning_rate": 9.996070838547548e-07, + "loss": 0.004, + "reward": 2.004999876022339, + "reward_std": 0.05544399842619896, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 664, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 438.1000061035156, + "epoch": 0.012638981279102917, + "grad_norm": 5.169949131914374, + "kl": 0.08203125, + "learning_rate": 9.99605899636825e-07, + "loss": 0.0033, + "reward": 1.960333228111267, + "reward_std": 0.162943035364151, + "rewards/accuracy_reward": 0.861583411693573, + "rewards/format_reward": 1.0, + "step": 665, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 461.7250061035156, + "epoch": 0.012657987265988787, + "grad_norm": 2.463152764732014, + "kl": 0.099609375, + "learning_rate": 9.99604713637714e-07, + "loss": 0.004, + "reward": 1.9513797760009766, + "reward_std": 0.25893664360046387, + "rewards/accuracy_reward": 0.8126299977302551, + "rewards/format_reward": 1.0, + "step": 666, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 464.3000183105469, + "epoch": 0.012676993252874656, + "grad_norm": 2.4680220945409133, + "kl": 0.06982421875, + "learning_rate": 9.996035258574253e-07, + "loss": 0.0028, + "reward": 1.8019866943359375, + "reward_std": 0.18856680393218994, + "rewards/accuracy_reward": 0.6719867587089539, + "rewards/format_reward": 1.0, + "step": 667, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 403.5500183105469, + "epoch": 0.012695999239760525, + "grad_norm": 1.7906531481507488, + "kl": 0.060546875, + "learning_rate": 9.99602336295964e-07, + "loss": 0.0024, + "reward": 1.6636707782745361, + "reward_std": 0.24025700986385345, + "rewards/accuracy_reward": 0.6261708736419678, + "rewards/format_reward": 1.0, + "step": 668, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 455.125, + "epoch": 0.012715005226646394, + "grad_norm": 1.3604911153138315, + "kl": 0.08935546875, + "learning_rate": 9.99601144953334e-07, + "loss": 0.0036, + "reward": 2.015000104904175, + "reward_std": 0.035699550062417984, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 669, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 442.0500183105469, + "epoch": 0.012734011213532263, + "grad_norm": 1.6110022902882712, + "kl": 0.045654296875, + "learning_rate": 9.995999518295395e-07, + "loss": 0.0018, + "reward": 1.787500023841858, + "reward_std": 0.25540170073509216, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 670, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 438.9750061035156, + "epoch": 0.012753017200418132, + "grad_norm": 1.5515349711701059, + "kl": 0.0625, + "learning_rate": 9.995987569245848e-07, + "loss": 0.0025, + "reward": 1.6841137409210205, + "reward_std": 0.04847750440239906, + "rewards/accuracy_reward": 0.5491136908531189, + "rewards/format_reward": 1.0, + "step": 671, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 438.0249938964844, + "epoch": 0.012772023187304, + "grad_norm": 2.391331011104615, + "kl": 0.07421875, + "learning_rate": 9.99597560238474e-07, + "loss": 0.003, + "reward": 1.9480408430099487, + "reward_std": 0.12720969319343567, + "rewards/accuracy_reward": 0.7405409216880798, + "rewards/format_reward": 1.0, + "step": 672, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 382.6000061035156, + "epoch": 0.01279102917418987, + "grad_norm": 2.0372452827805874, + "kl": 0.0654296875, + "learning_rate": 9.995963617712116e-07, + "loss": 0.0026, + "reward": 1.7572059631347656, + "reward_std": 0.07543417066335678, + "rewards/accuracy_reward": 0.6547058820724487, + "rewards/format_reward": 1.0, + "step": 673, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 432.3000183105469, + "epoch": 0.012810035161075739, + "grad_norm": 1.3425902262789746, + "kl": 0.06787109375, + "learning_rate": 9.99595161522802e-07, + "loss": 0.0027, + "reward": 1.813750147819519, + "reward_std": 0.09199460595846176, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 674, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 427.75, + "epoch": 0.012829041147961608, + "grad_norm": 1.6300342526398393, + "kl": 0.087890625, + "learning_rate": 9.99593959493249e-07, + "loss": 0.0035, + "reward": 1.9650001525878906, + "reward_std": 0.02449488826096058, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 675, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 446.6750183105469, + "epoch": 0.012848047134847477, + "grad_norm": 2.8352253222956687, + "kl": 0.0634765625, + "learning_rate": 9.995927556825572e-07, + "loss": 0.0025, + "reward": 1.5192979574203491, + "reward_std": 0.25122472643852234, + "rewards/accuracy_reward": 0.40304800868034363, + "rewards/format_reward": 1.0, + "step": 676, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 446.5249938964844, + "epoch": 0.012867053121733345, + "grad_norm": 1.5411355002220397, + "kl": 0.0703125, + "learning_rate": 9.99591550090731e-07, + "loss": 0.0028, + "reward": 1.7590769529342651, + "reward_std": 0.29475030303001404, + "rewards/accuracy_reward": 0.719076931476593, + "rewards/format_reward": 1.0, + "step": 677, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 445.3000183105469, + "epoch": 0.012886059108619214, + "grad_norm": 2.266135144032741, + "kl": 0.0732421875, + "learning_rate": 9.995903427177743e-07, + "loss": 0.0029, + "reward": 1.6436678171157837, + "reward_std": 0.09614302963018417, + "rewards/accuracy_reward": 0.4949178695678711, + "rewards/format_reward": 1.0, + "step": 678, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 474.32501220703125, + "epoch": 0.012905065095505083, + "grad_norm": 1.5861619437714614, + "kl": 0.107421875, + "learning_rate": 9.99589133563692e-07, + "loss": 0.0043, + "reward": 1.4812500476837158, + "reward_std": 0.35283902287483215, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 0.949999988079071, + "step": 679, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 468.7250061035156, + "epoch": 0.012924071082390954, + "grad_norm": 2.1824498112310957, + "kl": 0.1103515625, + "learning_rate": 9.995879226284878e-07, + "loss": 0.0044, + "reward": 2.0587499141693115, + "reward_std": 0.13940279185771942, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 1.0, + "step": 680, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 476.8000183105469, + "epoch": 0.012943077069276823, + "grad_norm": 1.419429883664164, + "kl": 0.115234375, + "learning_rate": 9.995867099121663e-07, + "loss": 0.0046, + "reward": 1.7587498426437378, + "reward_std": 0.11715694516897202, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 681, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 445.0500183105469, + "epoch": 0.012962083056162692, + "grad_norm": 4.968383152055369, + "kl": 0.111328125, + "learning_rate": 9.995854954147318e-07, + "loss": 0.0045, + "reward": 1.8493388891220093, + "reward_std": 0.17193397879600525, + "rewards/accuracy_reward": 0.6318390965461731, + "rewards/format_reward": 1.0, + "step": 682, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 439.1750183105469, + "epoch": 0.01298108904304856, + "grad_norm": 1.7905287798967389, + "kl": 0.08837890625, + "learning_rate": 9.995842791361889e-07, + "loss": 0.0035, + "reward": 1.712389349937439, + "reward_std": 0.06312983483076096, + "rewards/accuracy_reward": 0.4986395537853241, + "rewards/format_reward": 1.0, + "step": 683, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 488.0500183105469, + "epoch": 0.01300009502993443, + "grad_norm": 3.3361165259050787, + "kl": 0.10400390625, + "learning_rate": 9.995830610765413e-07, + "loss": 0.0042, + "reward": 2.029280424118042, + "reward_std": 0.2020348608493805, + "rewards/accuracy_reward": 0.8017805218696594, + "rewards/format_reward": 0.9750000238418579, + "step": 684, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 455.875, + "epoch": 0.013019101016820299, + "grad_norm": 1.6627613233285068, + "kl": 0.11181640625, + "learning_rate": 9.995818412357939e-07, + "loss": 0.0045, + "reward": 2.078749895095825, + "reward_std": 0.12405641376972198, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 1.0, + "step": 685, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 481.5, + "epoch": 0.013038107003706167, + "grad_norm": 2.147588045489399, + "kl": 0.08447265625, + "learning_rate": 9.99580619613951e-07, + "loss": 0.0034, + "reward": 1.9386663436889648, + "reward_std": 0.17508693039417267, + "rewards/accuracy_reward": 0.8549163937568665, + "rewards/format_reward": 1.0, + "step": 686, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 486.1750183105469, + "epoch": 0.013057112990592036, + "grad_norm": 1.6460557367558475, + "kl": 0.11474609375, + "learning_rate": 9.995793962110164e-07, + "loss": 0.0046, + "reward": 1.7737499475479126, + "reward_std": 0.07152249664068222, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 687, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 487.2749938964844, + "epoch": 0.013076118977477905, + "grad_norm": 2.9424336084366276, + "kl": 0.10498046875, + "learning_rate": 9.995781710269952e-07, + "loss": 0.0042, + "reward": 1.6504096984863281, + "reward_std": 0.32433828711509705, + "rewards/accuracy_reward": 0.6016597151756287, + "rewards/format_reward": 0.949999988079071, + "step": 688, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 488.6499938964844, + "epoch": 0.013095124964363774, + "grad_norm": 1.3859886241595234, + "kl": 0.1025390625, + "learning_rate": 9.995769440618914e-07, + "loss": 0.0041, + "reward": 1.5981104373931885, + "reward_std": 0.3493271470069885, + "rewards/accuracy_reward": 0.5643603801727295, + "rewards/format_reward": 0.9750000238418579, + "step": 689, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 459.75, + "epoch": 0.013114130951249643, + "grad_norm": 1.5592672984917082, + "kl": 0.1044921875, + "learning_rate": 9.995757153157093e-07, + "loss": 0.0042, + "reward": 2.1262500286102295, + "reward_std": 0.14376184344291687, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 690, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 487.5, + "epoch": 0.013133136938135512, + "grad_norm": 1.9335252465138564, + "kl": 0.1279296875, + "learning_rate": 9.995744847884535e-07, + "loss": 0.0051, + "reward": 2.0260372161865234, + "reward_std": 0.1518925577402115, + "rewards/accuracy_reward": 0.7647872567176819, + "rewards/format_reward": 1.0, + "step": 691, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 471.3000183105469, + "epoch": 0.013152142925021381, + "grad_norm": 1.6032090227902767, + "kl": 0.1318359375, + "learning_rate": 9.995732524801284e-07, + "loss": 0.0053, + "reward": 1.7048267126083374, + "reward_std": 0.19036322832107544, + "rewards/accuracy_reward": 0.5773269534111023, + "rewards/format_reward": 0.9750000238418579, + "step": 692, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 470.375, + "epoch": 0.01317114891190725, + "grad_norm": 1.7611384344022971, + "kl": 0.11474609375, + "learning_rate": 9.99572018390738e-07, + "loss": 0.0046, + "reward": 1.9498538970947266, + "reward_std": 0.25558096170425415, + "rewards/accuracy_reward": 0.8211038708686829, + "rewards/format_reward": 0.9750000238418579, + "step": 693, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 434.6750183105469, + "epoch": 0.01319015489879312, + "grad_norm": 1.4777802098925599, + "kl": 0.0947265625, + "learning_rate": 9.995707825202873e-07, + "loss": 0.0038, + "reward": 1.3722171783447266, + "reward_std": 0.1442262977361679, + "rewards/accuracy_reward": 0.3622172772884369, + "rewards/format_reward": 1.0, + "step": 694, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 467.625, + "epoch": 0.01320916088567899, + "grad_norm": 1.5567855319393067, + "kl": 0.10009765625, + "learning_rate": 9.995695448687803e-07, + "loss": 0.004, + "reward": 2.016423225402832, + "reward_std": 0.05085080862045288, + "rewards/accuracy_reward": 0.8601731657981873, + "rewards/format_reward": 1.0, + "step": 695, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 450.4250183105469, + "epoch": 0.013228166872564858, + "grad_norm": 1.8308404877685869, + "kl": 0.0791015625, + "learning_rate": 9.995683054362214e-07, + "loss": 0.0032, + "reward": 1.8470677137374878, + "reward_std": 0.1592775285243988, + "rewards/accuracy_reward": 0.6845678687095642, + "rewards/format_reward": 1.0, + "step": 696, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 430.4250183105469, + "epoch": 0.013247172859450727, + "grad_norm": 1.8005562681832192, + "kl": 0.10888671875, + "learning_rate": 9.99567064222615e-07, + "loss": 0.0044, + "reward": 1.8144510984420776, + "reward_std": 0.1525733917951584, + "rewards/accuracy_reward": 0.6607011556625366, + "rewards/format_reward": 1.0, + "step": 697, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 449.8999938964844, + "epoch": 0.013266178846336596, + "grad_norm": 1.6268096392218965, + "kl": 0.0927734375, + "learning_rate": 9.99565821227966e-07, + "loss": 0.0037, + "reward": 1.8695858716964722, + "reward_std": 0.03064870275557041, + "rewards/accuracy_reward": 0.6445857882499695, + "rewards/format_reward": 1.0, + "step": 698, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 396.4750061035156, + "epoch": 0.013285184833222465, + "grad_norm": 1.5353147209699973, + "kl": 0.08740234375, + "learning_rate": 9.995645764522783e-07, + "loss": 0.0035, + "reward": 1.558989405632019, + "reward_std": 0.033408764749765396, + "rewards/accuracy_reward": 0.4802393615245819, + "rewards/format_reward": 1.0, + "step": 699, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 450.45001220703125, + "epoch": 0.013304190820108334, + "grad_norm": 8.411435210594698, + "kl": 0.08447265625, + "learning_rate": 9.995633298955563e-07, + "loss": 0.0034, + "reward": 1.4234392642974854, + "reward_std": 0.09718119353055954, + "rewards/accuracy_reward": 0.3634392023086548, + "rewards/format_reward": 1.0, + "step": 700, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 442.6499938964844, + "epoch": 0.013323196806994203, + "grad_norm": 1.561761211920665, + "kl": 0.0712890625, + "learning_rate": 9.99562081557805e-07, + "loss": 0.0029, + "reward": 1.8637498617172241, + "reward_std": 0.11404251307249069, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 701, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 423.1499938964844, + "epoch": 0.013342202793880072, + "grad_norm": 1.7734278607055687, + "kl": 0.068359375, + "learning_rate": 9.995608314390282e-07, + "loss": 0.0027, + "reward": 1.7347859144210815, + "reward_std": 0.20102575421333313, + "rewards/accuracy_reward": 0.6360358595848083, + "rewards/format_reward": 1.0, + "step": 702, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 414.7749938964844, + "epoch": 0.013361208780765941, + "grad_norm": 1.500602105385007, + "kl": 0.10888671875, + "learning_rate": 9.995595795392309e-07, + "loss": 0.0043, + "reward": 1.872756004333496, + "reward_std": 0.04351817071437836, + "rewards/accuracy_reward": 0.6527560949325562, + "rewards/format_reward": 1.0, + "step": 703, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 401.1750183105469, + "epoch": 0.01338021476765181, + "grad_norm": 2.457688062196613, + "kl": 0.09326171875, + "learning_rate": 9.995583258584173e-07, + "loss": 0.0037, + "reward": 1.9233334064483643, + "reward_std": 0.1413118690252304, + "rewards/accuracy_reward": 0.7583333849906921, + "rewards/format_reward": 1.0, + "step": 704, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 441.375, + "epoch": 0.013399220754537679, + "grad_norm": 2.159684717658479, + "kl": 0.056396484375, + "learning_rate": 9.995570703965919e-07, + "loss": 0.0023, + "reward": 1.666308045387268, + "reward_std": 0.15317901968955994, + "rewards/accuracy_reward": 0.5713080763816833, + "rewards/format_reward": 1.0, + "step": 705, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 417.1000061035156, + "epoch": 0.013418226741423548, + "grad_norm": 1.8305413335876235, + "kl": 0.10888671875, + "learning_rate": 9.99555813153759e-07, + "loss": 0.0043, + "reward": 1.7286853790283203, + "reward_std": 0.15412406623363495, + "rewards/accuracy_reward": 0.516185462474823, + "rewards/format_reward": 1.0, + "step": 706, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 412.6000061035156, + "epoch": 0.013437232728309418, + "grad_norm": 1.6703440186537377, + "kl": 0.087890625, + "learning_rate": 9.995545541299234e-07, + "loss": 0.0035, + "reward": 1.998673677444458, + "reward_std": 0.0639389157295227, + "rewards/accuracy_reward": 0.8824234008789062, + "rewards/format_reward": 1.0, + "step": 707, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 467.5, + "epoch": 0.013456238715195287, + "grad_norm": 1.683539870395896, + "kl": 0.09375, + "learning_rate": 9.995532933250893e-07, + "loss": 0.0037, + "reward": 2.0198028087615967, + "reward_std": 0.05851763114333153, + "rewards/accuracy_reward": 0.8735527396202087, + "rewards/format_reward": 1.0, + "step": 708, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 444.2250061035156, + "epoch": 0.013475244702081156, + "grad_norm": 1.7378851528712578, + "kl": 0.1123046875, + "learning_rate": 9.995520307392617e-07, + "loss": 0.0045, + "reward": 1.576685905456543, + "reward_std": 0.05492706224322319, + "rewards/accuracy_reward": 0.4429359436035156, + "rewards/format_reward": 1.0, + "step": 709, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 413.57501220703125, + "epoch": 0.013494250688967025, + "grad_norm": 1.6149188920475832, + "kl": 0.07666015625, + "learning_rate": 9.995507663724445e-07, + "loss": 0.0031, + "reward": 1.9402374029159546, + "reward_std": 0.09124945849180222, + "rewards/accuracy_reward": 0.8314873576164246, + "rewards/format_reward": 1.0, + "step": 710, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 491.9750061035156, + "epoch": 0.013513256675852894, + "grad_norm": 1.629602526732811, + "kl": 0.1337890625, + "learning_rate": 9.995495002246424e-07, + "loss": 0.0053, + "reward": 2.268749952316284, + "reward_std": 0.16989779472351074, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 1.0, + "step": 711, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 500.5500183105469, + "epoch": 0.013532262662738763, + "grad_norm": 1.2915342122524358, + "kl": 0.07861328125, + "learning_rate": 9.9954823229586e-07, + "loss": 0.0031, + "reward": 1.6649999618530273, + "reward_std": 0.4036776125431061, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 0.9000000357627869, + "step": 712, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 521.1749877929688, + "epoch": 0.013551268649624632, + "grad_norm": 1.477959756958054, + "kl": 0.1181640625, + "learning_rate": 9.99546962586102e-07, + "loss": 0.0047, + "reward": 1.9762500524520874, + "reward_std": 0.3797941505908966, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 0.949999988079071, + "step": 713, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 491.6000061035156, + "epoch": 0.0135702746365105, + "grad_norm": 2.2341639964465068, + "kl": 0.10009765625, + "learning_rate": 9.995456910953723e-07, + "loss": 0.004, + "reward": 1.7212860584259033, + "reward_std": 0.11174527555704117, + "rewards/accuracy_reward": 0.6525360941886902, + "rewards/format_reward": 1.0, + "step": 714, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 514.7000122070312, + "epoch": 0.01358928062339637, + "grad_norm": 1.6864523396092914, + "kl": 0.0908203125, + "learning_rate": 9.99544417823676e-07, + "loss": 0.0036, + "reward": 1.6670664548873901, + "reward_std": 0.26193180680274963, + "rewards/accuracy_reward": 0.6383164525032043, + "rewards/format_reward": 1.0, + "step": 715, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 514.9750366210938, + "epoch": 0.013608286610282239, + "grad_norm": 1.5579047563085828, + "kl": 0.09423828125, + "learning_rate": 9.995431427710176e-07, + "loss": 0.0038, + "reward": 1.6074436902999878, + "reward_std": 0.17465007305145264, + "rewards/accuracy_reward": 0.6824435591697693, + "rewards/format_reward": 0.824999988079071, + "step": 716, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 493.9250183105469, + "epoch": 0.013627292597168108, + "grad_norm": 1.344428379186121, + "kl": 0.095703125, + "learning_rate": 9.995418659374015e-07, + "loss": 0.0038, + "reward": 1.6413357257843018, + "reward_std": 0.13948741555213928, + "rewards/accuracy_reward": 0.5838356018066406, + "rewards/format_reward": 1.0, + "step": 717, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 475.0249938964844, + "epoch": 0.013646298584053976, + "grad_norm": 1.5545225116049264, + "kl": 0.08984375, + "learning_rate": 9.995405873228323e-07, + "loss": 0.0036, + "reward": 1.6741666793823242, + "reward_std": 0.17087028920650482, + "rewards/accuracy_reward": 0.6166667342185974, + "rewards/format_reward": 1.0, + "step": 718, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 465.70001220703125, + "epoch": 0.013665304570939845, + "grad_norm": 1.7208840275761956, + "kl": 0.11279296875, + "learning_rate": 9.995393069273145e-07, + "loss": 0.0045, + "reward": 1.7095832824707031, + "reward_std": 0.1775379627943039, + "rewards/accuracy_reward": 0.6195833683013916, + "rewards/format_reward": 0.949999988079071, + "step": 719, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 463.3000183105469, + "epoch": 0.013684310557825714, + "grad_norm": 1.914289754729987, + "kl": 0.10107421875, + "learning_rate": 9.995380247508528e-07, + "loss": 0.004, + "reward": 1.944087266921997, + "reward_std": 0.1626613438129425, + "rewards/accuracy_reward": 0.7515873312950134, + "rewards/format_reward": 1.0, + "step": 720, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 471.57501220703125, + "epoch": 0.013703316544711585, + "grad_norm": 2.187028513443578, + "kl": 0.12255859375, + "learning_rate": 9.995367407934513e-07, + "loss": 0.0049, + "reward": 1.9500000476837158, + "reward_std": 0.03842785581946373, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 721, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 446.75, + "epoch": 0.013722322531597454, + "grad_norm": 2.083046307719133, + "kl": 0.08203125, + "learning_rate": 9.995354550551155e-07, + "loss": 0.0033, + "reward": 1.7972339391708374, + "reward_std": 0.25385886430740356, + "rewards/accuracy_reward": 0.6409839987754822, + "rewards/format_reward": 0.949999988079071, + "step": 722, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 430.9750061035156, + "epoch": 0.013741328518483323, + "grad_norm": 1.8906994238501933, + "kl": 0.0908203125, + "learning_rate": 9.99534167535849e-07, + "loss": 0.0036, + "reward": 1.6831945180892944, + "reward_std": 0.06170584633946419, + "rewards/accuracy_reward": 0.6194444298744202, + "rewards/format_reward": 1.0, + "step": 723, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 424.1750183105469, + "epoch": 0.013760334505369192, + "grad_norm": 2.8193160405494675, + "kl": 0.056884765625, + "learning_rate": 9.99532878235657e-07, + "loss": 0.0023, + "reward": 1.7075878381729126, + "reward_std": 0.2339305430650711, + "rewards/accuracy_reward": 0.6125877499580383, + "rewards/format_reward": 1.0, + "step": 724, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 430.3999938964844, + "epoch": 0.01377934049225506, + "grad_norm": 1.5906748419733205, + "kl": 0.06494140625, + "learning_rate": 9.99531587154544e-07, + "loss": 0.0026, + "reward": 1.7350000143051147, + "reward_std": 0.2571442723274231, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 0.949999988079071, + "step": 725, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 414.875, + "epoch": 0.01379834647914093, + "grad_norm": 1.9145714963599305, + "kl": 0.1015625, + "learning_rate": 9.995302942925143e-07, + "loss": 0.0041, + "reward": 1.9920570850372314, + "reward_std": 0.24692554771900177, + "rewards/accuracy_reward": 0.8295570611953735, + "rewards/format_reward": 1.0, + "step": 726, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 396.5, + "epoch": 0.013817352466026799, + "grad_norm": 2.034283441537553, + "kl": 0.0673828125, + "learning_rate": 9.99528999649573e-07, + "loss": 0.0027, + "reward": 1.5793137550354004, + "reward_std": 0.028951624408364296, + "rewards/accuracy_reward": 0.48556381464004517, + "rewards/format_reward": 1.0, + "step": 727, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 409.1499938964844, + "epoch": 0.013836358452912667, + "grad_norm": 1.9427356544075842, + "kl": 0.09716796875, + "learning_rate": 9.995277032257243e-07, + "loss": 0.0039, + "reward": 2.0037500858306885, + "reward_std": 0.16830992698669434, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 1.0, + "step": 728, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 395.32501220703125, + "epoch": 0.013855364439798536, + "grad_norm": 1.5906150597472941, + "kl": 0.06689453125, + "learning_rate": 9.995264050209728e-07, + "loss": 0.0027, + "reward": 1.931249976158142, + "reward_std": 0.24466462433338165, + "rewards/accuracy_reward": 0.8324999809265137, + "rewards/format_reward": 1.0, + "step": 729, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 446.4250183105469, + "epoch": 0.013874370426684405, + "grad_norm": 2.359530532053771, + "kl": 0.1005859375, + "learning_rate": 9.995251050353236e-07, + "loss": 0.004, + "reward": 1.9478927850723267, + "reward_std": 0.04787446931004524, + "rewards/accuracy_reward": 0.7928928732872009, + "rewards/format_reward": 1.0, + "step": 730, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 419.07501220703125, + "epoch": 0.013893376413570274, + "grad_norm": 1.734433346991532, + "kl": 0.09521484375, + "learning_rate": 9.995238032687809e-07, + "loss": 0.0038, + "reward": 1.693750023841858, + "reward_std": 0.01837114803493023, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 731, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 470.3500061035156, + "epoch": 0.013912382400456143, + "grad_norm": 1.536544196243968, + "kl": 0.09619140625, + "learning_rate": 9.995224997213497e-07, + "loss": 0.0038, + "reward": 1.7087501287460327, + "reward_std": 0.2257985919713974, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 732, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 478.0500183105469, + "epoch": 0.013931388387342012, + "grad_norm": 1.653609333566446, + "kl": 0.13671875, + "learning_rate": 9.995211943930342e-07, + "loss": 0.0055, + "reward": 2.0799999237060547, + "reward_std": 0.03009720705449581, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 733, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 475.20001220703125, + "epoch": 0.013950394374227881, + "grad_norm": 1.5542723449274536, + "kl": 0.1298828125, + "learning_rate": 9.995198872838392e-07, + "loss": 0.0052, + "reward": 2.231250047683716, + "reward_std": 0.03622090816497803, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 734, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 509.1750183105469, + "epoch": 0.013969400361113752, + "grad_norm": 1.37641121629186, + "kl": 0.11572265625, + "learning_rate": 9.995185783937697e-07, + "loss": 0.0046, + "reward": 1.529911994934082, + "reward_std": 0.13772501051425934, + "rewards/accuracy_reward": 0.5061619877815247, + "rewards/format_reward": 0.9750000238418579, + "step": 735, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 502.1000061035156, + "epoch": 0.01398840634799962, + "grad_norm": 3.8174721207345197, + "kl": 0.09765625, + "learning_rate": 9.9951726772283e-07, + "loss": 0.0039, + "reward": 1.6628608703613281, + "reward_std": 0.316476970911026, + "rewards/accuracy_reward": 0.731610894203186, + "rewards/format_reward": 0.9000000357627869, + "step": 736, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 496.4250183105469, + "epoch": 0.01400741233488549, + "grad_norm": 1.7831031299435494, + "kl": 0.1298828125, + "learning_rate": 9.995159552710246e-07, + "loss": 0.0052, + "reward": 1.7787500619888306, + "reward_std": 0.2259797602891922, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 737, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 452.6000061035156, + "epoch": 0.014026418321771358, + "grad_norm": 1.544603820218446, + "kl": 0.1279296875, + "learning_rate": 9.995146410383587e-07, + "loss": 0.0051, + "reward": 1.5206249952316284, + "reward_std": 0.24276505410671234, + "rewards/accuracy_reward": 0.4593749940395355, + "rewards/format_reward": 1.0, + "step": 738, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 531.9500122070312, + "epoch": 0.014045424308657227, + "grad_norm": 1.788567454252881, + "kl": 0.10107421875, + "learning_rate": 9.995133250248368e-07, + "loss": 0.004, + "reward": 1.1032320261001587, + "reward_std": 0.49773478507995605, + "rewards/accuracy_reward": 0.2544820308685303, + "rewards/format_reward": 0.875, + "step": 739, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 477.1750183105469, + "epoch": 0.014064430295543096, + "grad_norm": 1.7441095899233134, + "kl": 0.11083984375, + "learning_rate": 9.995120072304634e-07, + "loss": 0.0044, + "reward": 1.3567793369293213, + "reward_std": 0.15901394188404083, + "rewards/accuracy_reward": 0.30427926778793335, + "rewards/format_reward": 1.0, + "step": 740, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.6, + "completion_length": 488.1750183105469, + "epoch": 0.014083436282428965, + "grad_norm": 1.3004473374352874, + "kl": 0.140625, + "learning_rate": 9.995106876552436e-07, + "loss": 0.0056, + "reward": 1.2024999856948853, + "reward_std": 0.2259700745344162, + "rewards/accuracy_reward": 0.20000000298023224, + "rewards/format_reward": 1.0, + "step": 741, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 498.0500183105469, + "epoch": 0.014102442269314834, + "grad_norm": 1.2445748694317842, + "kl": 0.07421875, + "learning_rate": 9.995093662991816e-07, + "loss": 0.003, + "reward": 1.6849998235702515, + "reward_std": 0.2943117022514343, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 742, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 479.45001220703125, + "epoch": 0.014121448256200703, + "grad_norm": 6.583813889317297, + "kl": 0.1474609375, + "learning_rate": 9.995080431622822e-07, + "loss": 0.0059, + "reward": 2.288750171661377, + "reward_std": 0.2106401026248932, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 1.0, + "step": 743, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 420.0249938964844, + "epoch": 0.014140454243086572, + "grad_norm": 2.0836211277274277, + "kl": 0.08349609375, + "learning_rate": 9.995067182445504e-07, + "loss": 0.0033, + "reward": 1.5436538457870483, + "reward_std": 0.14448438584804535, + "rewards/accuracy_reward": 0.4761539101600647, + "rewards/format_reward": 1.0, + "step": 744, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 418.5249938964844, + "epoch": 0.01415946022997244, + "grad_norm": 6.236097500249061, + "kl": 0.091796875, + "learning_rate": 9.99505391545991e-07, + "loss": 0.0037, + "reward": 2.132500171661377, + "reward_std": 0.1318339854478836, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 745, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 471.07501220703125, + "epoch": 0.01417846621685831, + "grad_norm": 2.033070771072757, + "kl": 0.1220703125, + "learning_rate": 9.995040630666083e-07, + "loss": 0.0049, + "reward": 2.286250114440918, + "reward_std": 0.04182327538728714, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 746, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 461.875, + "epoch": 0.014197472203744179, + "grad_norm": 1.8083703913124065, + "kl": 0.09033203125, + "learning_rate": 9.995027328064074e-07, + "loss": 0.0036, + "reward": 1.4150055646896362, + "reward_std": 0.2622986137866974, + "rewards/accuracy_reward": 0.4250055253505707, + "rewards/format_reward": 0.9750000238418579, + "step": 747, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 446.8500061035156, + "epoch": 0.014216478190630048, + "grad_norm": 1.728439739270468, + "kl": 0.09033203125, + "learning_rate": 9.995014007653929e-07, + "loss": 0.0036, + "reward": 2.0140626430511475, + "reward_std": 0.31676068902015686, + "rewards/accuracy_reward": 0.895312488079071, + "rewards/format_reward": 0.9750000238418579, + "step": 748, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 477.6499938964844, + "epoch": 0.014235484177515918, + "grad_norm": 2.306479461937606, + "kl": 0.06298828125, + "learning_rate": 9.995000669435692e-07, + "loss": 0.0025, + "reward": 1.4972113370895386, + "reward_std": 0.24063192307949066, + "rewards/accuracy_reward": 0.4259612560272217, + "rewards/format_reward": 0.9750000238418579, + "step": 749, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 433.0500183105469, + "epoch": 0.014254490164401787, + "grad_norm": 3.734498611144611, + "kl": 0.076171875, + "learning_rate": 9.994987313409419e-07, + "loss": 0.0031, + "reward": 1.8204425573349, + "reward_std": 0.236294224858284, + "rewards/accuracy_reward": 0.7641925811767578, + "rewards/format_reward": 0.9750000238418579, + "step": 750, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 451.125, + "epoch": 0.014273496151287656, + "grad_norm": 2.0203727131221747, + "kl": 0.1064453125, + "learning_rate": 9.99497393957515e-07, + "loss": 0.0043, + "reward": 1.5, + "reward_std": 0.19880543649196625, + "rewards/accuracy_reward": 0.42500001192092896, + "rewards/format_reward": 1.0, + "step": 751, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 449.5500183105469, + "epoch": 0.014292502138173525, + "grad_norm": 1.9808058069268584, + "kl": 0.0751953125, + "learning_rate": 9.994960547932934e-07, + "loss": 0.003, + "reward": 1.6224838495254517, + "reward_std": 0.29527994990348816, + "rewards/accuracy_reward": 0.507483959197998, + "rewards/format_reward": 0.9750000238418579, + "step": 752, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 441.20001220703125, + "epoch": 0.014311508125059394, + "grad_norm": 1.4863333316983103, + "kl": 0.0966796875, + "learning_rate": 9.99494713848282e-07, + "loss": 0.0039, + "reward": 1.9149998426437378, + "reward_std": 0.13809554278850555, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 753, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 453.6499938964844, + "epoch": 0.014330514111945263, + "grad_norm": 1.5590016955518622, + "kl": 0.09423828125, + "learning_rate": 9.99493371122486e-07, + "loss": 0.0038, + "reward": 1.8900002241134644, + "reward_std": 0.045196861028671265, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 754, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 474.2749938964844, + "epoch": 0.014349520098831132, + "grad_norm": 1.499816752825605, + "kl": 0.0859375, + "learning_rate": 9.994920266159094e-07, + "loss": 0.0034, + "reward": 1.337070107460022, + "reward_std": 0.4382023811340332, + "rewards/accuracy_reward": 0.36957013607025146, + "rewards/format_reward": 0.9000000357627869, + "step": 755, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 460.75, + "epoch": 0.014368526085717, + "grad_norm": 1.7724661170009917, + "kl": 0.0830078125, + "learning_rate": 9.994906803285575e-07, + "loss": 0.0033, + "reward": 1.8309954404830933, + "reward_std": 0.32961222529411316, + "rewards/accuracy_reward": 0.7172453999519348, + "rewards/format_reward": 0.9750000238418579, + "step": 756, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 480.2250061035156, + "epoch": 0.01438753207260287, + "grad_norm": 1.8904261757288219, + "kl": 0.08203125, + "learning_rate": 9.99489332260435e-07, + "loss": 0.0033, + "reward": 1.7253230810165405, + "reward_std": 0.23734644055366516, + "rewards/accuracy_reward": 0.5840731859207153, + "rewards/format_reward": 1.0, + "step": 757, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 470.6750183105469, + "epoch": 0.014406538059488739, + "grad_norm": 1.5154542573522882, + "kl": 0.142578125, + "learning_rate": 9.994879824115466e-07, + "loss": 0.0057, + "reward": 1.8287498950958252, + "reward_std": 0.14298126101493835, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 758, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 461.6499938964844, + "epoch": 0.014425544046374607, + "grad_norm": 1.8544975874557557, + "kl": 0.099609375, + "learning_rate": 9.994866307818973e-07, + "loss": 0.004, + "reward": 1.8991936445236206, + "reward_std": 0.19035105407238007, + "rewards/accuracy_reward": 0.7004434466362, + "rewards/format_reward": 1.0, + "step": 759, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 454.82501220703125, + "epoch": 0.014444550033260476, + "grad_norm": 1.4462065275613183, + "kl": 0.10400390625, + "learning_rate": 9.994852773714915e-07, + "loss": 0.0042, + "reward": 1.7649999856948853, + "reward_std": 0.1486663967370987, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 760, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 484.125, + "epoch": 0.014463556020146345, + "grad_norm": 3.1441284308440065, + "kl": 0.11767578125, + "learning_rate": 9.994839221803346e-07, + "loss": 0.0047, + "reward": 1.9326947927474976, + "reward_std": 0.2921665906906128, + "rewards/accuracy_reward": 0.7714447975158691, + "rewards/format_reward": 1.0, + "step": 761, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 487.8500061035156, + "epoch": 0.014482562007032216, + "grad_norm": 3.0139285479540927, + "kl": 0.0654296875, + "learning_rate": 9.994825652084312e-07, + "loss": 0.0026, + "reward": 1.6638822555541992, + "reward_std": 0.26299145817756653, + "rewards/accuracy_reward": 0.5501323342323303, + "rewards/format_reward": 1.0, + "step": 762, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 447.8000183105469, + "epoch": 0.014501567993918085, + "grad_norm": 1.989444391148717, + "kl": 0.09423828125, + "learning_rate": 9.994812064557858e-07, + "loss": 0.0038, + "reward": 2.0299999713897705, + "reward_std": 0.13900230824947357, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 763, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 474.75, + "epoch": 0.014520573980803954, + "grad_norm": 2.6836970393642363, + "kl": 0.0791015625, + "learning_rate": 9.994798459224038e-07, + "loss": 0.0032, + "reward": 1.8350870609283447, + "reward_std": 0.15132379531860352, + "rewards/accuracy_reward": 0.6375870108604431, + "rewards/format_reward": 1.0, + "step": 764, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 497.5249938964844, + "epoch": 0.014539579967689823, + "grad_norm": 1.549667656114667, + "kl": 0.10546875, + "learning_rate": 9.994784836082896e-07, + "loss": 0.0042, + "reward": 1.7287498712539673, + "reward_std": 0.1318923383951187, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 765, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 460.1000061035156, + "epoch": 0.014558585954575692, + "grad_norm": 1.882961564104107, + "kl": 0.10595703125, + "learning_rate": 9.994771195134485e-07, + "loss": 0.0042, + "reward": 1.9885374307632446, + "reward_std": 0.07097212225198746, + "rewards/accuracy_reward": 0.7822873592376709, + "rewards/format_reward": 1.0, + "step": 766, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 497.9250183105469, + "epoch": 0.01457759194146156, + "grad_norm": 1.4813787887514824, + "kl": 0.1142578125, + "learning_rate": 9.99475753637885e-07, + "loss": 0.0046, + "reward": 2.026250123977661, + "reward_std": 0.2759816348552704, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 767, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 480.0, + "epoch": 0.01459659792834743, + "grad_norm": 1.7262944765708559, + "kl": 0.1494140625, + "learning_rate": 9.99474385981604e-07, + "loss": 0.006, + "reward": 2.271250009536743, + "reward_std": 0.052882224321365356, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 768, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 460.0, + "epoch": 0.014615603915233298, + "grad_norm": 1.5616740663610267, + "kl": 0.0810546875, + "learning_rate": 9.994730165446105e-07, + "loss": 0.0032, + "reward": 1.4993590116500854, + "reward_std": 0.20657002925872803, + "rewards/accuracy_reward": 0.4493590295314789, + "rewards/format_reward": 0.9750000238418579, + "step": 769, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 505.8500061035156, + "epoch": 0.014634609902119167, + "grad_norm": 1.4632730129406921, + "kl": 0.134765625, + "learning_rate": 9.994716453269095e-07, + "loss": 0.0054, + "reward": 1.9187500476837158, + "reward_std": 0.20924797654151917, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 770, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 482.3000183105469, + "epoch": 0.014653615889005036, + "grad_norm": 2.1424523172082552, + "kl": 0.11669921875, + "learning_rate": 9.994702723285055e-07, + "loss": 0.0047, + "reward": 2.063645839691162, + "reward_std": 0.050597209483385086, + "rewards/accuracy_reward": 0.8036457896232605, + "rewards/format_reward": 1.0, + "step": 771, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 471.125, + "epoch": 0.014672621875890905, + "grad_norm": 1.723248117910098, + "kl": 0.115234375, + "learning_rate": 9.994688975494038e-07, + "loss": 0.0046, + "reward": 2.108116865158081, + "reward_std": 0.05700944736599922, + "rewards/accuracy_reward": 0.841866672039032, + "rewards/format_reward": 1.0, + "step": 772, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 460.45001220703125, + "epoch": 0.014691627862776774, + "grad_norm": 1.5126150292434897, + "kl": 0.115234375, + "learning_rate": 9.99467520989609e-07, + "loss": 0.0046, + "reward": 1.5732500553131104, + "reward_std": 0.21708396077156067, + "rewards/accuracy_reward": 0.5057500004768372, + "rewards/format_reward": 1.0, + "step": 773, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 448.3500061035156, + "epoch": 0.014710633849662643, + "grad_norm": 1.4429782586789055, + "kl": 0.10400390625, + "learning_rate": 9.994661426491262e-07, + "loss": 0.0041, + "reward": 1.7385681867599487, + "reward_std": 0.292192280292511, + "rewards/accuracy_reward": 0.5723182559013367, + "rewards/format_reward": 1.0, + "step": 774, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 503.4250183105469, + "epoch": 0.014729639836548512, + "grad_norm": 1.3077908130981213, + "kl": 0.1416015625, + "learning_rate": 9.994647625279603e-07, + "loss": 0.0057, + "reward": 2.2612500190734863, + "reward_std": 0.0640869066119194, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 775, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 481.95001220703125, + "epoch": 0.014748645823434383, + "grad_norm": 1.3490194086212917, + "kl": 0.1484375, + "learning_rate": 9.994633806261162e-07, + "loss": 0.0059, + "reward": 1.683749794960022, + "reward_std": 0.11192484945058823, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 776, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 487.0, + "epoch": 0.014767651810320252, + "grad_norm": 1.3791951464065508, + "kl": 0.13671875, + "learning_rate": 9.994619969435986e-07, + "loss": 0.0055, + "reward": 1.7949999570846558, + "reward_std": 0.2829101085662842, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 777, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 468.1750183105469, + "epoch": 0.01478665779720612, + "grad_norm": 1.5387961756905428, + "kl": 0.11572265625, + "learning_rate": 9.994606114804128e-07, + "loss": 0.0046, + "reward": 1.8037500381469727, + "reward_std": 0.28276464343070984, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 778, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 473.25, + "epoch": 0.01480566378409199, + "grad_norm": 2.0734525343624104, + "kl": 0.146484375, + "learning_rate": 9.994592242365634e-07, + "loss": 0.0059, + "reward": 1.8637498617172241, + "reward_std": 0.15475887060165405, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 779, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 479.6499938964844, + "epoch": 0.014824669770977858, + "grad_norm": 1.3821247588442285, + "kl": 0.1201171875, + "learning_rate": 9.994578352120555e-07, + "loss": 0.0048, + "reward": 1.9819663763046265, + "reward_std": 0.06187741085886955, + "rewards/accuracy_reward": 0.8507165312767029, + "rewards/format_reward": 1.0, + "step": 780, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 503.8000183105469, + "epoch": 0.014843675757863727, + "grad_norm": 1.4582675984327689, + "kl": 0.126953125, + "learning_rate": 9.994564444068942e-07, + "loss": 0.0051, + "reward": 2.132500171661377, + "reward_std": 0.2329360991716385, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 781, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 514.6749877929688, + "epoch": 0.014862681744749596, + "grad_norm": 1.367826239836461, + "kl": 0.09521484375, + "learning_rate": 9.994550518210843e-07, + "loss": 0.0038, + "reward": 1.158756136894226, + "reward_std": 0.2638576626777649, + "rewards/accuracy_reward": 0.16750600934028625, + "rewards/format_reward": 0.9750000238418579, + "step": 782, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 466.2749938964844, + "epoch": 0.014881687731635465, + "grad_norm": 1.4429481769775017, + "kl": 0.08837890625, + "learning_rate": 9.994536574546308e-07, + "loss": 0.0035, + "reward": 1.7625430822372437, + "reward_std": 0.3283025324344635, + "rewards/accuracy_reward": 0.7387930750846863, + "rewards/format_reward": 1.0, + "step": 783, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 443.75, + "epoch": 0.014900693718521334, + "grad_norm": 1.5136080221349137, + "kl": 0.119140625, + "learning_rate": 9.994522613075387e-07, + "loss": 0.0048, + "reward": 2.048386335372925, + "reward_std": 0.2058589905500412, + "rewards/accuracy_reward": 0.8271364569664001, + "rewards/format_reward": 1.0, + "step": 784, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 440.57501220703125, + "epoch": 0.014919699705407203, + "grad_norm": 1.5032876573447234, + "kl": 0.068359375, + "learning_rate": 9.994508633798128e-07, + "loss": 0.0027, + "reward": 1.8225001096725464, + "reward_std": 0.3319225311279297, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 785, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 461.0249938964844, + "epoch": 0.014938705692293072, + "grad_norm": 2.6713461994091143, + "kl": 0.08349609375, + "learning_rate": 9.994494636714583e-07, + "loss": 0.0033, + "reward": 1.6481415033340454, + "reward_std": 0.30230990052223206, + "rewards/accuracy_reward": 0.6331415772438049, + "rewards/format_reward": 0.949999988079071, + "step": 786, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 409.5500183105469, + "epoch": 0.01495771167917894, + "grad_norm": 2.9009179538967174, + "kl": 0.1103515625, + "learning_rate": 9.9944806218248e-07, + "loss": 0.0044, + "reward": 2.1116578578948975, + "reward_std": 0.18856696784496307, + "rewards/accuracy_reward": 0.8254079222679138, + "rewards/format_reward": 1.0, + "step": 787, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 396.8999938964844, + "epoch": 0.01497671766606481, + "grad_norm": 1.3688749861320781, + "kl": 0.0869140625, + "learning_rate": 9.994466589128832e-07, + "loss": 0.0035, + "reward": 1.6337502002716064, + "reward_std": 0.27336740493774414, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 1.0, + "step": 788, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 412.8500061035156, + "epoch": 0.014995723652950679, + "grad_norm": 2.2438678795410527, + "kl": 0.0966796875, + "learning_rate": 9.994452538626726e-07, + "loss": 0.0039, + "reward": 1.8737499713897705, + "reward_std": 0.2121085226535797, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 789, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 473.07501220703125, + "epoch": 0.01501472963983655, + "grad_norm": 1.749537418576682, + "kl": 0.130859375, + "learning_rate": 9.994438470318533e-07, + "loss": 0.0052, + "reward": 2.200000047683716, + "reward_std": 0.22416846454143524, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 790, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 405.75, + "epoch": 0.015033735626722418, + "grad_norm": 1.4448913315052803, + "kl": 0.09619140625, + "learning_rate": 9.994424384204304e-07, + "loss": 0.0038, + "reward": 1.5371110439300537, + "reward_std": 0.20227237045764923, + "rewards/accuracy_reward": 0.4433610439300537, + "rewards/format_reward": 1.0, + "step": 791, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 497.1000061035156, + "epoch": 0.015052741613608287, + "grad_norm": 1.4979165703036394, + "kl": 0.1171875, + "learning_rate": 9.99441028028409e-07, + "loss": 0.0047, + "reward": 1.630000114440918, + "reward_std": 0.286091148853302, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 0.925000011920929, + "step": 792, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 461.4750061035156, + "epoch": 0.015071747600494156, + "grad_norm": 1.6179945119043353, + "kl": 0.1318359375, + "learning_rate": 9.994396158557937e-07, + "loss": 0.0053, + "reward": 1.90500009059906, + "reward_std": 0.02449488453567028, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 793, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 391.1000061035156, + "epoch": 0.015090753587380025, + "grad_norm": 4.376878225461876, + "kl": 0.06689453125, + "learning_rate": 9.9943820190259e-07, + "loss": 0.0027, + "reward": 1.518892765045166, + "reward_std": 0.16035513579845428, + "rewards/accuracy_reward": 0.4276427924633026, + "rewards/format_reward": 1.0, + "step": 794, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 447.20001220703125, + "epoch": 0.015109759574265894, + "grad_norm": 1.6290473732172044, + "kl": 0.12060546875, + "learning_rate": 9.994367861688026e-07, + "loss": 0.0048, + "reward": 1.7299998998641968, + "reward_std": 0.18592557311058044, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 795, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 450.3000183105469, + "epoch": 0.015128765561151763, + "grad_norm": 1.6960873174202618, + "kl": 0.107421875, + "learning_rate": 9.994353686544368e-07, + "loss": 0.0043, + "reward": 1.9645652770996094, + "reward_std": 0.14851684868335724, + "rewards/accuracy_reward": 0.7620654106140137, + "rewards/format_reward": 1.0, + "step": 796, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 473.2250061035156, + "epoch": 0.015147771548037632, + "grad_norm": 1.5568052726387847, + "kl": 0.125, + "learning_rate": 9.994339493594976e-07, + "loss": 0.005, + "reward": 2.1659560203552246, + "reward_std": 0.04685882478952408, + "rewards/accuracy_reward": 0.8847060203552246, + "rewards/format_reward": 1.0, + "step": 797, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 375.95001220703125, + "epoch": 0.0151667775349235, + "grad_norm": 2.441515227496874, + "kl": 0.0859375, + "learning_rate": 9.9943252828399e-07, + "loss": 0.0034, + "reward": 1.4190090894699097, + "reward_std": 0.051840174943208694, + "rewards/accuracy_reward": 0.3252590298652649, + "rewards/format_reward": 1.0, + "step": 798, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 427.25, + "epoch": 0.01518578352180937, + "grad_norm": 1.68778991778817, + "kl": 0.11181640625, + "learning_rate": 9.994311054279191e-07, + "loss": 0.0045, + "reward": 1.5412499904632568, + "reward_std": 0.333090215921402, + "rewards/accuracy_reward": 0.42500001192092896, + "rewards/format_reward": 1.0, + "step": 799, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 458.57501220703125, + "epoch": 0.015204789508695239, + "grad_norm": 1.748309733250112, + "kl": 0.07373046875, + "learning_rate": 9.9942968079129e-07, + "loss": 0.003, + "reward": 1.7813431024551392, + "reward_std": 0.3278692662715912, + "rewards/accuracy_reward": 0.6700930595397949, + "rewards/format_reward": 0.925000011920929, + "step": 800, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 470.6000061035156, + "epoch": 0.015223795495581107, + "grad_norm": 1.462241991884523, + "kl": 0.11962890625, + "learning_rate": 9.994282543741077e-07, + "loss": 0.0048, + "reward": 1.90625, + "reward_std": 0.2457003891468048, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 801, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 478.125, + "epoch": 0.015242801482466976, + "grad_norm": 2.082936475391673, + "kl": 0.10986328125, + "learning_rate": 9.994268261763773e-07, + "loss": 0.0044, + "reward": 1.5495386123657227, + "reward_std": 0.21620170772075653, + "rewards/accuracy_reward": 0.4370386302471161, + "rewards/format_reward": 1.0, + "step": 802, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 437.7749938964844, + "epoch": 0.015261807469352845, + "grad_norm": 6.520626729244516, + "kl": 0.07568359375, + "learning_rate": 9.994253961981038e-07, + "loss": 0.003, + "reward": 1.5650115013122559, + "reward_std": 0.06255774945020676, + "rewards/accuracy_reward": 0.43251147866249084, + "rewards/format_reward": 1.0, + "step": 803, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 450.3999938964844, + "epoch": 0.015280813456238716, + "grad_norm": 3.0711017255804527, + "kl": 0.1005859375, + "learning_rate": 9.994239644392925e-07, + "loss": 0.004, + "reward": 1.5605331659317017, + "reward_std": 0.05940740182995796, + "rewards/accuracy_reward": 0.42428311705589294, + "rewards/format_reward": 1.0, + "step": 804, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 468.1499938964844, + "epoch": 0.015299819443124585, + "grad_norm": 1.3510199743183433, + "kl": 0.08056640625, + "learning_rate": 9.994225308999485e-07, + "loss": 0.0032, + "reward": 1.7200106382369995, + "reward_std": 0.29686713218688965, + "rewards/accuracy_reward": 0.7100105881690979, + "rewards/format_reward": 1.0, + "step": 805, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 476.8999938964844, + "epoch": 0.015318825430010454, + "grad_norm": 1.386754069916214, + "kl": 0.12451171875, + "learning_rate": 9.994210955800768e-07, + "loss": 0.005, + "reward": 1.6737499237060547, + "reward_std": 0.04467545449733734, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 806, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 455.5249938964844, + "epoch": 0.015337831416896323, + "grad_norm": 1.4872206246572202, + "kl": 0.1103515625, + "learning_rate": 9.994196584796827e-07, + "loss": 0.0044, + "reward": 2.0174999237060547, + "reward_std": 0.3088816702365875, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 807, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 457.8000183105469, + "epoch": 0.015356837403782192, + "grad_norm": 2.7290521499829854, + "kl": 0.1171875, + "learning_rate": 9.994182195987708e-07, + "loss": 0.0047, + "reward": 2.0900001525878906, + "reward_std": 0.0994642972946167, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 808, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 420.32501220703125, + "epoch": 0.01537584339066806, + "grad_norm": 1.5777825182643292, + "kl": 0.07275390625, + "learning_rate": 9.99416778937347e-07, + "loss": 0.0029, + "reward": 1.8416575193405151, + "reward_std": 0.14976145327091217, + "rewards/accuracy_reward": 0.6754074096679688, + "rewards/format_reward": 1.0, + "step": 809, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 440.3000183105469, + "epoch": 0.01539484937755393, + "grad_norm": 2.2896139769601613, + "kl": 0.0751953125, + "learning_rate": 9.994153364954157e-07, + "loss": 0.003, + "reward": 1.5402681827545166, + "reward_std": 0.05909978225827217, + "rewards/accuracy_reward": 0.5115181803703308, + "rewards/format_reward": 1.0, + "step": 810, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 404.4750061035156, + "epoch": 0.015413855364439798, + "grad_norm": 3.144694858752442, + "kl": 0.1171875, + "learning_rate": 9.994138922729825e-07, + "loss": 0.0047, + "reward": 1.9600000381469727, + "reward_std": 0.03009721077978611, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 811, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 423.5, + "epoch": 0.015432861351325667, + "grad_norm": 2.234117683871632, + "kl": 0.08154296875, + "learning_rate": 9.994124462700525e-07, + "loss": 0.0033, + "reward": 1.8830076456069946, + "reward_std": 0.13665837049484253, + "rewards/accuracy_reward": 0.6817578077316284, + "rewards/format_reward": 1.0, + "step": 812, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 448.6750183105469, + "epoch": 0.015451867338211536, + "grad_norm": 2.371888037720843, + "kl": 0.10791015625, + "learning_rate": 9.994109984866308e-07, + "loss": 0.0043, + "reward": 1.7918812036514282, + "reward_std": 0.32109126448631287, + "rewards/accuracy_reward": 0.6243812441825867, + "rewards/format_reward": 1.0, + "step": 813, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 446.1499938964844, + "epoch": 0.015470873325097405, + "grad_norm": 1.3257033955433293, + "kl": 0.1259765625, + "learning_rate": 9.994095489227224e-07, + "loss": 0.005, + "reward": 1.6075000762939453, + "reward_std": 0.26312994956970215, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 814, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 411.57501220703125, + "epoch": 0.015489879311983274, + "grad_norm": 2.288194544489373, + "kl": 0.10498046875, + "learning_rate": 9.994080975783329e-07, + "loss": 0.0042, + "reward": 1.839240312576294, + "reward_std": 0.29091987013816833, + "rewards/accuracy_reward": 0.6767401695251465, + "rewards/format_reward": 1.0, + "step": 815, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 451.32501220703125, + "epoch": 0.015508885298869143, + "grad_norm": 2.3909629602223723, + "kl": 0.08154296875, + "learning_rate": 9.994066444534668e-07, + "loss": 0.0033, + "reward": 1.6088298559188843, + "reward_std": 0.1844092607498169, + "rewards/accuracy_reward": 0.458829790353775, + "rewards/format_reward": 1.0, + "step": 816, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 505.6499938964844, + "epoch": 0.015527891285755014, + "grad_norm": 1.3999538222360683, + "kl": 0.13671875, + "learning_rate": 9.994051895481298e-07, + "loss": 0.0055, + "reward": 1.9837497472763062, + "reward_std": 0.350053071975708, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 0.925000011920929, + "step": 817, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 465.6499938964844, + "epoch": 0.015546897272640883, + "grad_norm": 1.877393189163533, + "kl": 0.1044921875, + "learning_rate": 9.99403732862327e-07, + "loss": 0.0042, + "reward": 1.436156153678894, + "reward_std": 0.15536737442016602, + "rewards/accuracy_reward": 0.3149060010910034, + "rewards/format_reward": 1.0, + "step": 818, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 430.1750183105469, + "epoch": 0.015565903259526752, + "grad_norm": 2.896829914699599, + "kl": 0.09814453125, + "learning_rate": 9.994022743960638e-07, + "loss": 0.0039, + "reward": 1.9121776819229126, + "reward_std": 0.18449024856090546, + "rewards/accuracy_reward": 0.8334276080131531, + "rewards/format_reward": 1.0, + "step": 819, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 446.9250183105469, + "epoch": 0.01558490924641262, + "grad_norm": 1.6359786073151226, + "kl": 0.1015625, + "learning_rate": 9.994008141493447e-07, + "loss": 0.0041, + "reward": 1.7087547779083252, + "reward_std": 0.19845259189605713, + "rewards/accuracy_reward": 0.6412548422813416, + "rewards/format_reward": 1.0, + "step": 820, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 454.0249938964844, + "epoch": 0.01560391523329849, + "grad_norm": 2.0659525121154276, + "kl": 0.0859375, + "learning_rate": 9.993993521221757e-07, + "loss": 0.0034, + "reward": 1.72882080078125, + "reward_std": 0.03259288892149925, + "rewards/accuracy_reward": 0.6288207173347473, + "rewards/format_reward": 1.0, + "step": 821, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 381.45001220703125, + "epoch": 0.015622921220184358, + "grad_norm": 2.3887967683025733, + "kl": 0.1123046875, + "learning_rate": 9.993978883145614e-07, + "loss": 0.0045, + "reward": 1.8892822265625, + "reward_std": 0.06339588016271591, + "rewards/accuracy_reward": 0.7130321860313416, + "rewards/format_reward": 1.0, + "step": 822, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 423.45001220703125, + "epoch": 0.015641927207070225, + "grad_norm": 2.9798839148968446, + "kl": 0.07421875, + "learning_rate": 9.993964227265074e-07, + "loss": 0.003, + "reward": 1.58841073513031, + "reward_std": 0.1449354737997055, + "rewards/accuracy_reward": 0.500910758972168, + "rewards/format_reward": 1.0, + "step": 823, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 457.1000061035156, + "epoch": 0.015660933193956096, + "grad_norm": 1.5512224124952236, + "kl": 0.1416015625, + "learning_rate": 9.99394955358019e-07, + "loss": 0.0056, + "reward": 1.8575000762939453, + "reward_std": 0.1376771777868271, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 824, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 438.125, + "epoch": 0.015679939180841967, + "grad_norm": 1.5639207551772911, + "kl": 0.09130859375, + "learning_rate": 9.993934862091009e-07, + "loss": 0.0036, + "reward": 1.70260751247406, + "reward_std": 0.232549786567688, + "rewards/accuracy_reward": 0.6038575768470764, + "rewards/format_reward": 1.0, + "step": 825, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 438.3999938964844, + "epoch": 0.015698945167727834, + "grad_norm": 2.3209589552496106, + "kl": 0.10205078125, + "learning_rate": 9.99392015279759e-07, + "loss": 0.0041, + "reward": 1.8627609014511108, + "reward_std": 0.1048198714852333, + "rewards/accuracy_reward": 0.7027609348297119, + "rewards/format_reward": 1.0, + "step": 826, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 470.7250061035156, + "epoch": 0.015717951154613705, + "grad_norm": 2.3085009505564713, + "kl": 0.1494140625, + "learning_rate": 9.99390542569998e-07, + "loss": 0.006, + "reward": 2.1050000190734863, + "reward_std": 0.10080789774656296, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 827, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 451.5249938964844, + "epoch": 0.015736957141499572, + "grad_norm": 1.7331765865917723, + "kl": 0.10791015625, + "learning_rate": 9.993890680798234e-07, + "loss": 0.0043, + "reward": 1.4598677158355713, + "reward_std": 0.23452387750148773, + "rewards/accuracy_reward": 0.37736770510673523, + "rewards/format_reward": 1.0, + "step": 828, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 446.70001220703125, + "epoch": 0.015755963128385442, + "grad_norm": 2.0426688994806304, + "kl": 0.078125, + "learning_rate": 9.993875918092406e-07, + "loss": 0.0031, + "reward": 1.8170711994171143, + "reward_std": 0.11288689821958542, + "rewards/accuracy_reward": 0.7020711302757263, + "rewards/format_reward": 1.0, + "step": 829, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 451.95001220703125, + "epoch": 0.01577496911527131, + "grad_norm": 2.0469231909203396, + "kl": 0.12255859375, + "learning_rate": 9.993861137582544e-07, + "loss": 0.0049, + "reward": 2.0579166412353516, + "reward_std": 0.18174327909946442, + "rewards/accuracy_reward": 0.8716667294502258, + "rewards/format_reward": 1.0, + "step": 830, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 455.625, + "epoch": 0.01579397510215718, + "grad_norm": 1.9212226372226346, + "kl": 0.09765625, + "learning_rate": 9.993846339268706e-07, + "loss": 0.0039, + "reward": 1.7817646265029907, + "reward_std": 0.22643624246120453, + "rewards/accuracy_reward": 0.6630145907402039, + "rewards/format_reward": 1.0, + "step": 831, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 485.0500183105469, + "epoch": 0.015812981089043048, + "grad_norm": 1.7117025823471566, + "kl": 0.125, + "learning_rate": 9.993831523150941e-07, + "loss": 0.005, + "reward": 2.002500057220459, + "reward_std": 0.3246128559112549, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 832, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 478.6750183105469, + "epoch": 0.015831987075928918, + "grad_norm": 2.0869795193340797, + "kl": 0.107421875, + "learning_rate": 9.993816689229306e-07, + "loss": 0.0043, + "reward": 2.0425000190734863, + "reward_std": 0.29715046286582947, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 833, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 474.375, + "epoch": 0.015850993062814785, + "grad_norm": 1.4969927994579393, + "kl": 0.12109375, + "learning_rate": 9.993801837503847e-07, + "loss": 0.0048, + "reward": 2.1942081451416016, + "reward_std": 0.06370045244693756, + "rewards/accuracy_reward": 0.9979583621025085, + "rewards/format_reward": 1.0, + "step": 834, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 439.9250183105469, + "epoch": 0.015869999049700656, + "grad_norm": 1.6378010011346011, + "kl": 0.076171875, + "learning_rate": 9.993786967974624e-07, + "loss": 0.003, + "reward": 1.6663583517074585, + "reward_std": 0.2924807071685791, + "rewards/accuracy_reward": 0.621358335018158, + "rewards/format_reward": 0.925000011920929, + "step": 835, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 419.5, + "epoch": 0.015889005036586523, + "grad_norm": 1.9354578913615843, + "kl": 0.08154296875, + "learning_rate": 9.993772080641687e-07, + "loss": 0.0033, + "reward": 1.8075135946273804, + "reward_std": 0.27105510234832764, + "rewards/accuracy_reward": 0.6525136232376099, + "rewards/format_reward": 1.0, + "step": 836, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 428.82501220703125, + "epoch": 0.015908011023472394, + "grad_norm": 1.7627924290890316, + "kl": 0.11279296875, + "learning_rate": 9.99375717550509e-07, + "loss": 0.0045, + "reward": 1.7273215055465698, + "reward_std": 0.23579041659832, + "rewards/accuracy_reward": 0.543571412563324, + "rewards/format_reward": 1.0, + "step": 837, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 453.5249938964844, + "epoch": 0.015927017010358265, + "grad_norm": 1.274599125249701, + "kl": 0.12890625, + "learning_rate": 9.993742252564883e-07, + "loss": 0.0052, + "reward": 1.4049999713897705, + "reward_std": 0.3525925576686859, + "rewards/accuracy_reward": 0.3499999940395355, + "rewards/format_reward": 1.0, + "step": 838, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 427.375, + "epoch": 0.01594602299724413, + "grad_norm": 1.8369478545790794, + "kl": 0.09228515625, + "learning_rate": 9.99372731182112e-07, + "loss": 0.0037, + "reward": 1.7975000143051147, + "reward_std": 0.21443481743335724, + "rewards/accuracy_reward": 0.7350000739097595, + "rewards/format_reward": 0.9750000238418579, + "step": 839, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 422.1000061035156, + "epoch": 0.015965028984130002, + "grad_norm": 1.8753793570007402, + "kl": 0.1357421875, + "learning_rate": 9.99371235327386e-07, + "loss": 0.0054, + "reward": 2.102954626083374, + "reward_std": 0.09214019030332565, + "rewards/accuracy_reward": 0.9279546141624451, + "rewards/format_reward": 1.0, + "step": 840, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 472.5, + "epoch": 0.01598403497101587, + "grad_norm": 1.4138033024464294, + "kl": 0.11669921875, + "learning_rate": 9.99369737692315e-07, + "loss": 0.0047, + "reward": 1.8562500476837158, + "reward_std": 0.21997599303722382, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 841, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 407.70001220703125, + "epoch": 0.01600304095790174, + "grad_norm": 1.555435218787223, + "kl": 0.0986328125, + "learning_rate": 9.993682382769045e-07, + "loss": 0.0039, + "reward": 2.0625, + "reward_std": 0.17160551249980927, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 842, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 418.25, + "epoch": 0.016022046944787607, + "grad_norm": 1.6548112802951471, + "kl": 0.130859375, + "learning_rate": 9.993667370811598e-07, + "loss": 0.0052, + "reward": 1.818750023841858, + "reward_std": 0.15131191909313202, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 843, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 418.6000061035156, + "epoch": 0.016041052931673478, + "grad_norm": 1.7388815142539007, + "kl": 0.103515625, + "learning_rate": 9.993652341050865e-07, + "loss": 0.0041, + "reward": 1.7882276773452759, + "reward_std": 0.19487141072750092, + "rewards/accuracy_reward": 0.6319777369499207, + "rewards/format_reward": 1.0, + "step": 844, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 441.3999938964844, + "epoch": 0.016060058918559345, + "grad_norm": 2.3646734653737314, + "kl": 0.10205078125, + "learning_rate": 9.993637293486897e-07, + "loss": 0.0041, + "reward": 1.7157411575317383, + "reward_std": 0.2658672332763672, + "rewards/accuracy_reward": 0.6307411789894104, + "rewards/format_reward": 1.0, + "step": 845, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 418.2749938964844, + "epoch": 0.016079064905445216, + "grad_norm": 2.2928230050104363, + "kl": 0.09765625, + "learning_rate": 9.993622228119752e-07, + "loss": 0.0039, + "reward": 2.1500000953674316, + "reward_std": 0.09946425259113312, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 846, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 449.1000061035156, + "epoch": 0.016098070892331083, + "grad_norm": 4.330309305614554, + "kl": 0.12060546875, + "learning_rate": 9.993607144949477e-07, + "loss": 0.0048, + "reward": 1.993749976158142, + "reward_std": 0.2732623517513275, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 1.0, + "step": 847, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 398.2250061035156, + "epoch": 0.016117076879216954, + "grad_norm": 1.9516245579933782, + "kl": 0.142578125, + "learning_rate": 9.99359204397613e-07, + "loss": 0.0057, + "reward": 1.9774999618530273, + "reward_std": 0.25878724455833435, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 848, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 446.9750061035156, + "epoch": 0.01613608286610282, + "grad_norm": 2.0447718957624392, + "kl": 0.12890625, + "learning_rate": 9.993576925199765e-07, + "loss": 0.0052, + "reward": 2.2055232524871826, + "reward_std": 0.039260316640138626, + "rewards/accuracy_reward": 0.9142733812332153, + "rewards/format_reward": 1.0, + "step": 849, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 387.8000183105469, + "epoch": 0.01615508885298869, + "grad_norm": 2.2145563974573106, + "kl": 0.08251953125, + "learning_rate": 9.993561788620434e-07, + "loss": 0.0033, + "reward": 1.533031702041626, + "reward_std": 0.024825790897011757, + "rewards/accuracy_reward": 0.4392816126346588, + "rewards/format_reward": 1.0, + "step": 850, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 467.1499938964844, + "epoch": 0.01617409483987456, + "grad_norm": 2.981423734250105, + "kl": 0.11083984375, + "learning_rate": 9.993546634238192e-07, + "loss": 0.0044, + "reward": 2.0568604469299316, + "reward_std": 0.04423154518008232, + "rewards/accuracy_reward": 0.8406102061271667, + "rewards/format_reward": 1.0, + "step": 851, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 424.8999938964844, + "epoch": 0.01619310082676043, + "grad_norm": 1.4528924750481853, + "kl": 0.09765625, + "learning_rate": 9.993531462053093e-07, + "loss": 0.0039, + "reward": 1.5247740745544434, + "reward_std": 0.22725442051887512, + "rewards/accuracy_reward": 0.43977412581443787, + "rewards/format_reward": 1.0, + "step": 852, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 478.1750183105469, + "epoch": 0.0162121068136463, + "grad_norm": 2.236282492619206, + "kl": 0.1552734375, + "learning_rate": 9.99351627206519e-07, + "loss": 0.0062, + "reward": 2.28125, + "reward_std": 0.047425609081983566, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 853, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 444.8999938964844, + "epoch": 0.016231112800532167, + "grad_norm": 1.6565299598191192, + "kl": 0.115234375, + "learning_rate": 9.993501064274539e-07, + "loss": 0.0046, + "reward": 1.7637265920639038, + "reward_std": 0.1827048510313034, + "rewards/accuracy_reward": 0.6649765968322754, + "rewards/format_reward": 1.0, + "step": 854, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 480.95001220703125, + "epoch": 0.016250118787418038, + "grad_norm": 1.5293667400472255, + "kl": 0.1728515625, + "learning_rate": 9.993485838681193e-07, + "loss": 0.0069, + "reward": 2.0875000953674316, + "reward_std": 0.1455240547657013, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 855, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 471.8000183105469, + "epoch": 0.016269124774303905, + "grad_norm": 1.8992269382996032, + "kl": 0.1435546875, + "learning_rate": 9.993470595285206e-07, + "loss": 0.0057, + "reward": 2.286249876022339, + "reward_std": 0.04182329773902893, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 856, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 461.8000183105469, + "epoch": 0.016288130761189776, + "grad_norm": 2.212047363842615, + "kl": 0.1376953125, + "learning_rate": 9.993455334086635e-07, + "loss": 0.0055, + "reward": 2.2024998664855957, + "reward_std": 0.12062934786081314, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 857, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 455.5, + "epoch": 0.016307136748075643, + "grad_norm": 4.892652987176434, + "kl": 0.0693359375, + "learning_rate": 9.99344005508553e-07, + "loss": 0.0028, + "reward": 1.5883082151412964, + "reward_std": 0.10912847518920898, + "rewards/accuracy_reward": 0.5095581412315369, + "rewards/format_reward": 1.0, + "step": 858, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 509.0249938964844, + "epoch": 0.016326142734961514, + "grad_norm": 1.6402165764541399, + "kl": 0.08447265625, + "learning_rate": 9.99342475828195e-07, + "loss": 0.0034, + "reward": 1.508105754852295, + "reward_std": 0.37558695673942566, + "rewards/accuracy_reward": 0.5031058192253113, + "rewards/format_reward": 0.9000000357627869, + "step": 859, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 486.20001220703125, + "epoch": 0.01634514872184738, + "grad_norm": 1.6755708474898512, + "kl": 0.1279296875, + "learning_rate": 9.993409443675947e-07, + "loss": 0.0051, + "reward": 2.018625020980835, + "reward_std": 0.0413166843354702, + "rewards/accuracy_reward": 0.8523751497268677, + "rewards/format_reward": 1.0, + "step": 860, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 473.4250183105469, + "epoch": 0.01636415470873325, + "grad_norm": 1.5552095725377828, + "kl": 0.11181640625, + "learning_rate": 9.993394111267577e-07, + "loss": 0.0045, + "reward": 1.7212499380111694, + "reward_std": 0.3069811463356018, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9750000238418579, + "step": 861, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 473.2250061035156, + "epoch": 0.01638316069561912, + "grad_norm": 1.7240589687162269, + "kl": 0.1533203125, + "learning_rate": 9.993378761056892e-07, + "loss": 0.0061, + "reward": 2.192929983139038, + "reward_std": 0.07727370411157608, + "rewards/accuracy_reward": 0.9216799139976501, + "rewards/format_reward": 1.0, + "step": 862, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 487.07501220703125, + "epoch": 0.01640216668250499, + "grad_norm": 1.7580497691378385, + "kl": 0.056640625, + "learning_rate": 9.99336339304395e-07, + "loss": 0.0023, + "reward": 1.6729110479354858, + "reward_std": 0.20493152737617493, + "rewards/accuracy_reward": 0.7079111933708191, + "rewards/format_reward": 0.9750000238418579, + "step": 863, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 459.0249938964844, + "epoch": 0.016421172669390856, + "grad_norm": 1.9702491120937722, + "kl": 0.125, + "learning_rate": 9.9933480072288e-07, + "loss": 0.005, + "reward": 2.115000009536743, + "reward_std": 0.22647862136363983, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 864, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 458.1750183105469, + "epoch": 0.016440178656276727, + "grad_norm": 1.474895194088238, + "kl": 0.044677734375, + "learning_rate": 9.993332603611506e-07, + "loss": 0.0018, + "reward": 1.5189027786254883, + "reward_std": 0.10458987206220627, + "rewards/accuracy_reward": 0.5051528811454773, + "rewards/format_reward": 1.0, + "step": 865, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 471.6499938964844, + "epoch": 0.016459184643162598, + "grad_norm": 1.813972251346778, + "kl": 0.15234375, + "learning_rate": 9.993317182192117e-07, + "loss": 0.0061, + "reward": 1.7414944171905518, + "reward_std": 0.3115447163581848, + "rewards/accuracy_reward": 0.6439945101737976, + "rewards/format_reward": 0.949999988079071, + "step": 866, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 423.8500061035156, + "epoch": 0.016478190630048465, + "grad_norm": 1.7205660396982019, + "kl": 0.1171875, + "learning_rate": 9.993301742970685e-07, + "loss": 0.0047, + "reward": 1.8674999475479126, + "reward_std": 0.23858864605426788, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 867, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 417.0, + "epoch": 0.016497196616934336, + "grad_norm": 2.31565667012534, + "kl": 0.1015625, + "learning_rate": 9.993286285947272e-07, + "loss": 0.0041, + "reward": 1.7135841846466064, + "reward_std": 0.1332218199968338, + "rewards/accuracy_reward": 0.5673341155052185, + "rewards/format_reward": 1.0, + "step": 868, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 432.5, + "epoch": 0.016516202603820203, + "grad_norm": 1.8884284497793173, + "kl": 0.11181640625, + "learning_rate": 9.99327081112193e-07, + "loss": 0.0045, + "reward": 1.8396732807159424, + "reward_std": 0.11203023046255112, + "rewards/accuracy_reward": 0.754673421382904, + "rewards/format_reward": 1.0, + "step": 869, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 489.9750061035156, + "epoch": 0.016535208590706073, + "grad_norm": 2.104984276769367, + "kl": 0.1259765625, + "learning_rate": 9.993255318494716e-07, + "loss": 0.005, + "reward": 2.012500047683716, + "reward_std": 0.282286137342453, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 0.9750000238418579, + "step": 870, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 477.95001220703125, + "epoch": 0.01655421457759194, + "grad_norm": 1.9211525955631754, + "kl": 0.10791015625, + "learning_rate": 9.993239808065681e-07, + "loss": 0.0043, + "reward": 1.875353217124939, + "reward_std": 0.04591451212763786, + "rewards/accuracy_reward": 0.7191033363342285, + "rewards/format_reward": 1.0, + "step": 871, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 410.82501220703125, + "epoch": 0.01657322056447781, + "grad_norm": 1.8909875407142658, + "kl": 0.1142578125, + "learning_rate": 9.993224279834883e-07, + "loss": 0.0046, + "reward": 1.8530091047286987, + "reward_std": 0.16574768722057343, + "rewards/accuracy_reward": 0.7055088877677917, + "rewards/format_reward": 1.0, + "step": 872, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 480.3999938964844, + "epoch": 0.01659222655136368, + "grad_norm": 1.2802115653241215, + "kl": 0.126953125, + "learning_rate": 9.993208733802379e-07, + "loss": 0.0051, + "reward": 1.575060486793518, + "reward_std": 0.13075125217437744, + "rewards/accuracy_reward": 0.4925605356693268, + "rewards/format_reward": 1.0, + "step": 873, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 496.7250061035156, + "epoch": 0.01661123253824955, + "grad_norm": 1.5261699288032977, + "kl": 0.162109375, + "learning_rate": 9.99319316996822e-07, + "loss": 0.0065, + "reward": 2.216249942779541, + "reward_std": 0.05015389993786812, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 874, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 375.8000183105469, + "epoch": 0.016630238525135416, + "grad_norm": 3.478005387881262, + "kl": 0.1025390625, + "learning_rate": 9.993177588332466e-07, + "loss": 0.0041, + "reward": 1.8293765783309937, + "reward_std": 0.15665097534656525, + "rewards/accuracy_reward": 0.6006266474723816, + "rewards/format_reward": 1.0, + "step": 875, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 444.82501220703125, + "epoch": 0.016649244512021287, + "grad_norm": 1.723714483408541, + "kl": 0.1005859375, + "learning_rate": 9.99316198889517e-07, + "loss": 0.004, + "reward": 1.7071765661239624, + "reward_std": 0.055387865751981735, + "rewards/accuracy_reward": 0.6221765279769897, + "rewards/format_reward": 1.0, + "step": 876, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 426.6750183105469, + "epoch": 0.016668250498907154, + "grad_norm": 1.319722363716567, + "kl": 0.0693359375, + "learning_rate": 9.993146371656387e-07, + "loss": 0.0028, + "reward": 1.5111616849899292, + "reward_std": 0.1265857219696045, + "rewards/accuracy_reward": 0.5024116039276123, + "rewards/format_reward": 1.0, + "step": 877, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 423.8999938964844, + "epoch": 0.016687256485793025, + "grad_norm": 6.5689295087820065, + "kl": 0.1201171875, + "learning_rate": 9.993130736616176e-07, + "loss": 0.0048, + "reward": 1.5787500143051147, + "reward_std": 0.1456565111875534, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 878, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 442.5249938964844, + "epoch": 0.016706262472678896, + "grad_norm": 1.83588452778704, + "kl": 0.12158203125, + "learning_rate": 9.993115083774588e-07, + "loss": 0.0049, + "reward": 2.0285227298736572, + "reward_std": 0.054203104227781296, + "rewards/accuracy_reward": 0.8085227012634277, + "rewards/format_reward": 1.0, + "step": 879, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 446.0249938964844, + "epoch": 0.016725268459564763, + "grad_norm": 2.2596994671096984, + "kl": 0.099609375, + "learning_rate": 9.993099413131685e-07, + "loss": 0.004, + "reward": 2.103471040725708, + "reward_std": 0.07465606927871704, + "rewards/accuracy_reward": 0.9172208905220032, + "rewards/format_reward": 1.0, + "step": 880, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 417.5, + "epoch": 0.016744274446450633, + "grad_norm": 1.744277750918761, + "kl": 0.0888671875, + "learning_rate": 9.993083724687516e-07, + "loss": 0.0035, + "reward": 1.7909082174301147, + "reward_std": 0.05641747638583183, + "rewards/accuracy_reward": 0.5809082388877869, + "rewards/format_reward": 1.0, + "step": 881, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 448.2250061035156, + "epoch": 0.0167632804333365, + "grad_norm": 2.153572328587144, + "kl": 0.1123046875, + "learning_rate": 9.993068018442141e-07, + "loss": 0.0045, + "reward": 1.734042763710022, + "reward_std": 0.19810186326503754, + "rewards/accuracy_reward": 0.6402926445007324, + "rewards/format_reward": 1.0, + "step": 882, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 466.0, + "epoch": 0.01678228642022237, + "grad_norm": 1.7170656576305139, + "kl": 0.1123046875, + "learning_rate": 9.993052294395617e-07, + "loss": 0.0045, + "reward": 2.1014654636383057, + "reward_std": 0.25909802317619324, + "rewards/accuracy_reward": 0.9189655184745789, + "rewards/format_reward": 1.0, + "step": 883, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 424.6750183105469, + "epoch": 0.01680129240710824, + "grad_norm": 2.373601273643548, + "kl": 0.1142578125, + "learning_rate": 9.993036552547997e-07, + "loss": 0.0046, + "reward": 2.0, + "reward_std": 0.24250374734401703, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 884, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 432.625, + "epoch": 0.01682029839399411, + "grad_norm": 1.746439401988913, + "kl": 0.09130859375, + "learning_rate": 9.993020792899338e-07, + "loss": 0.0037, + "reward": 1.631177544593811, + "reward_std": 0.04874923452734947, + "rewards/accuracy_reward": 0.4924275875091553, + "rewards/format_reward": 1.0, + "step": 885, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 439.20001220703125, + "epoch": 0.016839304380879976, + "grad_norm": 1.7100199705566455, + "kl": 0.14453125, + "learning_rate": 9.9930050154497e-07, + "loss": 0.0058, + "reward": 1.9174998998641968, + "reward_std": 0.34676268696784973, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 886, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 404.5249938964844, + "epoch": 0.016858310367765847, + "grad_norm": 3.0662870955776023, + "kl": 0.09228515625, + "learning_rate": 9.992989220199131e-07, + "loss": 0.0037, + "reward": 1.720924735069275, + "reward_std": 0.30055004358291626, + "rewards/accuracy_reward": 0.5334247350692749, + "rewards/format_reward": 1.0, + "step": 887, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 384.45001220703125, + "epoch": 0.016877316354651714, + "grad_norm": 4.733601015078978, + "kl": 0.07763671875, + "learning_rate": 9.992973407147694e-07, + "loss": 0.0031, + "reward": 1.5069843530654907, + "reward_std": 0.2703723907470703, + "rewards/accuracy_reward": 0.48573437333106995, + "rewards/format_reward": 1.0, + "step": 888, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 414.3000183105469, + "epoch": 0.016896322341537585, + "grad_norm": 1.9393133107030889, + "kl": 0.1279296875, + "learning_rate": 9.992957576295444e-07, + "loss": 0.0051, + "reward": 1.5737501382827759, + "reward_std": 0.21649669110774994, + "rewards/accuracy_reward": 0.5375000834465027, + "rewards/format_reward": 1.0, + "step": 889, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 440.5500183105469, + "epoch": 0.016915328328423452, + "grad_norm": 1.8157109279591936, + "kl": 0.126953125, + "learning_rate": 9.992941727642437e-07, + "loss": 0.0051, + "reward": 1.6087499856948853, + "reward_std": 0.20222507417201996, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 1.0, + "step": 890, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 433.6000061035156, + "epoch": 0.016934334315309323, + "grad_norm": 7.040820298091219, + "kl": 0.07421875, + "learning_rate": 9.99292586118873e-07, + "loss": 0.003, + "reward": 1.6710137128829956, + "reward_std": 0.36479684710502625, + "rewards/accuracy_reward": 0.5985136032104492, + "rewards/format_reward": 1.0, + "step": 891, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 411.5500183105469, + "epoch": 0.01695334030219519, + "grad_norm": 2.831915634340959, + "kl": 0.08447265625, + "learning_rate": 9.992909976934379e-07, + "loss": 0.0034, + "reward": 1.7837499380111694, + "reward_std": 0.21281376481056213, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 892, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 461.7749938964844, + "epoch": 0.01697234628908106, + "grad_norm": 1.869869477482866, + "kl": 0.12109375, + "learning_rate": 9.992894074879438e-07, + "loss": 0.0049, + "reward": 1.991864800453186, + "reward_std": 0.11864250898361206, + "rewards/accuracy_reward": 0.7768649458885193, + "rewards/format_reward": 1.0, + "step": 893, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 437.8500061035156, + "epoch": 0.01699135227596693, + "grad_norm": 1.5980319565472654, + "kl": 0.1025390625, + "learning_rate": 9.99287815502397e-07, + "loss": 0.0041, + "reward": 2.0072453022003174, + "reward_std": 0.3231770694255829, + "rewards/accuracy_reward": 0.9172453880310059, + "rewards/format_reward": 0.9750000238418579, + "step": 894, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 422.875, + "epoch": 0.0170103582628528, + "grad_norm": 2.8889234715050813, + "kl": 0.12109375, + "learning_rate": 9.992862217368026e-07, + "loss": 0.0048, + "reward": 1.8533226251602173, + "reward_std": 0.29257944226264954, + "rewards/accuracy_reward": 0.6683226823806763, + "rewards/format_reward": 1.0, + "step": 895, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 421.70001220703125, + "epoch": 0.01702936424973867, + "grad_norm": 1.8171922834320466, + "kl": 0.12890625, + "learning_rate": 9.992846261911667e-07, + "loss": 0.0052, + "reward": 1.6515644788742065, + "reward_std": 0.40723466873168945, + "rewards/accuracy_reward": 0.5890643000602722, + "rewards/format_reward": 0.949999988079071, + "step": 896, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 433.7250061035156, + "epoch": 0.017048370236624536, + "grad_norm": 1.7504183513198104, + "kl": 0.1376953125, + "learning_rate": 9.992830288654946e-07, + "loss": 0.0055, + "reward": 2.0028178691864014, + "reward_std": 0.13226179778575897, + "rewards/accuracy_reward": 0.8440677523612976, + "rewards/format_reward": 1.0, + "step": 897, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 478.875, + "epoch": 0.017067376223510407, + "grad_norm": 1.6776428420008855, + "kl": 0.107421875, + "learning_rate": 9.992814297597921e-07, + "loss": 0.0043, + "reward": 1.8325001001358032, + "reward_std": 0.1587127298116684, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 898, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 405.4750061035156, + "epoch": 0.017086382210396274, + "grad_norm": 3.391580591063593, + "kl": 0.140625, + "learning_rate": 9.992798288740652e-07, + "loss": 0.0056, + "reward": 1.9079521894454956, + "reward_std": 0.03444410488009453, + "rewards/accuracy_reward": 0.7429521679878235, + "rewards/format_reward": 1.0, + "step": 899, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 417.6499938964844, + "epoch": 0.017105388197282145, + "grad_norm": 1.92102547604499, + "kl": 0.1630859375, + "learning_rate": 9.992782262083192e-07, + "loss": 0.0065, + "reward": 1.8000000715255737, + "reward_std": 0.1320357769727707, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 900, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 416.95001220703125, + "epoch": 0.017124394184168012, + "grad_norm": 1.5300812122541545, + "kl": 0.138671875, + "learning_rate": 9.992766217625602e-07, + "loss": 0.0055, + "reward": 1.4424999952316284, + "reward_std": 0.08856045454740524, + "rewards/accuracy_reward": 0.42500001192092896, + "rewards/format_reward": 1.0, + "step": 901, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 457.20001220703125, + "epoch": 0.017143400171053882, + "grad_norm": 1.5973634459051322, + "kl": 0.07470703125, + "learning_rate": 9.992750155367936e-07, + "loss": 0.003, + "reward": 1.6483417749404907, + "reward_std": 0.15044425427913666, + "rewards/accuracy_reward": 0.613341748714447, + "rewards/format_reward": 1.0, + "step": 902, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 381.4750061035156, + "epoch": 0.01716240615793975, + "grad_norm": 2.8127747506593352, + "kl": 0.10107421875, + "learning_rate": 9.992734075310252e-07, + "loss": 0.004, + "reward": 1.9759514331817627, + "reward_std": 0.15555985271930695, + "rewards/accuracy_reward": 0.8222013711929321, + "rewards/format_reward": 1.0, + "step": 903, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 379.875, + "epoch": 0.01718141214482562, + "grad_norm": 1.9238245977549657, + "kl": 0.1015625, + "learning_rate": 9.992717977452609e-07, + "loss": 0.0041, + "reward": 1.7420654296875, + "reward_std": 0.11957808583974838, + "rewards/accuracy_reward": 0.5795655250549316, + "rewards/format_reward": 1.0, + "step": 904, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 437.0249938964844, + "epoch": 0.017200418131711488, + "grad_norm": 1.9246966725259174, + "kl": 0.1337890625, + "learning_rate": 9.992701861795064e-07, + "loss": 0.0053, + "reward": 1.7537500858306885, + "reward_std": 0.018371179699897766, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 905, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 395.2749938964844, + "epoch": 0.017219424118597358, + "grad_norm": 3.2173579808888024, + "kl": 0.09716796875, + "learning_rate": 9.992685728337672e-07, + "loss": 0.0039, + "reward": 1.8408873081207275, + "reward_std": 0.17486131191253662, + "rewards/accuracy_reward": 0.6183871626853943, + "rewards/format_reward": 1.0, + "step": 906, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 375.1000061035156, + "epoch": 0.01723843010548323, + "grad_norm": 1.9334271577006181, + "kl": 0.11083984375, + "learning_rate": 9.992669577080492e-07, + "loss": 0.0044, + "reward": 2.0159237384796143, + "reward_std": 0.0631365180015564, + "rewards/accuracy_reward": 0.8484236001968384, + "rewards/format_reward": 1.0, + "step": 907, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 423.6750183105469, + "epoch": 0.017257436092369096, + "grad_norm": 1.7833943522477942, + "kl": 0.09521484375, + "learning_rate": 9.99265340802358e-07, + "loss": 0.0038, + "reward": 1.6475000381469727, + "reward_std": 0.16724412143230438, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 1.0, + "step": 908, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 431.4250183105469, + "epoch": 0.017276442079254967, + "grad_norm": 2.340604095428615, + "kl": 0.14453125, + "learning_rate": 9.992637221167e-07, + "loss": 0.0058, + "reward": 2.0562500953674316, + "reward_std": 0.046975038945674896, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 909, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 489.3999938964844, + "epoch": 0.017295448066140834, + "grad_norm": 2.0548023227236714, + "kl": 0.13671875, + "learning_rate": 9.992621016510803e-07, + "loss": 0.0055, + "reward": 1.8840579986572266, + "reward_std": 0.061809565871953964, + "rewards/accuracy_reward": 0.6840581297874451, + "rewards/format_reward": 1.0, + "step": 910, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 460.3999938964844, + "epoch": 0.017314454053026705, + "grad_norm": 2.2033052205495034, + "kl": 0.1337890625, + "learning_rate": 9.992604794055047e-07, + "loss": 0.0054, + "reward": 1.9399999380111694, + "reward_std": 0.2200370579957962, + "rewards/accuracy_reward": 0.8350000381469727, + "rewards/format_reward": 1.0, + "step": 911, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 477.5249938964844, + "epoch": 0.01733346003991257, + "grad_norm": 1.8187752496420266, + "kl": 0.1376953125, + "learning_rate": 9.992588553799794e-07, + "loss": 0.0055, + "reward": 2.045555591583252, + "reward_std": 0.1822027564048767, + "rewards/accuracy_reward": 0.9055555462837219, + "rewards/format_reward": 1.0, + "step": 912, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 490.0249938964844, + "epoch": 0.017352466026798442, + "grad_norm": 1.5538539437913994, + "kl": 0.1416015625, + "learning_rate": 9.9925722957451e-07, + "loss": 0.0056, + "reward": 1.6162500381469727, + "reward_std": 0.42723989486694336, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 0.9750000238418579, + "step": 913, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 457.125, + "epoch": 0.01737147201368431, + "grad_norm": 2.1621873194400587, + "kl": 0.1201171875, + "learning_rate": 9.99255601989102e-07, + "loss": 0.0048, + "reward": 1.8568352460861206, + "reward_std": 0.1532951146364212, + "rewards/accuracy_reward": 0.7430852055549622, + "rewards/format_reward": 1.0, + "step": 914, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 456.1750183105469, + "epoch": 0.01739047800057018, + "grad_norm": 1.6042727946718174, + "kl": 0.1142578125, + "learning_rate": 9.992539726237616e-07, + "loss": 0.0046, + "reward": 1.6074215173721313, + "reward_std": 0.18194305896759033, + "rewards/accuracy_reward": 0.6086716651916504, + "rewards/format_reward": 1.0, + "step": 915, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 428.8000183105469, + "epoch": 0.017409483987456047, + "grad_norm": 1.9681153378432128, + "kl": 0.1279296875, + "learning_rate": 9.992523414784945e-07, + "loss": 0.0051, + "reward": 1.6426811218261719, + "reward_std": 0.20662398636341095, + "rewards/accuracy_reward": 0.5626811385154724, + "rewards/format_reward": 1.0, + "step": 916, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 441.5249938964844, + "epoch": 0.017428489974341918, + "grad_norm": 1.6637915253080449, + "kl": 0.091796875, + "learning_rate": 9.992507085533065e-07, + "loss": 0.0037, + "reward": 1.8812862634658813, + "reward_std": 0.05521545559167862, + "rewards/accuracy_reward": 0.7700361609458923, + "rewards/format_reward": 1.0, + "step": 917, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 463.32501220703125, + "epoch": 0.017447495961227785, + "grad_norm": 2.2822791510295173, + "kl": 0.107421875, + "learning_rate": 9.992490738482035e-07, + "loss": 0.0043, + "reward": 1.5084000825881958, + "reward_std": 0.31936535239219666, + "rewards/accuracy_reward": 0.4284001290798187, + "rewards/format_reward": 1.0, + "step": 918, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 452.2749938964844, + "epoch": 0.017466501948113656, + "grad_norm": 2.0519751011605467, + "kl": 0.146484375, + "learning_rate": 9.992474373631911e-07, + "loss": 0.0059, + "reward": 1.8933333158493042, + "reward_std": 0.030097205191850662, + "rewards/accuracy_reward": 0.7333333492279053, + "rewards/format_reward": 1.0, + "step": 919, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 462.95001220703125, + "epoch": 0.017485507934999527, + "grad_norm": 2.518309606045907, + "kl": 0.08935546875, + "learning_rate": 9.992457990982752e-07, + "loss": 0.0036, + "reward": 2.0289437770843506, + "reward_std": 0.20214155316352844, + "rewards/accuracy_reward": 0.8114437460899353, + "rewards/format_reward": 1.0, + "step": 920, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 394.6750183105469, + "epoch": 0.017504513921885394, + "grad_norm": 3.374778912898044, + "kl": 0.09814453125, + "learning_rate": 9.992441590534619e-07, + "loss": 0.0039, + "reward": 1.776856780052185, + "reward_std": 0.20960001647472382, + "rewards/accuracy_reward": 0.6406068205833435, + "rewards/format_reward": 1.0, + "step": 921, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 425.70001220703125, + "epoch": 0.017523519908771264, + "grad_norm": 2.174622831364653, + "kl": 0.134765625, + "learning_rate": 9.992425172287568e-07, + "loss": 0.0054, + "reward": 1.5806818008422852, + "reward_std": 0.2188062220811844, + "rewards/accuracy_reward": 0.543181836605072, + "rewards/format_reward": 1.0, + "step": 922, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 420.1000061035156, + "epoch": 0.01754252589565713, + "grad_norm": 1.752949143156655, + "kl": 0.1259765625, + "learning_rate": 9.992408736241657e-07, + "loss": 0.0051, + "reward": 1.7737499475479126, + "reward_std": 0.1288391351699829, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 923, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 464.8999938964844, + "epoch": 0.017561531882543002, + "grad_norm": 1.7384423819909192, + "kl": 0.1279296875, + "learning_rate": 9.992392282396945e-07, + "loss": 0.0051, + "reward": 1.5212500095367432, + "reward_std": 0.23440158367156982, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 0.9750000238418579, + "step": 924, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 432.1750183105469, + "epoch": 0.01758053786942887, + "grad_norm": 10.388002464218191, + "kl": 0.126953125, + "learning_rate": 9.992375810753495e-07, + "loss": 0.0051, + "reward": 1.6720832586288452, + "reward_std": 0.026257777586579323, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/format_reward": 1.0, + "step": 925, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 450.95001220703125, + "epoch": 0.01759954385631474, + "grad_norm": 1.879091653090008, + "kl": 0.134765625, + "learning_rate": 9.99235932131136e-07, + "loss": 0.0054, + "reward": 2.0712502002716064, + "reward_std": 0.13815949857234955, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 926, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 443.95001220703125, + "epoch": 0.017618549843200607, + "grad_norm": 1.8656385520671657, + "kl": 0.1396484375, + "learning_rate": 9.9923428140706e-07, + "loss": 0.0056, + "reward": 2.1524999141693115, + "reward_std": 0.1306147426366806, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 927, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 414.25, + "epoch": 0.017637555830086478, + "grad_norm": 2.9581923607526366, + "kl": 0.09619140625, + "learning_rate": 9.992326289031276e-07, + "loss": 0.0039, + "reward": 1.9236233234405518, + "reward_std": 0.23222461342811584, + "rewards/accuracy_reward": 0.7536233067512512, + "rewards/format_reward": 1.0, + "step": 928, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 420.5249938964844, + "epoch": 0.017656561816972345, + "grad_norm": 2.056712629653743, + "kl": 0.140625, + "learning_rate": 9.992309746193444e-07, + "loss": 0.0056, + "reward": 1.837916612625122, + "reward_std": 0.17498302459716797, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/format_reward": 1.0, + "step": 929, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 487.75, + "epoch": 0.017675567803858216, + "grad_norm": 4.783569391328312, + "kl": 0.146484375, + "learning_rate": 9.992293185557167e-07, + "loss": 0.0059, + "reward": 1.823996901512146, + "reward_std": 0.2458491325378418, + "rewards/accuracy_reward": 0.6402468681335449, + "rewards/format_reward": 1.0, + "step": 930, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 464.3000183105469, + "epoch": 0.017694573790744083, + "grad_norm": 1.6907576700414495, + "kl": 0.125, + "learning_rate": 9.9922766071225e-07, + "loss": 0.005, + "reward": 1.8650000095367432, + "reward_std": 0.1977672576904297, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 931, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 463.5249938964844, + "epoch": 0.017713579777629954, + "grad_norm": 2.1710034956058215, + "kl": 0.146484375, + "learning_rate": 9.992260010889504e-07, + "loss": 0.0058, + "reward": 1.9315861463546753, + "reward_std": 0.30427151918411255, + "rewards/accuracy_reward": 0.7565861940383911, + "rewards/format_reward": 1.0, + "step": 932, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 455.45001220703125, + "epoch": 0.01773258576451582, + "grad_norm": 2.982162755567636, + "kl": 0.12060546875, + "learning_rate": 9.992243396858237e-07, + "loss": 0.0048, + "reward": 1.8697845935821533, + "reward_std": 0.0642734169960022, + "rewards/accuracy_reward": 0.6847846508026123, + "rewards/format_reward": 1.0, + "step": 933, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 425.625, + "epoch": 0.01775159175140169, + "grad_norm": 2.238340090834316, + "kl": 0.091796875, + "learning_rate": 9.99222676502876e-07, + "loss": 0.0037, + "reward": 1.5753962993621826, + "reward_std": 0.27225151658058167, + "rewards/accuracy_reward": 0.5853962898254395, + "rewards/format_reward": 0.949999988079071, + "step": 934, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 420.8999938964844, + "epoch": 0.017770597738287562, + "grad_norm": 1.7667811015325243, + "kl": 0.1181640625, + "learning_rate": 9.99221011540113e-07, + "loss": 0.0047, + "reward": 1.7462501525878906, + "reward_std": 0.21943913400173187, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 935, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 474.20001220703125, + "epoch": 0.01778960372517343, + "grad_norm": 1.331285004061845, + "kl": 0.11083984375, + "learning_rate": 9.992193447975412e-07, + "loss": 0.0044, + "reward": 1.5946146249771118, + "reward_std": 0.23505862057209015, + "rewards/accuracy_reward": 0.4983646869659424, + "rewards/format_reward": 0.9750000238418579, + "step": 936, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 434.7250061035156, + "epoch": 0.0178086097120593, + "grad_norm": 2.292171652878736, + "kl": 0.126953125, + "learning_rate": 9.99217676275166e-07, + "loss": 0.0051, + "reward": 1.6480646133422852, + "reward_std": 0.35812708735466003, + "rewards/accuracy_reward": 0.5080644488334656, + "rewards/format_reward": 1.0, + "step": 937, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 435.6499938964844, + "epoch": 0.017827615698945167, + "grad_norm": 1.7570461417337828, + "kl": 0.12060546875, + "learning_rate": 9.992160059729933e-07, + "loss": 0.0048, + "reward": 2.0587499141693115, + "reward_std": 0.2870778739452362, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 938, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 471.5, + "epoch": 0.017846621685831038, + "grad_norm": 5.372988577750309, + "kl": 0.162109375, + "learning_rate": 9.992143338910292e-07, + "loss": 0.0065, + "reward": 2.0, + "reward_std": 0.05981827899813652, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 939, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 464.8500061035156, + "epoch": 0.017865627672716905, + "grad_norm": 1.6155353055526307, + "kl": 0.1171875, + "learning_rate": 9.992126600292799e-07, + "loss": 0.0047, + "reward": 1.9077577590942383, + "reward_std": 0.08513978123664856, + "rewards/accuracy_reward": 0.8065077066421509, + "rewards/format_reward": 1.0, + "step": 940, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 448.6499938964844, + "epoch": 0.017884633659602776, + "grad_norm": 4.703343237748224, + "kl": 0.11572265625, + "learning_rate": 9.99210984387751e-07, + "loss": 0.0046, + "reward": 2.0587499141693115, + "reward_std": 0.22609496116638184, + "rewards/accuracy_reward": 0.8375000357627869, + "rewards/format_reward": 1.0, + "step": 941, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 430.4750061035156, + "epoch": 0.017903639646488643, + "grad_norm": 2.0665089701744535, + "kl": 0.11083984375, + "learning_rate": 9.992093069664485e-07, + "loss": 0.0044, + "reward": 1.8625000715255737, + "reward_std": 0.2017369121313095, + "rewards/accuracy_reward": 0.7550000548362732, + "rewards/format_reward": 1.0, + "step": 942, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 494.95001220703125, + "epoch": 0.017922645633374513, + "grad_norm": 2.0539424581902113, + "kl": 0.1259765625, + "learning_rate": 9.992076277653787e-07, + "loss": 0.005, + "reward": 1.5616008043289185, + "reward_std": 0.2615428566932678, + "rewards/accuracy_reward": 0.6441008448600769, + "rewards/format_reward": 0.824999988079071, + "step": 943, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 462.125, + "epoch": 0.01794165162026038, + "grad_norm": 2.6413124699177715, + "kl": 0.15234375, + "learning_rate": 9.992059467845474e-07, + "loss": 0.0061, + "reward": 1.932103157043457, + "reward_std": 0.038227807730436325, + "rewards/accuracy_reward": 0.7308531999588013, + "rewards/format_reward": 1.0, + "step": 944, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 459.5500183105469, + "epoch": 0.01796065760714625, + "grad_norm": 3.5179101772434915, + "kl": 0.361328125, + "learning_rate": 9.992042640239606e-07, + "loss": 0.0145, + "reward": 2.0274999141693115, + "reward_std": 0.3652125895023346, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.925000011920929, + "step": 945, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 399.7749938964844, + "epoch": 0.01797966359403212, + "grad_norm": 3.1089886476084816, + "kl": 0.373046875, + "learning_rate": 9.99202579483624e-07, + "loss": 0.0149, + "reward": 1.3112766742706299, + "reward_std": 0.16472071409225464, + "rewards/accuracy_reward": 0.2525266110897064, + "rewards/format_reward": 1.0, + "step": 946, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 411.1750183105469, + "epoch": 0.01799866958091799, + "grad_norm": 2.840354424079927, + "kl": 0.28125, + "learning_rate": 9.992008931635443e-07, + "loss": 0.0112, + "reward": 1.9889267683029175, + "reward_std": 0.476940393447876, + "rewards/accuracy_reward": 0.8439265489578247, + "rewards/format_reward": 0.949999988079071, + "step": 947, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 416.4750061035156, + "epoch": 0.01801767556780386, + "grad_norm": 2.5975677131780777, + "kl": 0.27734375, + "learning_rate": 9.99199205063727e-07, + "loss": 0.0111, + "reward": 1.894955039024353, + "reward_std": 0.19314749538898468, + "rewards/accuracy_reward": 0.7112049460411072, + "rewards/format_reward": 1.0, + "step": 948, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 413.7250061035156, + "epoch": 0.018036681554689727, + "grad_norm": 12.987690832168017, + "kl": 0.1220703125, + "learning_rate": 9.99197515184178e-07, + "loss": 0.0049, + "reward": 1.8079410791397095, + "reward_std": 0.05968896299600601, + "rewards/accuracy_reward": 0.7729408740997314, + "rewards/format_reward": 1.0, + "step": 949, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 459.8999938964844, + "epoch": 0.018055687541575598, + "grad_norm": 2.0117058108923933, + "kl": 0.1533203125, + "learning_rate": 9.991958235249039e-07, + "loss": 0.0061, + "reward": 1.973196029663086, + "reward_std": 0.15524807572364807, + "rewards/accuracy_reward": 0.7856961488723755, + "rewards/format_reward": 1.0, + "step": 950, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 468.375, + "epoch": 0.018074693528461465, + "grad_norm": 2.7709125727215214, + "kl": 0.1259765625, + "learning_rate": 9.9919413008591e-07, + "loss": 0.005, + "reward": 1.6391013860702515, + "reward_std": 0.0913296490907669, + "rewards/accuracy_reward": 0.5053513646125793, + "rewards/format_reward": 1.0, + "step": 951, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 418.4750061035156, + "epoch": 0.018093699515347336, + "grad_norm": 3.0649735468559176, + "kl": 0.19140625, + "learning_rate": 9.991924348672031e-07, + "loss": 0.0077, + "reward": 2.096759796142578, + "reward_std": 0.06396092474460602, + "rewards/accuracy_reward": 0.8667598962783813, + "rewards/format_reward": 1.0, + "step": 952, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 424.375, + "epoch": 0.018112705502233203, + "grad_norm": 2.326711194017883, + "kl": 0.1220703125, + "learning_rate": 9.991907378687886e-07, + "loss": 0.0049, + "reward": 1.6628551483154297, + "reward_std": 0.25205954909324646, + "rewards/accuracy_reward": 0.6278550028800964, + "rewards/format_reward": 1.0, + "step": 953, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 438.875, + "epoch": 0.018131711489119073, + "grad_norm": 3.17294327314815, + "kl": 0.2373046875, + "learning_rate": 9.99189039090673e-07, + "loss": 0.0095, + "reward": 1.751649260520935, + "reward_std": 0.29257771372795105, + "rewards/accuracy_reward": 0.7066492438316345, + "rewards/format_reward": 0.9750000238418579, + "step": 954, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 403.375, + "epoch": 0.01815071747600494, + "grad_norm": 4.620806520620864, + "kl": 0.2216796875, + "learning_rate": 9.99187338532862e-07, + "loss": 0.0089, + "reward": 2.1465559005737305, + "reward_std": 0.06309028714895248, + "rewards/accuracy_reward": 0.9103057980537415, + "rewards/format_reward": 1.0, + "step": 955, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 427.375, + "epoch": 0.01816972346289081, + "grad_norm": 4.196303590071475, + "kl": 0.3125, + "learning_rate": 9.991856361953619e-07, + "loss": 0.0125, + "reward": 2.0, + "reward_std": 0.05275532230734825, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 956, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 458.1499938964844, + "epoch": 0.01818872944977668, + "grad_norm": 1.5539313885356878, + "kl": 0.099609375, + "learning_rate": 9.991839320781787e-07, + "loss": 0.004, + "reward": 1.4706287384033203, + "reward_std": 0.326867014169693, + "rewards/accuracy_reward": 0.4743788242340088, + "rewards/format_reward": 0.9750000238418579, + "step": 957, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 473.2250061035156, + "epoch": 0.01820773543666255, + "grad_norm": 3.3356646775788477, + "kl": 0.427734375, + "learning_rate": 9.991822261813186e-07, + "loss": 0.0171, + "reward": 1.7416805028915405, + "reward_std": 0.6748544573783875, + "rewards/accuracy_reward": 0.7441805601119995, + "rewards/format_reward": 0.8500000238418579, + "step": 958, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 463.6000061035156, + "epoch": 0.018226741423548416, + "grad_norm": 3.093438098821877, + "kl": 0.294921875, + "learning_rate": 9.991805185047873e-07, + "loss": 0.0118, + "reward": 1.4122883081436157, + "reward_std": 0.1356745809316635, + "rewards/accuracy_reward": 0.42978811264038086, + "rewards/format_reward": 0.9750000238418579, + "step": 959, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 428.1499938964844, + "epoch": 0.018245747410434287, + "grad_norm": 4.588428440906342, + "kl": 0.5859375, + "learning_rate": 9.991788090485913e-07, + "loss": 0.0234, + "reward": 1.465193748474121, + "reward_std": 0.13266170024871826, + "rewards/accuracy_reward": 0.4364437758922577, + "rewards/format_reward": 0.949999988079071, + "step": 960, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 430.07501220703125, + "epoch": 0.018264753397320158, + "grad_norm": 2.2541051628772797, + "kl": 0.2734375, + "learning_rate": 9.991770978127365e-07, + "loss": 0.0109, + "reward": 1.6100000143051147, + "reward_std": 0.22446639835834503, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 961, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 432.4750061035156, + "epoch": 0.018283759384206025, + "grad_norm": 4.152128087108797, + "kl": 0.275390625, + "learning_rate": 9.99175384797229e-07, + "loss": 0.011, + "reward": 1.8690799474716187, + "reward_std": 0.19647662341594696, + "rewards/accuracy_reward": 0.7065801024436951, + "rewards/format_reward": 1.0, + "step": 962, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 481.5, + "epoch": 0.018302765371091895, + "grad_norm": 2.585845757439392, + "kl": 0.291015625, + "learning_rate": 9.991736700020751e-07, + "loss": 0.0117, + "reward": 1.7849998474121094, + "reward_std": 0.4497860074043274, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 963, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 449.32501220703125, + "epoch": 0.018321771357977763, + "grad_norm": 3.2951027124883185, + "kl": 0.2421875, + "learning_rate": 9.991719534272806e-07, + "loss": 0.0097, + "reward": 1.557499885559082, + "reward_std": 0.0724850744009018, + "rewards/accuracy_reward": 0.42500001192092896, + "rewards/format_reward": 1.0, + "step": 964, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 428.1000061035156, + "epoch": 0.018340777344863633, + "grad_norm": 4.381643439121722, + "kl": 0.37890625, + "learning_rate": 9.991702350728518e-07, + "loss": 0.0152, + "reward": 1.87496018409729, + "reward_std": 0.05095122382044792, + "rewards/accuracy_reward": 0.6649600863456726, + "rewards/format_reward": 1.0, + "step": 965, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 511.0, + "epoch": 0.0183597833317495, + "grad_norm": 15.244856557677098, + "kl": 0.30859375, + "learning_rate": 9.99168514938795e-07, + "loss": 0.0123, + "reward": 1.810625433921814, + "reward_std": 0.25420841574668884, + "rewards/accuracy_reward": 0.8168756365776062, + "rewards/format_reward": 0.949999988079071, + "step": 966, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 427.0500183105469, + "epoch": 0.01837878931863537, + "grad_norm": 2.1756852110703693, + "kl": 0.115234375, + "learning_rate": 9.99166793025116e-07, + "loss": 0.0046, + "reward": 1.7404800653457642, + "reward_std": 0.18577569723129272, + "rewards/accuracy_reward": 0.6242300868034363, + "rewards/format_reward": 1.0, + "step": 967, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 475.7250061035156, + "epoch": 0.01839779530552124, + "grad_norm": 2.113511488614174, + "kl": 0.0908203125, + "learning_rate": 9.991650693318213e-07, + "loss": 0.0036, + "reward": 1.9334135055541992, + "reward_std": 0.055705346167087555, + "rewards/accuracy_reward": 0.8371635675430298, + "rewards/format_reward": 1.0, + "step": 968, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 496.0500183105469, + "epoch": 0.01841680129240711, + "grad_norm": 2.628306037529305, + "kl": 0.1142578125, + "learning_rate": 9.991633438589164e-07, + "loss": 0.0046, + "reward": 1.7807127237319946, + "reward_std": 0.06006943807005882, + "rewards/accuracy_reward": 0.6657127141952515, + "rewards/format_reward": 1.0, + "step": 969, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 507.20001220703125, + "epoch": 0.018435807279292976, + "grad_norm": 2.8141559042053665, + "kl": 0.1328125, + "learning_rate": 9.99161616606408e-07, + "loss": 0.0053, + "reward": 1.6087499856948853, + "reward_std": 0.1750754415988922, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 1.0, + "step": 970, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 503.57501220703125, + "epoch": 0.018454813266178847, + "grad_norm": 1.9481236056538105, + "kl": 0.1337890625, + "learning_rate": 9.991598875743023e-07, + "loss": 0.0053, + "reward": 2.0799663066864014, + "reward_std": 0.06390620768070221, + "rewards/accuracy_reward": 0.8687164187431335, + "rewards/format_reward": 1.0, + "step": 971, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 482.32501220703125, + "epoch": 0.018473819253064714, + "grad_norm": 2.1944315347193553, + "kl": 0.10986328125, + "learning_rate": 9.991581567626053e-07, + "loss": 0.0044, + "reward": 1.6176159381866455, + "reward_std": 0.29334235191345215, + "rewards/accuracy_reward": 0.5413660407066345, + "rewards/format_reward": 1.0, + "step": 972, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 491.95001220703125, + "epoch": 0.018492825239950585, + "grad_norm": 1.4056729502158087, + "kl": 0.1318359375, + "learning_rate": 9.99156424171323e-07, + "loss": 0.0053, + "reward": 1.743749976158142, + "reward_std": 0.04665544256567955, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 973, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 447.3999938964844, + "epoch": 0.018511831226836452, + "grad_norm": 1.8518768973510429, + "kl": 0.0908203125, + "learning_rate": 9.99154689800462e-07, + "loss": 0.0036, + "reward": 1.3625515699386597, + "reward_std": 0.23056507110595703, + "rewards/accuracy_reward": 0.3863014876842499, + "rewards/format_reward": 0.9750000238418579, + "step": 974, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 481.1000061035156, + "epoch": 0.018530837213722322, + "grad_norm": 1.1990804074264203, + "kl": 0.134765625, + "learning_rate": 9.991529536500281e-07, + "loss": 0.0054, + "reward": 1.8637501001358032, + "reward_std": 0.1592256873846054, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 975, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 478.57501220703125, + "epoch": 0.018549843200608193, + "grad_norm": 1.9793808979708043, + "kl": 0.130859375, + "learning_rate": 9.991512157200274e-07, + "loss": 0.0052, + "reward": 1.8549998998641968, + "reward_std": 0.13763809204101562, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 976, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 468.3500061035156, + "epoch": 0.01856884918749406, + "grad_norm": 1.455494972313243, + "kl": 0.11328125, + "learning_rate": 9.991494760104666e-07, + "loss": 0.0045, + "reward": 1.954702377319336, + "reward_std": 0.24830415844917297, + "rewards/accuracy_reward": 0.7997024059295654, + "rewards/format_reward": 1.0, + "step": 977, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 495.95001220703125, + "epoch": 0.01858785517437993, + "grad_norm": 1.2845338422412447, + "kl": 0.1357421875, + "learning_rate": 9.991477345213516e-07, + "loss": 0.0054, + "reward": 1.7600164413452148, + "reward_std": 0.04295359179377556, + "rewards/accuracy_reward": 0.6800164580345154, + "rewards/format_reward": 1.0, + "step": 978, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 470.82501220703125, + "epoch": 0.018606861161265798, + "grad_norm": 2.524473888507463, + "kl": 0.1201171875, + "learning_rate": 9.991459912526885e-07, + "loss": 0.0048, + "reward": 1.9898589849472046, + "reward_std": 0.14100247621536255, + "rewards/accuracy_reward": 0.7848591208457947, + "rewards/format_reward": 1.0, + "step": 979, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 421.8500061035156, + "epoch": 0.01862586714815167, + "grad_norm": 1.5353370725495674, + "kl": 0.07763671875, + "learning_rate": 9.991442462044838e-07, + "loss": 0.0031, + "reward": 1.7439583539962769, + "reward_std": 0.05997762829065323, + "rewards/accuracy_reward": 0.7239583730697632, + "rewards/format_reward": 1.0, + "step": 980, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 514.6000366210938, + "epoch": 0.018644873135037536, + "grad_norm": 1.2507029833332797, + "kl": 0.08935546875, + "learning_rate": 9.991424993767435e-07, + "loss": 0.0036, + "reward": 1.5458621978759766, + "reward_std": 0.2544994652271271, + "rewards/accuracy_reward": 0.5408622026443481, + "rewards/format_reward": 0.9750000238418579, + "step": 981, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 513.2250366210938, + "epoch": 0.018663879121923407, + "grad_norm": 1.6842281728942927, + "kl": 0.1103515625, + "learning_rate": 9.991407507694739e-07, + "loss": 0.0044, + "reward": 1.60509192943573, + "reward_std": 0.37781214714050293, + "rewards/accuracy_reward": 0.6463419795036316, + "rewards/format_reward": 0.9000000357627869, + "step": 982, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 483.20001220703125, + "epoch": 0.018682885108809274, + "grad_norm": 1.7394526175220422, + "kl": 0.1259765625, + "learning_rate": 9.991390003826811e-07, + "loss": 0.005, + "reward": 1.96294105052948, + "reward_std": 0.06293322145938873, + "rewards/accuracy_reward": 0.7779411673545837, + "rewards/format_reward": 1.0, + "step": 983, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 470.6000061035156, + "epoch": 0.018701891095695145, + "grad_norm": 1.4538016558032507, + "kl": 0.1142578125, + "learning_rate": 9.991372482163715e-07, + "loss": 0.0046, + "reward": 1.256554365158081, + "reward_std": 0.22981882095336914, + "rewards/accuracy_reward": 0.19280432164669037, + "rewards/format_reward": 1.0, + "step": 984, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 437.5249938964844, + "epoch": 0.01872089708258101, + "grad_norm": 1.8900056804728786, + "kl": 0.07763671875, + "learning_rate": 9.991354942705515e-07, + "loss": 0.0031, + "reward": 1.4299341440200806, + "reward_std": 0.04026506096124649, + "rewards/accuracy_reward": 0.36118412017822266, + "rewards/format_reward": 1.0, + "step": 985, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 468.625, + "epoch": 0.018739903069466882, + "grad_norm": 1.915245768357843, + "kl": 0.0966796875, + "learning_rate": 9.99133738545227e-07, + "loss": 0.0039, + "reward": 1.7253516912460327, + "reward_std": 0.060850657522678375, + "rewards/accuracy_reward": 0.5203516483306885, + "rewards/format_reward": 1.0, + "step": 986, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 455.57501220703125, + "epoch": 0.01875890905635275, + "grad_norm": 1.5382300833377138, + "kl": 0.115234375, + "learning_rate": 9.991319810404045e-07, + "loss": 0.0046, + "reward": 1.631250023841858, + "reward_std": 0.12575089931488037, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 987, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 464.1499938964844, + "epoch": 0.01877791504323862, + "grad_norm": 3.1192437339873416, + "kl": 0.0791015625, + "learning_rate": 9.9913022175609e-07, + "loss": 0.0032, + "reward": 1.507525086402893, + "reward_std": 0.15524284541606903, + "rewards/accuracy_reward": 0.43752503395080566, + "rewards/format_reward": 1.0, + "step": 988, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 449.1499938964844, + "epoch": 0.01879692103012449, + "grad_norm": 1.54914774404728, + "kl": 0.091796875, + "learning_rate": 9.9912846069229e-07, + "loss": 0.0037, + "reward": 1.4654277563095093, + "reward_std": 0.17589350044727325, + "rewards/accuracy_reward": 0.4516778886318207, + "rewards/format_reward": 1.0, + "step": 989, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 469.82501220703125, + "epoch": 0.018815927017010358, + "grad_norm": 1.4213697094478461, + "kl": 0.103515625, + "learning_rate": 9.991266978490108e-07, + "loss": 0.0041, + "reward": 1.7612498998641968, + "reward_std": 0.2095511257648468, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 990, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 481.45001220703125, + "epoch": 0.01883493300389623, + "grad_norm": 1.6312655250495025, + "kl": 0.1201171875, + "learning_rate": 9.991249332262588e-07, + "loss": 0.0048, + "reward": 1.7371524572372437, + "reward_std": 0.032223377376794815, + "rewards/accuracy_reward": 0.6384023427963257, + "rewards/format_reward": 1.0, + "step": 991, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 485.20001220703125, + "epoch": 0.018853938990782096, + "grad_norm": 1.3182864645673726, + "kl": 0.1103515625, + "learning_rate": 9.9912316682404e-07, + "loss": 0.0044, + "reward": 1.933750033378601, + "reward_std": 0.13984672725200653, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 1.0, + "step": 992, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 480.5249938964844, + "epoch": 0.018872944977667967, + "grad_norm": 1.5844503657364932, + "kl": 0.1376953125, + "learning_rate": 9.991213986423608e-07, + "loss": 0.0055, + "reward": 1.4237499237060547, + "reward_std": 0.2008010596036911, + "rewards/accuracy_reward": 0.3499999940395355, + "rewards/format_reward": 1.0, + "step": 993, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 408.3999938964844, + "epoch": 0.018891950964553834, + "grad_norm": 1.8887526543539201, + "kl": 0.0703125, + "learning_rate": 9.991196286812277e-07, + "loss": 0.0028, + "reward": 1.6798213720321655, + "reward_std": 0.1683581918478012, + "rewards/accuracy_reward": 0.5910714268684387, + "rewards/format_reward": 1.0, + "step": 994, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 495.5249938964844, + "epoch": 0.018910956951439704, + "grad_norm": 1.8087575175397754, + "kl": 0.1416015625, + "learning_rate": 9.991178569406465e-07, + "loss": 0.0056, + "reward": 2.006077527999878, + "reward_std": 0.07102253288030624, + "rewards/accuracy_reward": 0.8798274993896484, + "rewards/format_reward": 1.0, + "step": 995, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 441.3000183105469, + "epoch": 0.01892996293832557, + "grad_norm": 1.7108546851866144, + "kl": 0.0869140625, + "learning_rate": 9.99116083420624e-07, + "loss": 0.0035, + "reward": 1.56125009059906, + "reward_std": 0.27193161845207214, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 1.0, + "step": 996, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 461.1499938964844, + "epoch": 0.018948968925211442, + "grad_norm": 9.216624129646492, + "kl": 0.10400390625, + "learning_rate": 9.991143081211663e-07, + "loss": 0.0042, + "reward": 2.0642831325531006, + "reward_std": 0.1092679500579834, + "rewards/accuracy_reward": 0.8342830538749695, + "rewards/format_reward": 1.0, + "step": 997, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.6, + "completion_length": 451.125, + "epoch": 0.01896797491209731, + "grad_norm": 1.4393901320995328, + "kl": 0.134765625, + "learning_rate": 9.9911253104228e-07, + "loss": 0.0054, + "reward": 1.4824999570846558, + "reward_std": 0.012247446924448013, + "rewards/accuracy_reward": 0.4000000059604645, + "rewards/format_reward": 1.0, + "step": 998, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 441.57501220703125, + "epoch": 0.01898698089898318, + "grad_norm": 1.8310984839526372, + "kl": 0.130859375, + "learning_rate": 9.99110752183971e-07, + "loss": 0.0052, + "reward": 1.5205000638961792, + "reward_std": 0.1124877855181694, + "rewards/accuracy_reward": 0.43800002336502075, + "rewards/format_reward": 1.0, + "step": 999, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 442.32501220703125, + "epoch": 0.019005986885869047, + "grad_norm": 1.9001223645439553, + "kl": 0.1240234375, + "learning_rate": 9.99108971546246e-07, + "loss": 0.005, + "reward": 1.680837631225586, + "reward_std": 0.2153915911912918, + "rewards/accuracy_reward": 0.5345876812934875, + "rewards/format_reward": 1.0, + "step": 1000, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 432.2749938964844, + "epoch": 0.019024992872754918, + "grad_norm": 1.9630811609313032, + "kl": 0.10107421875, + "learning_rate": 9.991071891291114e-07, + "loss": 0.004, + "reward": 1.687555193901062, + "reward_std": 0.05677928403019905, + "rewards/accuracy_reward": 0.5300551652908325, + "rewards/format_reward": 1.0, + "step": 1001, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 456.6750183105469, + "epoch": 0.019043998859640785, + "grad_norm": 1.3656022773433452, + "kl": 0.1376953125, + "learning_rate": 9.991054049325731e-07, + "loss": 0.0055, + "reward": 1.6224998235702515, + "reward_std": 0.12271543592214584, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 1.0, + "step": 1002, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 457.1499938964844, + "epoch": 0.019063004846526656, + "grad_norm": 2.318556812619703, + "kl": 0.134765625, + "learning_rate": 9.991036189566378e-07, + "loss": 0.0054, + "reward": 1.87375009059906, + "reward_std": 0.09894287586212158, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 1003, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 480.9750061035156, + "epoch": 0.019082010833412526, + "grad_norm": 1.903658302580525, + "kl": 0.12255859375, + "learning_rate": 9.991018312013118e-07, + "loss": 0.0049, + "reward": 2.0052125453948975, + "reward_std": 0.10790147632360458, + "rewards/accuracy_reward": 0.8402124643325806, + "rewards/format_reward": 1.0, + "step": 1004, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 448.5, + "epoch": 0.019101016820298394, + "grad_norm": 1.805451294129596, + "kl": 0.11279296875, + "learning_rate": 9.991000416666015e-07, + "loss": 0.0045, + "reward": 1.611193060874939, + "reward_std": 0.04346662759780884, + "rewards/accuracy_reward": 0.5111930966377258, + "rewards/format_reward": 1.0, + "step": 1005, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 425.07501220703125, + "epoch": 0.019120022807184264, + "grad_norm": 1.781836315521842, + "kl": 0.08642578125, + "learning_rate": 9.990982503525134e-07, + "loss": 0.0034, + "reward": 1.6278969049453735, + "reward_std": 0.31013163924217224, + "rewards/accuracy_reward": 0.5953971147537231, + "rewards/format_reward": 1.0, + "step": 1006, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 511.82501220703125, + "epoch": 0.01913902879407013, + "grad_norm": 1.5531280531862826, + "kl": 0.1201171875, + "learning_rate": 9.990964572590537e-07, + "loss": 0.0048, + "reward": 1.590000033378601, + "reward_std": 0.2036997526884079, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 1007, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 451.75, + "epoch": 0.019158034780956002, + "grad_norm": 1.40396899180214, + "kl": 0.09375, + "learning_rate": 9.990946623862286e-07, + "loss": 0.0037, + "reward": 1.6199522018432617, + "reward_std": 0.163608580827713, + "rewards/accuracy_reward": 0.47745224833488464, + "rewards/format_reward": 1.0, + "step": 1008, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 489.8999938964844, + "epoch": 0.01917704076784187, + "grad_norm": 1.834123059099525, + "kl": 0.12451171875, + "learning_rate": 9.990928657340448e-07, + "loss": 0.005, + "reward": 1.5168421268463135, + "reward_std": 0.41266241669654846, + "rewards/accuracy_reward": 0.4555921256542206, + "rewards/format_reward": 0.9750000238418579, + "step": 1009, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 469.95001220703125, + "epoch": 0.01919604675472774, + "grad_norm": 1.7488146322306355, + "kl": 0.1552734375, + "learning_rate": 9.99091067302509e-07, + "loss": 0.0062, + "reward": 2.257500171661377, + "reward_std": 0.12623165547847748, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 1010, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 466.0, + "epoch": 0.019215052741613607, + "grad_norm": 1.553807296078126, + "kl": 0.08642578125, + "learning_rate": 9.99089267091627e-07, + "loss": 0.0035, + "reward": 1.65625, + "reward_std": 0.0689966008067131, + "rewards/accuracy_reward": 0.5925000309944153, + "rewards/format_reward": 1.0, + "step": 1011, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 442.1000061035156, + "epoch": 0.019234058728499478, + "grad_norm": 2.090037041687299, + "kl": 0.17578125, + "learning_rate": 9.990874651014054e-07, + "loss": 0.007, + "reward": 1.7426666021347046, + "reward_std": 0.294862300157547, + "rewards/accuracy_reward": 0.5976666808128357, + "rewards/format_reward": 1.0, + "step": 1012, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 469.8500061035156, + "epoch": 0.019253064715385345, + "grad_norm": 3.0827752207802965, + "kl": 0.1435546875, + "learning_rate": 9.990856613318506e-07, + "loss": 0.0057, + "reward": 2.1619644165039062, + "reward_std": 0.03936680033802986, + "rewards/accuracy_reward": 0.9957141876220703, + "rewards/format_reward": 1.0, + "step": 1013, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 492.32501220703125, + "epoch": 0.019272070702271216, + "grad_norm": 2.2608813479083283, + "kl": 0.119140625, + "learning_rate": 9.990838557829694e-07, + "loss": 0.0048, + "reward": 1.7749525308609009, + "reward_std": 0.15113787353038788, + "rewards/accuracy_reward": 0.581202507019043, + "rewards/format_reward": 1.0, + "step": 1014, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 476.7250061035156, + "epoch": 0.019291076689157083, + "grad_norm": 2.4735848437679904, + "kl": 0.181640625, + "learning_rate": 9.990820484547677e-07, + "loss": 0.0073, + "reward": 1.8574999570846558, + "reward_std": 0.35503897070884705, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 1015, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 471.3500061035156, + "epoch": 0.019310082676042954, + "grad_norm": 2.0349285847414365, + "kl": 0.1611328125, + "learning_rate": 9.990802393472527e-07, + "loss": 0.0065, + "reward": 1.8112499713897705, + "reward_std": 0.1496996134519577, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1016, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 438.3500061035156, + "epoch": 0.019329088662928824, + "grad_norm": 21.869414747166775, + "kl": 0.1376953125, + "learning_rate": 9.9907842846043e-07, + "loss": 0.0055, + "reward": 1.9371391534805298, + "reward_std": 0.07876028120517731, + "rewards/accuracy_reward": 0.7758890390396118, + "rewards/format_reward": 1.0, + "step": 1017, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 448.32501220703125, + "epoch": 0.01934809464981469, + "grad_norm": 2.5556945127144792, + "kl": 0.142578125, + "learning_rate": 9.990766157943063e-07, + "loss": 0.0057, + "reward": 2.022892951965332, + "reward_std": 0.056258928030729294, + "rewards/accuracy_reward": 0.7966430187225342, + "rewards/format_reward": 1.0, + "step": 1018, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 421.0, + "epoch": 0.019367100636700562, + "grad_norm": 2.118697152745554, + "kl": 0.1396484375, + "learning_rate": 9.990748013488883e-07, + "loss": 0.0056, + "reward": 2.049999952316284, + "reward_std": 0.09520556777715683, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 1019, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 412.375, + "epoch": 0.01938610662358643, + "grad_norm": 2.163694047007911, + "kl": 0.146484375, + "learning_rate": 9.990729851241823e-07, + "loss": 0.0059, + "reward": 1.8013746738433838, + "reward_std": 0.17564113438129425, + "rewards/accuracy_reward": 0.6388747692108154, + "rewards/format_reward": 1.0, + "step": 1020, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 427.6750183105469, + "epoch": 0.0194051126104723, + "grad_norm": 1.981511956734752, + "kl": 0.1279296875, + "learning_rate": 9.99071167120195e-07, + "loss": 0.0051, + "reward": 1.7745332717895508, + "reward_std": 0.12141894549131393, + "rewards/accuracy_reward": 0.6207833290100098, + "rewards/format_reward": 1.0, + "step": 1021, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 449.875, + "epoch": 0.019424118597358167, + "grad_norm": 2.1361217390911644, + "kl": 0.07958984375, + "learning_rate": 9.990693473369325e-07, + "loss": 0.0032, + "reward": 1.4644477367401123, + "reward_std": 0.3071606457233429, + "rewards/accuracy_reward": 0.41444769501686096, + "rewards/format_reward": 1.0, + "step": 1022, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 410.95001220703125, + "epoch": 0.019443124584244038, + "grad_norm": 2.2489954767139735, + "kl": 0.10546875, + "learning_rate": 9.990675257744017e-07, + "loss": 0.0042, + "reward": 1.7889044284820557, + "reward_std": 0.11797785758972168, + "rewards/accuracy_reward": 0.690154492855072, + "rewards/format_reward": 1.0, + "step": 1023, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 404.2250061035156, + "epoch": 0.019462130571129905, + "grad_norm": 2.653349206098878, + "kl": 0.10693359375, + "learning_rate": 9.990657024326087e-07, + "loss": 0.0043, + "reward": 1.4250637292861938, + "reward_std": 0.25342002511024475, + "rewards/accuracy_reward": 0.3488137125968933, + "rewards/format_reward": 1.0, + "step": 1024, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 406.95001220703125, + "epoch": 0.019481136558015776, + "grad_norm": 1.7123506878473547, + "kl": 0.1279296875, + "learning_rate": 9.990638773115604e-07, + "loss": 0.0051, + "reward": 1.7537498474121094, + "reward_std": 0.01837117038667202, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 1025, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 397.125, + "epoch": 0.019500142544901643, + "grad_norm": 2.212712015139928, + "kl": 0.142578125, + "learning_rate": 9.99062050411263e-07, + "loss": 0.0057, + "reward": 1.9385731220245361, + "reward_std": 0.13834629952907562, + "rewards/accuracy_reward": 0.7660731673240662, + "rewards/format_reward": 1.0, + "step": 1026, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 426.75, + "epoch": 0.019519148531787513, + "grad_norm": 2.1062121885530103, + "kl": 0.177734375, + "learning_rate": 9.99060221731723e-07, + "loss": 0.0071, + "reward": 2.0250000953674316, + "reward_std": 0.024494869634509087, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 1027, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 392.1499938964844, + "epoch": 0.01953815451867338, + "grad_norm": 2.8179326490289958, + "kl": 0.1748046875, + "learning_rate": 9.99058391272947e-07, + "loss": 0.007, + "reward": 1.9591667652130127, + "reward_std": 0.13837946951389313, + "rewards/accuracy_reward": 0.7866666913032532, + "rewards/format_reward": 1.0, + "step": 1028, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 406.5500183105469, + "epoch": 0.01955716050555925, + "grad_norm": 2.4705107867070684, + "kl": 0.169921875, + "learning_rate": 9.990565590349418e-07, + "loss": 0.0068, + "reward": 2.1500000953674316, + "reward_std": 0.09946427494287491, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 1029, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 405.4750061035156, + "epoch": 0.019576166492445122, + "grad_norm": 1.7775593490466943, + "kl": 0.12109375, + "learning_rate": 9.990547250177134e-07, + "loss": 0.0048, + "reward": 1.681867003440857, + "reward_std": 0.10608299821615219, + "rewards/accuracy_reward": 0.6531171202659607, + "rewards/format_reward": 1.0, + "step": 1030, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 387.75, + "epoch": 0.01959517247933099, + "grad_norm": 1.7065115803590838, + "kl": 0.123046875, + "learning_rate": 9.990528892212687e-07, + "loss": 0.0049, + "reward": 1.53125, + "reward_std": 0.11581762135028839, + "rewards/accuracy_reward": 0.45000001788139343, + "rewards/format_reward": 1.0, + "step": 1031, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 388.1750183105469, + "epoch": 0.01961417846621686, + "grad_norm": 2.7571682695840933, + "kl": 0.26171875, + "learning_rate": 9.990510516456143e-07, + "loss": 0.0105, + "reward": 2.093621015548706, + "reward_std": 0.05897649750113487, + "rewards/accuracy_reward": 0.858620822429657, + "rewards/format_reward": 1.0, + "step": 1032, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 414.3000183105469, + "epoch": 0.019633184453102727, + "grad_norm": 3.9120173181912197, + "kl": 0.294921875, + "learning_rate": 9.990492122907566e-07, + "loss": 0.0118, + "reward": 2.351250171661377, + "reward_std": 0.036220937967300415, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1033, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 426.1499938964844, + "epoch": 0.019652190439988598, + "grad_norm": 2.3178758492640164, + "kl": 0.2451171875, + "learning_rate": 9.99047371156702e-07, + "loss": 0.0098, + "reward": 1.8461750745773315, + "reward_std": 0.30464860796928406, + "rewards/accuracy_reward": 0.78742516040802, + "rewards/format_reward": 0.925000011920929, + "step": 1034, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 384.7749938964844, + "epoch": 0.019671196426874465, + "grad_norm": 2.8619659164198388, + "kl": 0.1767578125, + "learning_rate": 9.990455282434572e-07, + "loss": 0.0071, + "reward": 1.6730884313583374, + "reward_std": 0.10481330007314682, + "rewards/accuracy_reward": 0.5693384408950806, + "rewards/format_reward": 1.0, + "step": 1035, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 394.1499938964844, + "epoch": 0.019690202413760335, + "grad_norm": 1.8127472887440206, + "kl": 0.26953125, + "learning_rate": 9.99043683551029e-07, + "loss": 0.0108, + "reward": 1.7037500143051147, + "reward_std": 0.10087790340185165, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 1.0, + "step": 1036, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 380.0249938964844, + "epoch": 0.019709208400646203, + "grad_norm": 2.0569088952554937, + "kl": 0.228515625, + "learning_rate": 9.990418370794239e-07, + "loss": 0.0092, + "reward": 1.6823810338974, + "reward_std": 0.06980056315660477, + "rewards/accuracy_reward": 0.6461309790611267, + "rewards/format_reward": 1.0, + "step": 1037, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 431.95001220703125, + "epoch": 0.019728214387532073, + "grad_norm": 2.5848940539338154, + "kl": 0.296875, + "learning_rate": 9.990399888286482e-07, + "loss": 0.0118, + "reward": 2.2649998664855957, + "reward_std": 0.10506659746170044, + "rewards/accuracy_reward": 0.9750000238418579, + "rewards/format_reward": 1.0, + "step": 1038, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 424.5, + "epoch": 0.01974722037441794, + "grad_norm": 2.7547516533454206, + "kl": 0.416015625, + "learning_rate": 9.990381387987086e-07, + "loss": 0.0166, + "reward": 1.741979956626892, + "reward_std": 0.41703328490257263, + "rewards/accuracy_reward": 0.7407300472259521, + "rewards/format_reward": 0.925000011920929, + "step": 1039, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 412.8500061035156, + "epoch": 0.01976622636130381, + "grad_norm": 2.6965592038809496, + "kl": 0.32421875, + "learning_rate": 9.990362869896119e-07, + "loss": 0.013, + "reward": 1.6772819757461548, + "reward_std": 0.5514366030693054, + "rewards/accuracy_reward": 0.7460320591926575, + "rewards/format_reward": 0.9000000357627869, + "step": 1040, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 449.9250183105469, + "epoch": 0.01978523234818968, + "grad_norm": 2.1342571278000584, + "kl": 0.283203125, + "learning_rate": 9.990344334013646e-07, + "loss": 0.0113, + "reward": 1.9636785984039307, + "reward_std": 0.29915428161621094, + "rewards/accuracy_reward": 0.7911784648895264, + "rewards/format_reward": 0.9750000238418579, + "step": 1041, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 407.1750183105469, + "epoch": 0.01980423833507555, + "grad_norm": 4.819846812951813, + "kl": 0.4921875, + "learning_rate": 9.99032578033973e-07, + "loss": 0.0197, + "reward": 1.6124862432479858, + "reward_std": 0.1234317421913147, + "rewards/accuracy_reward": 0.4924861490726471, + "rewards/format_reward": 0.9750000238418579, + "step": 1042, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 427.45001220703125, + "epoch": 0.019823244321961416, + "grad_norm": 1.3106107015759494, + "kl": 0.1455078125, + "learning_rate": 9.990307208874442e-07, + "loss": 0.0058, + "reward": 1.9005578756332397, + "reward_std": 0.3362431526184082, + "rewards/accuracy_reward": 0.839307963848114, + "rewards/format_reward": 0.9750000238418579, + "step": 1043, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 378.3500061035156, + "epoch": 0.019842250308847287, + "grad_norm": 3.034639156082052, + "kl": 0.375, + "learning_rate": 9.990288619617844e-07, + "loss": 0.015, + "reward": 1.4457237720489502, + "reward_std": 0.47860288619995117, + "rewards/accuracy_reward": 0.534473717212677, + "rewards/format_reward": 0.9000000357627869, + "step": 1044, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 418.25, + "epoch": 0.019861256295733157, + "grad_norm": 2.7615694846522914, + "kl": 0.57421875, + "learning_rate": 9.990270012570005e-07, + "loss": 0.023, + "reward": 1.8250000476837158, + "reward_std": 0.4990555942058563, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 0.925000011920929, + "step": 1045, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 439.4250183105469, + "epoch": 0.019880262282619025, + "grad_norm": 1.8898014206055522, + "kl": 0.322265625, + "learning_rate": 9.990251387730993e-07, + "loss": 0.0129, + "reward": 1.6197454929351807, + "reward_std": 0.22780810296535492, + "rewards/accuracy_reward": 0.5297453999519348, + "rewards/format_reward": 1.0, + "step": 1046, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 427.9750061035156, + "epoch": 0.019899268269504895, + "grad_norm": 5.50585571663118, + "kl": 0.5625, + "learning_rate": 9.990232745100869e-07, + "loss": 0.0225, + "reward": 1.5639976263046265, + "reward_std": 0.13982482254505157, + "rewards/accuracy_reward": 0.48024773597717285, + "rewards/format_reward": 0.9750000238418579, + "step": 1047, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 401.5, + "epoch": 0.019918274256390762, + "grad_norm": 2.7239170576983307, + "kl": 0.2890625, + "learning_rate": 9.990214084679705e-07, + "loss": 0.0116, + "reward": 1.4754761457443237, + "reward_std": 0.18877726793289185, + "rewards/accuracy_reward": 0.4154761731624603, + "rewards/format_reward": 0.9750000238418579, + "step": 1048, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 372.70001220703125, + "epoch": 0.019937280243276633, + "grad_norm": 2.047033943669166, + "kl": 0.33203125, + "learning_rate": 9.990195406467563e-07, + "loss": 0.0133, + "reward": 1.8841667175292969, + "reward_std": 0.04042382910847664, + "rewards/accuracy_reward": 0.7991666793823242, + "rewards/format_reward": 1.0, + "step": 1049, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 367.2250061035156, + "epoch": 0.0199562862301625, + "grad_norm": 2.7595244558349434, + "kl": 0.3515625, + "learning_rate": 9.990176710464512e-07, + "loss": 0.0141, + "reward": 1.8950246572494507, + "reward_std": 0.24025297164916992, + "rewards/accuracy_reward": 0.7262746691703796, + "rewards/format_reward": 1.0, + "step": 1050, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 419.5500183105469, + "epoch": 0.01997529221704837, + "grad_norm": 2.856255799647025, + "kl": 0.703125, + "learning_rate": 9.990157996670619e-07, + "loss": 0.028, + "reward": 1.7473324537277222, + "reward_std": 0.3358840048313141, + "rewards/accuracy_reward": 0.632332444190979, + "rewards/format_reward": 0.9750000238418579, + "step": 1051, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 443.2749938964844, + "epoch": 0.019994298203934238, + "grad_norm": 2.267571252125733, + "kl": 0.3125, + "learning_rate": 9.99013926508595e-07, + "loss": 0.0125, + "reward": 1.845000147819519, + "reward_std": 0.4743429124355316, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 0.949999988079071, + "step": 1052, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 375.8000183105469, + "epoch": 0.02001330419082011, + "grad_norm": 2.785642060901953, + "kl": 0.5234375, + "learning_rate": 9.990120515710572e-07, + "loss": 0.0209, + "reward": 1.6485786437988281, + "reward_std": 0.2498089075088501, + "rewards/accuracy_reward": 0.5498287081718445, + "rewards/format_reward": 1.0, + "step": 1053, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 396.95001220703125, + "epoch": 0.020032310177705976, + "grad_norm": 2.9415979783586526, + "kl": 0.435546875, + "learning_rate": 9.99010174854455e-07, + "loss": 0.0174, + "reward": 1.857224702835083, + "reward_std": 0.11635222285985947, + "rewards/accuracy_reward": 0.7159746289253235, + "rewards/format_reward": 0.9750000238418579, + "step": 1054, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 383.45001220703125, + "epoch": 0.020051316164591847, + "grad_norm": 3.452851265646793, + "kl": 0.5625, + "learning_rate": 9.990082963587954e-07, + "loss": 0.0226, + "reward": 1.5637085437774658, + "reward_std": 0.16516831517219543, + "rewards/accuracy_reward": 0.46870848536491394, + "rewards/format_reward": 1.0, + "step": 1055, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 434.9750061035156, + "epoch": 0.020070322151477714, + "grad_norm": 4.607732604558867, + "kl": 0.349609375, + "learning_rate": 9.990064160840848e-07, + "loss": 0.014, + "reward": 1.7404297590255737, + "reward_std": 0.39258453249931335, + "rewards/accuracy_reward": 0.6341797709465027, + "rewards/format_reward": 0.949999988079071, + "step": 1056, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 411.2250061035156, + "epoch": 0.020089328138363585, + "grad_norm": 1.7609440047723581, + "kl": 0.3046875, + "learning_rate": 9.990045340303302e-07, + "loss": 0.0121, + "reward": 1.5140036344528198, + "reward_std": 0.4501815736293793, + "rewards/accuracy_reward": 0.5265036225318909, + "rewards/format_reward": 0.925000011920929, + "step": 1057, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 391.3999938964844, + "epoch": 0.020108334125249455, + "grad_norm": 2.842884566334913, + "kl": 0.53515625, + "learning_rate": 9.990026501975382e-07, + "loss": 0.0215, + "reward": 1.821428656578064, + "reward_std": 0.2050504982471466, + "rewards/accuracy_reward": 0.7214285731315613, + "rewards/format_reward": 1.0, + "step": 1058, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 402.8999938964844, + "epoch": 0.020127340112135322, + "grad_norm": 2.9778563623613348, + "kl": 0.38671875, + "learning_rate": 9.990007645857153e-07, + "loss": 0.0155, + "reward": 2.0104236602783203, + "reward_std": 0.15403418242931366, + "rewards/accuracy_reward": 0.8979236483573914, + "rewards/format_reward": 1.0, + "step": 1059, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 458.20001220703125, + "epoch": 0.020146346099021193, + "grad_norm": 3.294990752610389, + "kl": 0.435546875, + "learning_rate": 9.989988771948685e-07, + "loss": 0.0174, + "reward": 2.356250047683716, + "reward_std": 0.03061860240995884, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1060, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 437.0500183105469, + "epoch": 0.02016535208590706, + "grad_norm": 3.019277322578053, + "kl": 0.58984375, + "learning_rate": 9.989969880250044e-07, + "loss": 0.0235, + "reward": 1.6652908325195312, + "reward_std": 0.24903574585914612, + "rewards/accuracy_reward": 0.5802907943725586, + "rewards/format_reward": 0.9750000238418579, + "step": 1061, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 402.07501220703125, + "epoch": 0.02018435807279293, + "grad_norm": 2.6544577749862595, + "kl": 0.49609375, + "learning_rate": 9.989950970761298e-07, + "loss": 0.0198, + "reward": 1.5977493524551392, + "reward_std": 0.3752979040145874, + "rewards/accuracy_reward": 0.5352492928504944, + "rewards/format_reward": 0.9750000238418579, + "step": 1062, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 456.8000183105469, + "epoch": 0.020203364059678798, + "grad_norm": 2.35222328332762, + "kl": 0.6796875, + "learning_rate": 9.989932043482515e-07, + "loss": 0.0271, + "reward": 1.3567017316818237, + "reward_std": 0.4286865293979645, + "rewards/accuracy_reward": 0.3717016279697418, + "rewards/format_reward": 0.949999988079071, + "step": 1063, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 441.125, + "epoch": 0.02022237004656467, + "grad_norm": 4.283698679859192, + "kl": 0.78125, + "learning_rate": 9.989913098413758e-07, + "loss": 0.0312, + "reward": 1.4906439781188965, + "reward_std": 0.1723892241716385, + "rewards/accuracy_reward": 0.44439396262168884, + "rewards/format_reward": 1.0, + "step": 1064, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 531.2750244140625, + "epoch": 0.020241376033450536, + "grad_norm": 4.876568017614198, + "kl": 1.5078125, + "learning_rate": 9.989894135555102e-07, + "loss": 0.0602, + "reward": 1.5412499904632568, + "reward_std": 1.0078068971633911, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 0.75, + "step": 1065, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 541.5499877929688, + "epoch": 0.020260382020336407, + "grad_norm": 3.2720555968355365, + "kl": 1.1875, + "learning_rate": 9.989875154906608e-07, + "loss": 0.0474, + "reward": 1.0928294658660889, + "reward_std": 0.7177988886833191, + "rewards/accuracy_reward": 0.40657949447631836, + "rewards/format_reward": 0.699999988079071, + "step": 1066, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 457.6750183105469, + "epoch": 0.020279388007222274, + "grad_norm": 4.448078878169397, + "kl": 0.97265625, + "learning_rate": 9.98985615646835e-07, + "loss": 0.0388, + "reward": 1.4606159925460815, + "reward_std": 0.2986356317996979, + "rewards/accuracy_reward": 0.5743657946586609, + "rewards/format_reward": 0.925000011920929, + "step": 1067, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 517.4500122070312, + "epoch": 0.020298393994108144, + "grad_norm": 2.363348867384236, + "kl": 0.2734375, + "learning_rate": 9.989837140240389e-07, + "loss": 0.011, + "reward": 1.1824373006820679, + "reward_std": 0.41314831376075745, + "rewards/accuracy_reward": 0.36243730783462524, + "rewards/format_reward": 0.824999988079071, + "step": 1068, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 536.5499877929688, + "epoch": 0.02031739998099401, + "grad_norm": 5.134794222810242, + "kl": 1.1796875, + "learning_rate": 9.989818106222795e-07, + "loss": 0.0471, + "reward": 1.1349999904632568, + "reward_std": 0.7050307393074036, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.800000011920929, + "step": 1069, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 439.0249938964844, + "epoch": 0.020336405967879882, + "grad_norm": 2.3327486514463343, + "kl": 0.65625, + "learning_rate": 9.98979905441564e-07, + "loss": 0.0261, + "reward": 1.9594318866729736, + "reward_std": 0.20904293656349182, + "rewards/accuracy_reward": 0.9306818246841431, + "rewards/format_reward": 0.9750000238418579, + "step": 1070, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 475.4750061035156, + "epoch": 0.020355411954765753, + "grad_norm": 2.406752671101741, + "kl": 0.396484375, + "learning_rate": 9.989779984818985e-07, + "loss": 0.0158, + "reward": 1.6237499713897705, + "reward_std": 0.1582767367362976, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9750000238418579, + "step": 1071, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 496.4750061035156, + "epoch": 0.02037441794165162, + "grad_norm": 3.1892577937015436, + "kl": 0.41796875, + "learning_rate": 9.989760897432903e-07, + "loss": 0.0168, + "reward": 1.641535997390747, + "reward_std": 0.30127832293510437, + "rewards/accuracy_reward": 0.5652860999107361, + "rewards/format_reward": 0.9750000238418579, + "step": 1072, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 510.8000183105469, + "epoch": 0.02039342392853749, + "grad_norm": 1.5675129417411724, + "kl": 0.20703125, + "learning_rate": 9.989741792257463e-07, + "loss": 0.0083, + "reward": 1.8320114612579346, + "reward_std": 0.1766887903213501, + "rewards/accuracy_reward": 0.7782613635063171, + "rewards/format_reward": 1.0, + "step": 1073, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 413.3999938964844, + "epoch": 0.020412429915423358, + "grad_norm": 1.7064393938049844, + "kl": 0.25, + "learning_rate": 9.98972266929273e-07, + "loss": 0.01, + "reward": 1.8130369186401367, + "reward_std": 0.14584560692310333, + "rewards/accuracy_reward": 0.7942870259284973, + "rewards/format_reward": 1.0, + "step": 1074, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 461.375, + "epoch": 0.02043143590230923, + "grad_norm": 4.7527380552141265, + "kl": 0.265625, + "learning_rate": 9.98970352853877e-07, + "loss": 0.0106, + "reward": 1.9015886783599854, + "reward_std": 0.1914331018924713, + "rewards/accuracy_reward": 0.8165885806083679, + "rewards/format_reward": 0.9750000238418579, + "step": 1075, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 497.0, + "epoch": 0.020450441889195096, + "grad_norm": 4.212975933640076, + "kl": 0.2412109375, + "learning_rate": 9.989684369995657e-07, + "loss": 0.0097, + "reward": 2.0439999103546143, + "reward_std": 0.11628691107034683, + "rewards/accuracy_reward": 0.9739999771118164, + "rewards/format_reward": 1.0, + "step": 1076, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 490.9250183105469, + "epoch": 0.020469447876080966, + "grad_norm": 1.1425370556000116, + "kl": 0.11083984375, + "learning_rate": 9.989665193663456e-07, + "loss": 0.0044, + "reward": 1.7450001239776611, + "reward_std": 0.13763807713985443, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 1077, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 400.9250183105469, + "epoch": 0.020488453862966834, + "grad_norm": 24.909327096365107, + "kl": 0.10595703125, + "learning_rate": 9.989645999542236e-07, + "loss": 0.0042, + "reward": 1.8112499713897705, + "reward_std": 0.25130271911621094, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 1078, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 477.6000061035156, + "epoch": 0.020507459849852704, + "grad_norm": 1.6378639672917517, + "kl": 0.1064453125, + "learning_rate": 9.989626787632066e-07, + "loss": 0.0043, + "reward": 1.6247367858886719, + "reward_std": 0.23722195625305176, + "rewards/accuracy_reward": 0.5822367668151855, + "rewards/format_reward": 1.0, + "step": 1079, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 466.07501220703125, + "epoch": 0.02052646583673857, + "grad_norm": 2.131040517194741, + "kl": 0.07763671875, + "learning_rate": 9.989607557933011e-07, + "loss": 0.0031, + "reward": 1.7458094358444214, + "reward_std": 0.14009727537631989, + "rewards/accuracy_reward": 0.647059440612793, + "rewards/format_reward": 1.0, + "step": 1080, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 554.3250122070312, + "epoch": 0.020545471823624442, + "grad_norm": 3.6728755784435, + "kl": 0.1240234375, + "learning_rate": 9.989588310445145e-07, + "loss": 0.005, + "reward": 1.5125000476837158, + "reward_std": 0.265155553817749, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 1081, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 519.7999877929688, + "epoch": 0.02056447781051031, + "grad_norm": 1.9976021894789202, + "kl": 0.1201171875, + "learning_rate": 9.989569045168534e-07, + "loss": 0.0048, + "reward": 1.806249976158142, + "reward_std": 0.4982987344264984, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.949999988079071, + "step": 1082, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 550.8500366210938, + "epoch": 0.02058348379739618, + "grad_norm": 11.867139146001337, + "kl": 0.1328125, + "learning_rate": 9.989549762103247e-07, + "loss": 0.0053, + "reward": 1.0172616243362427, + "reward_std": 0.6514346599578857, + "rewards/accuracy_reward": 0.35351163148880005, + "rewards/format_reward": 0.7250000238418579, + "step": 1083, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 484.3500061035156, + "epoch": 0.020602489784282047, + "grad_norm": 1.5716586463126754, + "kl": 0.11865234375, + "learning_rate": 9.98953046124935e-07, + "loss": 0.0047, + "reward": 1.772499918937683, + "reward_std": 0.54538893699646, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 0.8500000238418579, + "step": 1084, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 439.3500061035156, + "epoch": 0.020621495771167918, + "grad_norm": 1.4474047969208617, + "kl": 0.080078125, + "learning_rate": 9.989511142606918e-07, + "loss": 0.0032, + "reward": 1.5293375253677368, + "reward_std": 0.44812893867492676, + "rewards/accuracy_reward": 0.5855875015258789, + "rewards/format_reward": 0.9000000357627869, + "step": 1085, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 387.1499938964844, + "epoch": 0.02064050175805379, + "grad_norm": 1.5988116883734946, + "kl": 0.08984375, + "learning_rate": 9.989491806176011e-07, + "loss": 0.0036, + "reward": 1.8309376239776611, + "reward_std": 0.02562147192656994, + "rewards/accuracy_reward": 0.785937488079071, + "rewards/format_reward": 1.0, + "step": 1086, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 393.75, + "epoch": 0.020659507744939656, + "grad_norm": 2.1259623459317347, + "kl": 0.103515625, + "learning_rate": 9.989472451956706e-07, + "loss": 0.0041, + "reward": 2.0172619819641113, + "reward_std": 0.11930018663406372, + "rewards/accuracy_reward": 0.7922618985176086, + "rewards/format_reward": 1.0, + "step": 1087, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 395.70001220703125, + "epoch": 0.020678513731825526, + "grad_norm": 1.465230605082796, + "kl": 0.087890625, + "learning_rate": 9.989453079949071e-07, + "loss": 0.0035, + "reward": 1.6762501001358032, + "reward_std": 0.3955805003643036, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 1088, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 394.0249938964844, + "epoch": 0.020697519718711394, + "grad_norm": 1.9551105424298225, + "kl": 0.08203125, + "learning_rate": 9.98943369015317e-07, + "loss": 0.0033, + "reward": 1.9587501287460327, + "reward_std": 0.23802022635936737, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 1.0, + "step": 1089, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 380.9750061035156, + "epoch": 0.020716525705597264, + "grad_norm": 1.8469513508132582, + "kl": 0.0869140625, + "learning_rate": 9.989414282569076e-07, + "loss": 0.0035, + "reward": 1.7670704126358032, + "reward_std": 0.08387457579374313, + "rewards/accuracy_reward": 0.7258204817771912, + "rewards/format_reward": 1.0, + "step": 1090, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 431.75, + "epoch": 0.02073553169248313, + "grad_norm": 1.920501572608846, + "kl": 0.0693359375, + "learning_rate": 9.989394857196858e-07, + "loss": 0.0028, + "reward": 1.5712499618530273, + "reward_std": 0.3352877199649811, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 0.9000000357627869, + "step": 1091, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 383.0249938964844, + "epoch": 0.020754537679369002, + "grad_norm": 2.443963178631602, + "kl": 0.11328125, + "learning_rate": 9.989375414036584e-07, + "loss": 0.0045, + "reward": 1.7179073095321655, + "reward_std": 0.24341915547847748, + "rewards/accuracy_reward": 0.640407383441925, + "rewards/format_reward": 1.0, + "step": 1092, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 445.1000061035156, + "epoch": 0.02077354366625487, + "grad_norm": 1.7723624196992813, + "kl": 0.06884765625, + "learning_rate": 9.989355953088326e-07, + "loss": 0.0028, + "reward": 1.7637499570846558, + "reward_std": 0.3484143614768982, + "rewards/accuracy_reward": 0.762499988079071, + "rewards/format_reward": 0.875, + "step": 1093, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 413.25, + "epoch": 0.02079254965314074, + "grad_norm": 1.4968608366270706, + "kl": 0.080078125, + "learning_rate": 9.989336474352151e-07, + "loss": 0.0032, + "reward": 2.1112499237060547, + "reward_std": 0.14834749698638916, + "rewards/accuracy_reward": 0.9000000357627869, + "rewards/format_reward": 1.0, + "step": 1094, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 432.375, + "epoch": 0.020811555640026607, + "grad_norm": 1.7153893013050203, + "kl": 0.0830078125, + "learning_rate": 9.989316977828126e-07, + "loss": 0.0033, + "reward": 1.9839897155761719, + "reward_std": 0.1398380547761917, + "rewards/accuracy_reward": 0.7677397131919861, + "rewards/format_reward": 1.0, + "step": 1095, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 443.25, + "epoch": 0.020830561626912478, + "grad_norm": 1.5051392319753347, + "kl": 0.08740234375, + "learning_rate": 9.989297463516326e-07, + "loss": 0.0035, + "reward": 1.9412498474121094, + "reward_std": 0.13574114441871643, + "rewards/accuracy_reward": 0.7250000238418579, + "rewards/format_reward": 1.0, + "step": 1096, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 423.7749938964844, + "epoch": 0.020849567613798345, + "grad_norm": 1.9077565998329153, + "kl": 0.10009765625, + "learning_rate": 9.989277931416817e-07, + "loss": 0.004, + "reward": 1.8943119049072266, + "reward_std": 0.22906899452209473, + "rewards/accuracy_reward": 0.7243117690086365, + "rewards/format_reward": 1.0, + "step": 1097, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 459.8000183105469, + "epoch": 0.020868573600684216, + "grad_norm": 1.8276528565109738, + "kl": 0.11865234375, + "learning_rate": 9.98925838152967e-07, + "loss": 0.0047, + "reward": 1.9516490697860718, + "reward_std": 0.19768182933330536, + "rewards/accuracy_reward": 0.8328990340232849, + "rewards/format_reward": 0.9750000238418579, + "step": 1098, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 497.45001220703125, + "epoch": 0.020887579587570086, + "grad_norm": 2.2909597620079647, + "kl": 0.091796875, + "learning_rate": 9.989238813854953e-07, + "loss": 0.0037, + "reward": 1.4409477710723877, + "reward_std": 0.5140753388404846, + "rewards/accuracy_reward": 0.5121976733207703, + "rewards/format_reward": 0.9000000357627869, + "step": 1099, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 492.82501220703125, + "epoch": 0.020906585574455953, + "grad_norm": 2.2944242066760476, + "kl": 0.1220703125, + "learning_rate": 9.989219228392737e-07, + "loss": 0.0049, + "reward": 1.9307613372802734, + "reward_std": 0.20309896767139435, + "rewards/accuracy_reward": 0.8270112872123718, + "rewards/format_reward": 1.0, + "step": 1100, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 464.45001220703125, + "epoch": 0.020925591561341824, + "grad_norm": 2.8809098115632863, + "kl": 0.126953125, + "learning_rate": 9.989199625143094e-07, + "loss": 0.0051, + "reward": 1.6325000524520874, + "reward_std": 0.40920862555503845, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 0.9750000238418579, + "step": 1101, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 501.0, + "epoch": 0.02094459754822769, + "grad_norm": 1.8804940928958014, + "kl": 0.126953125, + "learning_rate": 9.989180004106091e-07, + "loss": 0.0051, + "reward": 1.4704875946044922, + "reward_std": 0.42537155747413635, + "rewards/accuracy_reward": 0.490487664937973, + "rewards/format_reward": 0.9000000357627869, + "step": 1102, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 507.625, + "epoch": 0.020963603535113562, + "grad_norm": 2.7341625579862483, + "kl": 0.1484375, + "learning_rate": 9.989160365281797e-07, + "loss": 0.0059, + "reward": 1.8605848550796509, + "reward_std": 0.1214105635881424, + "rewards/accuracy_reward": 0.7743348479270935, + "rewards/format_reward": 1.0, + "step": 1103, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 464.375, + "epoch": 0.02098260952199943, + "grad_norm": 17.81933693019855, + "kl": 0.142578125, + "learning_rate": 9.989140708670285e-07, + "loss": 0.0057, + "reward": 1.9149999618530273, + "reward_std": 0.06461530178785324, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 1104, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 493.3500061035156, + "epoch": 0.0210016155088853, + "grad_norm": 3.9765937206582684, + "kl": 0.1884765625, + "learning_rate": 9.989121034271626e-07, + "loss": 0.0075, + "reward": 1.8371429443359375, + "reward_std": 0.4592036306858063, + "rewards/accuracy_reward": 0.8571428656578064, + "rewards/format_reward": 0.875, + "step": 1105, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 482.3500061035156, + "epoch": 0.021020621495771167, + "grad_norm": 3.3013987507746707, + "kl": 0.1611328125, + "learning_rate": 9.989101342085884e-07, + "loss": 0.0064, + "reward": 1.9663803577423096, + "reward_std": 0.14365266263484955, + "rewards/accuracy_reward": 0.8563804626464844, + "rewards/format_reward": 0.9750000238418579, + "step": 1106, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 474.1000061035156, + "epoch": 0.021039627482657038, + "grad_norm": 2.1701200783763728, + "kl": 0.1845703125, + "learning_rate": 9.989081632113135e-07, + "loss": 0.0074, + "reward": 1.5840139389038086, + "reward_std": 0.16018573939800262, + "rewards/accuracy_reward": 0.5752639174461365, + "rewards/format_reward": 1.0, + "step": 1107, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 467.8500061035156, + "epoch": 0.021058633469542905, + "grad_norm": 3.1589717624970937, + "kl": 0.185546875, + "learning_rate": 9.989061904353447e-07, + "loss": 0.0074, + "reward": 1.340881586074829, + "reward_std": 0.19195245206356049, + "rewards/accuracy_reward": 0.35463154315948486, + "rewards/format_reward": 0.949999988079071, + "step": 1108, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 482.6000061035156, + "epoch": 0.021077639456428775, + "grad_norm": 2.8074066428685516, + "kl": 0.259765625, + "learning_rate": 9.98904215880689e-07, + "loss": 0.0104, + "reward": 1.7889518737792969, + "reward_std": 0.2965729534626007, + "rewards/accuracy_reward": 0.6464519500732422, + "rewards/format_reward": 0.9750000238418579, + "step": 1109, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 447.8999938964844, + "epoch": 0.021096645443314643, + "grad_norm": 2.530398401920556, + "kl": 0.1845703125, + "learning_rate": 9.989022395473538e-07, + "loss": 0.0074, + "reward": 1.8787498474121094, + "reward_std": 0.42936381697654724, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 0.949999988079071, + "step": 1110, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 458.45001220703125, + "epoch": 0.021115651430200513, + "grad_norm": 4.252170277531869, + "kl": 0.18359375, + "learning_rate": 9.989002614353456e-07, + "loss": 0.0074, + "reward": 2.06000018119812, + "reward_std": 0.15157610177993774, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 1.0, + "step": 1111, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 426.1750183105469, + "epoch": 0.021134657417086384, + "grad_norm": 2.8291493048761858, + "kl": 0.275390625, + "learning_rate": 9.988982815446717e-07, + "loss": 0.011, + "reward": 1.8688366413116455, + "reward_std": 0.10286872833967209, + "rewards/accuracy_reward": 0.708836555480957, + "rewards/format_reward": 1.0, + "step": 1112, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 480.6750183105469, + "epoch": 0.02115366340397225, + "grad_norm": 2.656584988188495, + "kl": 0.1923828125, + "learning_rate": 9.988962998753392e-07, + "loss": 0.0077, + "reward": 1.5665762424468994, + "reward_std": 0.2611367404460907, + "rewards/accuracy_reward": 0.572826087474823, + "rewards/format_reward": 0.949999988079071, + "step": 1113, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 469.0500183105469, + "epoch": 0.021172669390858122, + "grad_norm": 2.29502916119169, + "kl": 0.3984375, + "learning_rate": 9.988943164273552e-07, + "loss": 0.0159, + "reward": 2.1050591468811035, + "reward_std": 0.22429104149341583, + "rewards/accuracy_reward": 0.8375590443611145, + "rewards/format_reward": 0.949999988079071, + "step": 1114, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 441.6750183105469, + "epoch": 0.02119167537774399, + "grad_norm": 2.2118718515634046, + "kl": 0.494140625, + "learning_rate": 9.988923312007268e-07, + "loss": 0.0198, + "reward": 1.902500033378601, + "reward_std": 0.22653710842132568, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 1115, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 470.32501220703125, + "epoch": 0.02121068136462986, + "grad_norm": 2.5601758006216837, + "kl": 0.458984375, + "learning_rate": 9.98890344195461e-07, + "loss": 0.0184, + "reward": 1.5993732213974, + "reward_std": 0.42046108841896057, + "rewards/accuracy_reward": 0.49437323212623596, + "rewards/format_reward": 0.949999988079071, + "step": 1116, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 410.4750061035156, + "epoch": 0.021229687351515727, + "grad_norm": 3.114194843852156, + "kl": 0.482421875, + "learning_rate": 9.988883554115645e-07, + "loss": 0.0193, + "reward": 1.6028660535812378, + "reward_std": 0.07979714870452881, + "rewards/accuracy_reward": 0.4991160035133362, + "rewards/format_reward": 1.0, + "step": 1117, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 457.20001220703125, + "epoch": 0.021248693338401597, + "grad_norm": 2.002613964714544, + "kl": 0.28125, + "learning_rate": 9.98886364849045e-07, + "loss": 0.0112, + "reward": 1.5487500429153442, + "reward_std": 0.4187520146369934, + "rewards/accuracy_reward": 0.5250000357627869, + "rewards/format_reward": 0.9750000238418579, + "step": 1118, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 441.6750183105469, + "epoch": 0.021267699325287465, + "grad_norm": 3.274407931048261, + "kl": 0.484375, + "learning_rate": 9.988843725079095e-07, + "loss": 0.0194, + "reward": 1.6832802295684814, + "reward_std": 0.36162883043289185, + "rewards/accuracy_reward": 0.603280246257782, + "rewards/format_reward": 0.949999988079071, + "step": 1119, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 418.8999938964844, + "epoch": 0.021286705312173335, + "grad_norm": 2.3955242037890514, + "kl": 0.373046875, + "learning_rate": 9.988823783881648e-07, + "loss": 0.0149, + "reward": 1.8356670141220093, + "reward_std": 0.09992270916700363, + "rewards/accuracy_reward": 0.6706671118736267, + "rewards/format_reward": 1.0, + "step": 1120, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 440.75, + "epoch": 0.021305711299059202, + "grad_norm": 3.742041206355109, + "kl": 0.365234375, + "learning_rate": 9.988803824898183e-07, + "loss": 0.0146, + "reward": 1.7740919589996338, + "reward_std": 0.16810846328735352, + "rewards/accuracy_reward": 0.6403418779373169, + "rewards/format_reward": 0.9750000238418579, + "step": 1121, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 428.3000183105469, + "epoch": 0.021324717285945073, + "grad_norm": 3.0274629931569965, + "kl": 0.369140625, + "learning_rate": 9.988783848128768e-07, + "loss": 0.0148, + "reward": 1.9174998998641968, + "reward_std": 0.25878721475601196, + "rewards/accuracy_reward": 0.7750000357627869, + "rewards/format_reward": 1.0, + "step": 1122, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 418.5, + "epoch": 0.02134372327283094, + "grad_norm": 3.0556221439357283, + "kl": 0.609375, + "learning_rate": 9.988763853573476e-07, + "loss": 0.0243, + "reward": 2.1110386848449707, + "reward_std": 0.3558754026889801, + "rewards/accuracy_reward": 0.859788715839386, + "rewards/format_reward": 0.9750000238418579, + "step": 1123, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 426.6499938964844, + "epoch": 0.02136272925971681, + "grad_norm": 2.823012185745552, + "kl": 0.5078125, + "learning_rate": 9.98874384123238e-07, + "loss": 0.0204, + "reward": 1.989999771118164, + "reward_std": 0.1035362109541893, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 0.9750000238418579, + "step": 1124, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 422.3999938964844, + "epoch": 0.021381735246602678, + "grad_norm": 2.8385232890125316, + "kl": 0.484375, + "learning_rate": 9.98872381110555e-07, + "loss": 0.0194, + "reward": 1.7665491104125977, + "reward_std": 0.1389819234609604, + "rewards/accuracy_reward": 0.6152991056442261, + "rewards/format_reward": 1.0, + "step": 1125, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 474.70001220703125, + "epoch": 0.02140074123348855, + "grad_norm": 2.06715848172825, + "kl": 0.33203125, + "learning_rate": 9.988703763193054e-07, + "loss": 0.0133, + "reward": 1.8977149724960327, + "reward_std": 0.4486086964607239, + "rewards/accuracy_reward": 0.865215003490448, + "rewards/format_reward": 0.925000011920929, + "step": 1126, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 434.625, + "epoch": 0.02141974722037442, + "grad_norm": 3.558806967338625, + "kl": 0.32421875, + "learning_rate": 9.98868369749497e-07, + "loss": 0.0129, + "reward": 2.0890328884124756, + "reward_std": 0.03455172851681709, + "rewards/accuracy_reward": 0.8527830243110657, + "rewards/format_reward": 1.0, + "step": 1127, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 453.07501220703125, + "epoch": 0.021438753207260287, + "grad_norm": 2.194949637184403, + "kl": 0.26953125, + "learning_rate": 9.988663614011363e-07, + "loss": 0.0108, + "reward": 2.0461175441741943, + "reward_std": 0.04294559359550476, + "rewards/accuracy_reward": 0.8898676037788391, + "rewards/format_reward": 1.0, + "step": 1128, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 448.6750183105469, + "epoch": 0.021457759194146157, + "grad_norm": 2.5174553767161276, + "kl": 0.30078125, + "learning_rate": 9.98864351274231e-07, + "loss": 0.012, + "reward": 1.773172378540039, + "reward_std": 0.3384849727153778, + "rewards/accuracy_reward": 0.7331724166870117, + "rewards/format_reward": 0.949999988079071, + "step": 1129, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 469.7250061035156, + "epoch": 0.021476765181032025, + "grad_norm": 1.4912590548255154, + "kl": 0.22265625, + "learning_rate": 9.98862339368788e-07, + "loss": 0.0089, + "reward": 1.8037500381469727, + "reward_std": 0.22611112892627716, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 0.949999988079071, + "step": 1130, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 479.3999938964844, + "epoch": 0.021495771167917895, + "grad_norm": 2.105907871858483, + "kl": 0.283203125, + "learning_rate": 9.988603256848143e-07, + "loss": 0.0113, + "reward": 1.7814123630523682, + "reward_std": 0.2664862871170044, + "rewards/accuracy_reward": 0.6614122986793518, + "rewards/format_reward": 1.0, + "step": 1131, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 423.45001220703125, + "epoch": 0.021514777154803762, + "grad_norm": 2.5390173575463844, + "kl": 0.3125, + "learning_rate": 9.988583102223176e-07, + "loss": 0.0125, + "reward": 1.850250244140625, + "reward_std": 0.04399308189749718, + "rewards/accuracy_reward": 0.6902503371238708, + "rewards/format_reward": 1.0, + "step": 1132, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 404.5249938964844, + "epoch": 0.021533783141689633, + "grad_norm": 2.281713118447679, + "kl": 0.66015625, + "learning_rate": 9.988562929813045e-07, + "loss": 0.0265, + "reward": 1.479461669921875, + "reward_std": 0.21519115567207336, + "rewards/accuracy_reward": 0.3582117557525635, + "rewards/format_reward": 0.9750000238418579, + "step": 1133, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 457.1000061035156, + "epoch": 0.0215527891285755, + "grad_norm": 2.1119535085806445, + "kl": 0.33984375, + "learning_rate": 9.988542739617827e-07, + "loss": 0.0136, + "reward": 1.6164772510528564, + "reward_std": 0.21216817200183868, + "rewards/accuracy_reward": 0.5977272987365723, + "rewards/format_reward": 0.949999988079071, + "step": 1134, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 468.70001220703125, + "epoch": 0.02157179511546137, + "grad_norm": 2.4958872968807695, + "kl": 0.396484375, + "learning_rate": 9.98852253163759e-07, + "loss": 0.0159, + "reward": 2.032238721847534, + "reward_std": 0.1964418888092041, + "rewards/accuracy_reward": 0.8522385954856873, + "rewards/format_reward": 0.9750000238418579, + "step": 1135, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 456.6000061035156, + "epoch": 0.021590801102347238, + "grad_norm": 3.3600806794631204, + "kl": 0.609375, + "learning_rate": 9.98850230587241e-07, + "loss": 0.0244, + "reward": 1.694771647453308, + "reward_std": 0.3171394169330597, + "rewards/accuracy_reward": 0.6385214924812317, + "rewards/format_reward": 0.925000011920929, + "step": 1136, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 440.4750061035156, + "epoch": 0.02160980708923311, + "grad_norm": 3.0949135162270345, + "kl": 0.56640625, + "learning_rate": 9.988482062322355e-07, + "loss": 0.0226, + "reward": 1.4600000381469727, + "reward_std": 0.47220584750175476, + "rewards/accuracy_reward": 0.4000000059604645, + "rewards/format_reward": 0.949999988079071, + "step": 1137, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 451.20001220703125, + "epoch": 0.021628813076118976, + "grad_norm": 2.452416135525228, + "kl": 0.8515625, + "learning_rate": 9.988461800987497e-07, + "loss": 0.0341, + "reward": 1.3278422355651855, + "reward_std": 0.27286624908447266, + "rewards/accuracy_reward": 0.35909223556518555, + "rewards/format_reward": 0.9750000238418579, + "step": 1138, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 449.20001220703125, + "epoch": 0.021647819063004847, + "grad_norm": 2.1188206224876818, + "kl": 0.58203125, + "learning_rate": 9.988441521867915e-07, + "loss": 0.0232, + "reward": 1.7224998474121094, + "reward_std": 0.4373147189617157, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 0.949999988079071, + "step": 1139, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 478.9750061035156, + "epoch": 0.021666825049890717, + "grad_norm": 2.2507479390567324, + "kl": 0.53125, + "learning_rate": 9.988421224963672e-07, + "loss": 0.0213, + "reward": 1.5777565240859985, + "reward_std": 0.43320542573928833, + "rewards/accuracy_reward": 0.6177565455436707, + "rewards/format_reward": 0.925000011920929, + "step": 1140, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 454.3000183105469, + "epoch": 0.021685831036776584, + "grad_norm": 5.72497736751786, + "kl": 0.7890625, + "learning_rate": 9.988400910274849e-07, + "loss": 0.0316, + "reward": 1.3049999475479126, + "reward_std": 0.622310221195221, + "rewards/accuracy_reward": 0.4000000059604645, + "rewards/format_reward": 0.9000000357627869, + "step": 1141, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 442.9750061035156, + "epoch": 0.021704837023662455, + "grad_norm": 2.3336740721342903, + "kl": 0.84375, + "learning_rate": 9.98838057780151e-07, + "loss": 0.0337, + "reward": 1.6687500476837158, + "reward_std": 0.5652045011520386, + "rewards/accuracy_reward": 0.675000011920929, + "rewards/format_reward": 0.875, + "step": 1142, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 447.25, + "epoch": 0.021723843010548322, + "grad_norm": 2.348345711262493, + "kl": 0.625, + "learning_rate": 9.988360227543734e-07, + "loss": 0.0249, + "reward": 1.9391344785690308, + "reward_std": 0.4283779263496399, + "rewards/accuracy_reward": 0.7903845906257629, + "rewards/format_reward": 0.949999988079071, + "step": 1143, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 430.6750183105469, + "epoch": 0.021742848997434193, + "grad_norm": 1.6655662038556005, + "kl": 0.3671875, + "learning_rate": 9.98833985950159e-07, + "loss": 0.0147, + "reward": 1.9451178312301636, + "reward_std": 0.05423903465270996, + "rewards/accuracy_reward": 0.8988677859306335, + "rewards/format_reward": 1.0, + "step": 1144, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 458.25, + "epoch": 0.02176185498432006, + "grad_norm": 3.536788262989706, + "kl": 1.265625, + "learning_rate": 9.988319473675153e-07, + "loss": 0.0507, + "reward": 1.3060330152511597, + "reward_std": 0.5574919581413269, + "rewards/accuracy_reward": 0.3922830820083618, + "rewards/format_reward": 0.875, + "step": 1145, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 446.7749938964844, + "epoch": 0.02178086097120593, + "grad_norm": 5.042838672476203, + "kl": 1.671875, + "learning_rate": 9.988299070064496e-07, + "loss": 0.0667, + "reward": 1.8337500095367432, + "reward_std": 0.5980350375175476, + "rewards/accuracy_reward": 0.8500000238418579, + "rewards/format_reward": 0.8500000238418579, + "step": 1146, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 447.3500061035156, + "epoch": 0.021799866958091798, + "grad_norm": 4.739213882568442, + "kl": 3.078125, + "learning_rate": 9.988278648669689e-07, + "loss": 0.1235, + "reward": 0.9837499856948853, + "reward_std": 0.7462717890739441, + "rewards/accuracy_reward": 0.42500001192092896, + "rewards/format_reward": 0.45000001788139343, + "step": 1147, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 406.3000183105469, + "epoch": 0.02181887294497767, + "grad_norm": 6.881088180730933, + "kl": 4.21875, + "learning_rate": 9.988258209490807e-07, + "loss": 0.1693, + "reward": 0.7211111187934875, + "reward_std": 0.8187532424926758, + "rewards/accuracy_reward": 0.28611111640930176, + "rewards/format_reward": 0.4000000059604645, + "step": 1148, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 430.7749938964844, + "epoch": 0.021837878931863536, + "grad_norm": 7.867300199676618, + "kl": 4.21875, + "learning_rate": 9.988237752527921e-07, + "loss": 0.1685, + "reward": 0.8793601393699646, + "reward_std": 0.9862723350524902, + "rewards/accuracy_reward": 0.4081101417541504, + "rewards/format_reward": 0.4749999940395355, + "step": 1149, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 399.3999938964844, + "epoch": 0.021856884918749406, + "grad_norm": 6.863023704504246, + "kl": 4.03125, + "learning_rate": 9.988217277781105e-07, + "loss": 0.1609, + "reward": 1.4275000095367432, + "reward_std": 0.8714839816093445, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.699999988079071, + "step": 1150, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 416.4250183105469, + "epoch": 0.021875890905635274, + "grad_norm": 2.6568467453200943, + "kl": 2.390625, + "learning_rate": 9.988196785250432e-07, + "loss": 0.0957, + "reward": 1.6962499618530273, + "reward_std": 0.6650688052177429, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.875, + "step": 1151, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 384.2749938964844, + "epoch": 0.021894896892521144, + "grad_norm": 2.91787701554552, + "kl": 2.0625, + "learning_rate": 9.988176274935976e-07, + "loss": 0.0823, + "reward": 1.8666852712631226, + "reward_std": 0.32089829444885254, + "rewards/accuracy_reward": 0.7391854524612427, + "rewards/format_reward": 0.925000011920929, + "step": 1152, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 440.0, + "epoch": 0.02191390287940701, + "grad_norm": 6.356761104993911, + "kl": 0.48046875, + "learning_rate": 9.98815574683781e-07, + "loss": 0.0192, + "reward": 1.890345573425293, + "reward_std": 0.12729094922542572, + "rewards/accuracy_reward": 0.6853455305099487, + "rewards/format_reward": 1.0, + "step": 1153, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.8, + "all_wrong": 0.0, + "completion_length": 413.8999938964844, + "epoch": 0.021932908866292882, + "grad_norm": 1.9157296553068588, + "kl": 0.71875, + "learning_rate": 9.988135200956004e-07, + "loss": 0.0287, + "reward": 2.0818288326263428, + "reward_std": 0.050610002130270004, + "rewards/accuracy_reward": 0.910578727722168, + "rewards/format_reward": 1.0, + "step": 1154, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 421.32501220703125, + "epoch": 0.021951914853178753, + "grad_norm": 3.4329642624347874, + "kl": 1.3984375, + "learning_rate": 9.988114637290635e-07, + "loss": 0.056, + "reward": 2.1162497997283936, + "reward_std": 0.3431072533130646, + "rewards/accuracy_reward": 0.949999988079071, + "rewards/format_reward": 0.949999988079071, + "step": 1155, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 380.5, + "epoch": 0.02197092084006462, + "grad_norm": 3.3599646843218895, + "kl": 2.1875, + "learning_rate": 9.988094055841774e-07, + "loss": 0.0874, + "reward": 1.5845874547958374, + "reward_std": 0.26362115144729614, + "rewards/accuracy_reward": 0.5158374905586243, + "rewards/format_reward": 0.949999988079071, + "step": 1156, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 453.7250061035156, + "epoch": 0.02198992682695049, + "grad_norm": 6.004458777358778, + "kl": 2.46875, + "learning_rate": 9.988073456609495e-07, + "loss": 0.0989, + "reward": 1.6548088788986206, + "reward_std": 0.368537962436676, + "rewards/accuracy_reward": 0.6173087358474731, + "rewards/format_reward": 0.875, + "step": 1157, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 417.7749938964844, + "epoch": 0.022008932813836358, + "grad_norm": 12.377396172306666, + "kl": 5.84375, + "learning_rate": 9.988052839593873e-07, + "loss": 0.2342, + "reward": 1.0857117176055908, + "reward_std": 0.6150609850883484, + "rewards/accuracy_reward": 0.2882117033004761, + "rewards/format_reward": 0.7250000238418579, + "step": 1158, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 347.0, + "epoch": 0.02202793880072223, + "grad_norm": 14.310417847636268, + "kl": 5.65625, + "learning_rate": 9.988032204794978e-07, + "loss": 0.2258, + "reward": 1.738639235496521, + "reward_std": 0.41455432772636414, + "rewards/accuracy_reward": 0.6486393213272095, + "rewards/format_reward": 0.925000011920929, + "step": 1159, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 361.6000061035156, + "epoch": 0.022046944787608096, + "grad_norm": 15.082462895525596, + "kl": 6.1875, + "learning_rate": 9.988011552212888e-07, + "loss": 0.2469, + "reward": 1.5087499618530273, + "reward_std": 0.41775813698768616, + "rewards/accuracy_reward": 0.45000001788139343, + "rewards/format_reward": 0.949999988079071, + "step": 1160, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 401.9250183105469, + "epoch": 0.022065950774493966, + "grad_norm": 5.934688730965424, + "kl": 1.6875, + "learning_rate": 9.987990881847675e-07, + "loss": 0.0675, + "reward": 1.6231422424316406, + "reward_std": 0.15801259875297546, + "rewards/accuracy_reward": 0.4693923592567444, + "rewards/format_reward": 1.0, + "step": 1161, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 420.3500061035156, + "epoch": 0.022084956761379834, + "grad_norm": 3.691650891548696, + "kl": 2.484375, + "learning_rate": 9.98797019369941e-07, + "loss": 0.0995, + "reward": 1.5292717218399048, + "reward_std": 0.1003173366189003, + "rewards/accuracy_reward": 0.4755217134952545, + "rewards/format_reward": 0.9750000238418579, + "step": 1162, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 448.6000061035156, + "epoch": 0.022103962748265704, + "grad_norm": 2.161277127845887, + "kl": 0.62890625, + "learning_rate": 9.98794948776817e-07, + "loss": 0.0251, + "reward": 1.969212532043457, + "reward_std": 0.2199476808309555, + "rewards/accuracy_reward": 0.8117125630378723, + "rewards/format_reward": 1.0, + "step": 1163, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 1.0, + "all_wrong": 0.0, + "completion_length": 455.8999938964844, + "epoch": 0.02212296873515157, + "grad_norm": 3.0084445357306175, + "kl": 0.38671875, + "learning_rate": 9.987928764054027e-07, + "loss": 0.0154, + "reward": 2.341249942779541, + "reward_std": 0.04742559790611267, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 1.0, + "step": 1164, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 459.75, + "epoch": 0.022141974722037442, + "grad_norm": 2.534339030967863, + "kl": 0.380859375, + "learning_rate": 9.987908022557057e-07, + "loss": 0.0153, + "reward": 1.6424999237060547, + "reward_std": 0.12985198199748993, + "rewards/accuracy_reward": 0.574999988079071, + "rewards/format_reward": 1.0, + "step": 1165, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 445.625, + "epoch": 0.02216098070892331, + "grad_norm": 5.799787655731888, + "kl": 0.16796875, + "learning_rate": 9.98788726327733e-07, + "loss": 0.0067, + "reward": 1.6402705907821655, + "reward_std": 0.24621747434139252, + "rewards/accuracy_reward": 0.5752705931663513, + "rewards/format_reward": 1.0, + "step": 1166, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 467.70001220703125, + "epoch": 0.02217998669580918, + "grad_norm": 1.3333199001500888, + "kl": 0.150390625, + "learning_rate": 9.987866486214926e-07, + "loss": 0.006, + "reward": 2.009999990463257, + "reward_std": 0.04696769639849663, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 1167, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 468.5500183105469, + "epoch": 0.02219899268269505, + "grad_norm": 1.422607361320117, + "kl": 0.1181640625, + "learning_rate": 9.987845691369912e-07, + "loss": 0.0047, + "reward": 2.0349998474121094, + "reward_std": 0.10494961589574814, + "rewards/accuracy_reward": 0.824999988079071, + "rewards/format_reward": 1.0, + "step": 1168, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 446.1000061035156, + "epoch": 0.022217998669580918, + "grad_norm": 2.2260870129899684, + "kl": 0.1875, + "learning_rate": 9.987824878742369e-07, + "loss": 0.0075, + "reward": 1.6787575483322144, + "reward_std": 0.05346319079399109, + "rewards/accuracy_reward": 0.5237575173377991, + "rewards/format_reward": 1.0, + "step": 1169, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 425.2749938964844, + "epoch": 0.02223700465646679, + "grad_norm": 1.7441836081400246, + "kl": 0.2138671875, + "learning_rate": 9.987804048332366e-07, + "loss": 0.0086, + "reward": 1.505933165550232, + "reward_std": 0.24260154366493225, + "rewards/accuracy_reward": 0.5046830773353577, + "rewards/format_reward": 1.0, + "step": 1170, + "temporal_rewards": 0.3999999761581421 + }, + { + "all_correct": 0.2, + "all_wrong": 0.0, + "completion_length": 443.125, + "epoch": 0.022256010643352656, + "grad_norm": 3.1235419010677044, + "kl": 0.212890625, + "learning_rate": 9.987783200139978e-07, + "loss": 0.0085, + "reward": 1.6564706563949585, + "reward_std": 0.10165625810623169, + "rewards/accuracy_reward": 0.6177206635475159, + "rewards/format_reward": 1.0, + "step": 1171, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 448.875, + "epoch": 0.022275016630238526, + "grad_norm": 1.5302875316625966, + "kl": 0.1748046875, + "learning_rate": 9.987762334165284e-07, + "loss": 0.007, + "reward": 1.8300564289093018, + "reward_std": 0.2543344497680664, + "rewards/accuracy_reward": 0.7363064885139465, + "rewards/format_reward": 1.0, + "step": 1172, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 466.2250061035156, + "epoch": 0.022294022617124393, + "grad_norm": 1.843049217849662, + "kl": 0.142578125, + "learning_rate": 9.987741450408354e-07, + "loss": 0.0057, + "reward": 1.6622055768966675, + "reward_std": 0.07972613722085953, + "rewards/accuracy_reward": 0.5772055983543396, + "rewards/format_reward": 1.0, + "step": 1173, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.8, + "all_wrong": 0.2, + "completion_length": 479.5500183105469, + "epoch": 0.022313028604010264, + "grad_norm": 1.461850296625544, + "kl": 0.158203125, + "learning_rate": 9.98772054886926e-07, + "loss": 0.0063, + "reward": 2.049999952316284, + "reward_std": 0.06527493894100189, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 1174, + "temporal_rewards": 0.8999999761581421 + }, + { + "all_correct": 0.0, + "all_wrong": 0.6, + "completion_length": 475.6000061035156, + "epoch": 0.02233203459089613, + "grad_norm": 2.6610367192788806, + "kl": 0.205078125, + "learning_rate": 9.987699629548083e-07, + "loss": 0.0082, + "reward": 1.1446107625961304, + "reward_std": 0.19837354123592377, + "rewards/accuracy_reward": 0.13586071133613586, + "rewards/format_reward": 1.0, + "step": 1175, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 468.75, + "epoch": 0.022351040577782002, + "grad_norm": 1.8298904258351714, + "kl": 0.2060546875, + "learning_rate": 9.987678692444894e-07, + "loss": 0.0082, + "reward": 1.8398265838623047, + "reward_std": 0.2500605583190918, + "rewards/accuracy_reward": 0.6973266005516052, + "rewards/format_reward": 1.0, + "step": 1176, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 469.1000061035156, + "epoch": 0.02237004656466787, + "grad_norm": 1.645741629002132, + "kl": 0.1357421875, + "learning_rate": 9.98765773755977e-07, + "loss": 0.0054, + "reward": 1.693355917930603, + "reward_std": 0.09225838631391525, + "rewards/accuracy_reward": 0.5808559656143188, + "rewards/format_reward": 1.0, + "step": 1177, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 469.1499938964844, + "epoch": 0.02238905255155374, + "grad_norm": 1.5206825641172703, + "kl": 0.10009765625, + "learning_rate": 9.98763676489278e-07, + "loss": 0.004, + "reward": 1.6968517303466797, + "reward_std": 0.07131198048591614, + "rewards/accuracy_reward": 0.6518518328666687, + "rewards/format_reward": 1.0, + "step": 1178, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 445.3500061035156, + "epoch": 0.022408058538439607, + "grad_norm": 1.3005230258076648, + "kl": 0.142578125, + "learning_rate": 9.987615774444004e-07, + "loss": 0.0057, + "reward": 1.8824999332427979, + "reward_std": 0.14569950103759766, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 1179, + "temporal_rewards": 1.0 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 472.3999938964844, + "epoch": 0.022427064525325478, + "grad_norm": 2.2157074437726774, + "kl": 0.15625, + "learning_rate": 9.987594766213516e-07, + "loss": 0.0063, + "reward": 1.5982060432434082, + "reward_std": 0.14768338203430176, + "rewards/accuracy_reward": 0.5732061266899109, + "rewards/format_reward": 0.9750000238418579, + "step": 1180, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 483.82501220703125, + "epoch": 0.022446070512211348, + "grad_norm": 1.7886712817879944, + "kl": 0.146484375, + "learning_rate": 9.98757374020139e-07, + "loss": 0.0058, + "reward": 1.8199999332427979, + "reward_std": 0.23715201020240784, + "rewards/accuracy_reward": 0.699999988079071, + "rewards/format_reward": 1.0, + "step": 1181, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 464.7250061035156, + "epoch": 0.022465076499097215, + "grad_norm": 1.8719500152149386, + "kl": 0.140625, + "learning_rate": 9.987552696407702e-07, + "loss": 0.0056, + "reward": 2.0943691730499268, + "reward_std": 0.04618341848254204, + "rewards/accuracy_reward": 0.9281191229820251, + "rewards/format_reward": 1.0, + "step": 1182, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 474.7250061035156, + "epoch": 0.022484082485983086, + "grad_norm": 1.3671762467998057, + "kl": 0.146484375, + "learning_rate": 9.987531634832527e-07, + "loss": 0.0059, + "reward": 1.8374998569488525, + "reward_std": 0.14663709700107574, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 1183, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.0, + "completion_length": 501.82501220703125, + "epoch": 0.022503088472868953, + "grad_norm": 1.8856679504366625, + "kl": 0.1298828125, + "learning_rate": 9.987510555475938e-07, + "loss": 0.0052, + "reward": 1.750697374343872, + "reward_std": 0.2980468273162842, + "rewards/accuracy_reward": 0.6919474005699158, + "rewards/format_reward": 0.9750000238418579, + "step": 1184, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.2, + "completion_length": 466.32501220703125, + "epoch": 0.022522094459754824, + "grad_norm": 1.1345719205040237, + "kl": 0.1748046875, + "learning_rate": 9.987489458338013e-07, + "loss": 0.007, + "reward": 1.8574999570846558, + "reward_std": 0.12913735210895538, + "rewards/accuracy_reward": 0.6500000357627869, + "rewards/format_reward": 1.0, + "step": 1185, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.0, + "all_wrong": 0.4, + "completion_length": 506.875, + "epoch": 0.02254110044664069, + "grad_norm": 1.7151041252666872, + "kl": 0.15234375, + "learning_rate": 9.987468343418823e-07, + "loss": 0.0061, + "reward": 1.1401817798614502, + "reward_std": 0.14938972890377045, + "rewards/accuracy_reward": 0.19393174350261688, + "rewards/format_reward": 1.0, + "step": 1186, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 458.5500183105469, + "epoch": 0.022560106433526562, + "grad_norm": 1.9190999280521972, + "kl": 0.224609375, + "learning_rate": 9.987447210718449e-07, + "loss": 0.009, + "reward": 2.083750009536743, + "reward_std": 0.14773176610469818, + "rewards/accuracy_reward": 0.925000011920929, + "rewards/format_reward": 1.0, + "step": 1187, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.2, + "completion_length": 456.25, + "epoch": 0.02257911242041243, + "grad_norm": 2.7601744813281375, + "kl": 0.39453125, + "learning_rate": 9.98742606023696e-07, + "loss": 0.0158, + "reward": 1.7402499914169312, + "reward_std": 0.17064407467842102, + "rewards/accuracy_reward": 0.627750039100647, + "rewards/format_reward": 0.9750000238418579, + "step": 1188, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 468.75, + "epoch": 0.0225981184072983, + "grad_norm": 1.6961197264926566, + "kl": 0.16015625, + "learning_rate": 9.987404891974439e-07, + "loss": 0.0064, + "reward": 1.8000000715255737, + "reward_std": 0.09896720945835114, + "rewards/accuracy_reward": 0.6850000619888306, + "rewards/format_reward": 1.0, + "step": 1189, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.4, + "completion_length": 457.1499938964844, + "epoch": 0.022617124394184167, + "grad_norm": 2.299641723237745, + "kl": 0.291015625, + "learning_rate": 9.987383705930954e-07, + "loss": 0.0116, + "reward": 1.5035418272018433, + "reward_std": 0.10506286472082138, + "rewards/accuracy_reward": 0.432291716337204, + "rewards/format_reward": 1.0, + "step": 1190, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.2, + "all_wrong": 0.4, + "completion_length": 447.20001220703125, + "epoch": 0.022636130381070037, + "grad_norm": 3.1215590241380466, + "kl": 0.2431640625, + "learning_rate": 9.987362502106586e-07, + "loss": 0.0097, + "reward": 1.6737499237060547, + "reward_std": 0.1938600391149521, + "rewards/accuracy_reward": 0.550000011920929, + "rewards/format_reward": 1.0, + "step": 1191, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 457.8500061035156, + "epoch": 0.022655136367955905, + "grad_norm": 1.5706659326588908, + "kl": 0.267578125, + "learning_rate": 9.987341280501407e-07, + "loss": 0.0107, + "reward": 1.491249918937683, + "reward_std": 0.22266459465026855, + "rewards/accuracy_reward": 0.4749999940395355, + "rewards/format_reward": 0.9750000238418579, + "step": 1192, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 436.0500183105469, + "epoch": 0.022674142354841775, + "grad_norm": 4.590543835937289, + "kl": 0.306640625, + "learning_rate": 9.987320041115495e-07, + "loss": 0.0122, + "reward": 2.011476993560791, + "reward_std": 0.09533931314945221, + "rewards/accuracy_reward": 0.7877270579338074, + "rewards/format_reward": 1.0, + "step": 1193, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.6, + "all_wrong": 0.4, + "completion_length": 412.2250061035156, + "epoch": 0.022693148341727643, + "grad_norm": 3.717987877086677, + "kl": 0.2099609375, + "learning_rate": 9.987298783948923e-07, + "loss": 0.0084, + "reward": 1.7487499713897705, + "reward_std": 0.0325133316218853, + "rewards/accuracy_reward": 0.6000000238418579, + "rewards/format_reward": 1.0, + "step": 1194, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 445.4250183105469, + "epoch": 0.022712154328613513, + "grad_norm": 1.550577133119749, + "kl": 0.2021484375, + "learning_rate": 9.98727750900177e-07, + "loss": 0.0081, + "reward": 1.651789665222168, + "reward_std": 0.12180455029010773, + "rewards/accuracy_reward": 0.5142897963523865, + "rewards/format_reward": 1.0, + "step": 1195, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 417.6750183105469, + "epoch": 0.022731160315499384, + "grad_norm": 2.6170553217661006, + "kl": 0.3046875, + "learning_rate": 9.98725621627411e-07, + "loss": 0.0122, + "reward": 1.8090852499008179, + "reward_std": 0.2262856811285019, + "rewards/accuracy_reward": 0.6578353643417358, + "rewards/format_reward": 1.0, + "step": 1196, + "temporal_rewards": 0.699999988079071 + }, + { + "all_correct": 0.4, + "all_wrong": 0.2, + "completion_length": 402.57501220703125, + "epoch": 0.02275016630238525, + "grad_norm": 3.1478659850181603, + "kl": 0.5078125, + "learning_rate": 9.98723490576602e-07, + "loss": 0.0203, + "reward": 1.850000023841858, + "reward_std": 0.16654424369335175, + "rewards/accuracy_reward": 0.637499988079071, + "rewards/format_reward": 1.0, + "step": 1197, + "temporal_rewards": 0.7999999523162842 + }, + { + "all_correct": 0.6, + "all_wrong": 0.0, + "completion_length": 398.8000183105469, + "epoch": 0.02276917228927112, + "grad_norm": 3.9870173336907477, + "kl": 0.498046875, + "learning_rate": 9.987213577477574e-07, + "loss": 0.0199, + "reward": 1.9662498235702515, + "reward_std": 0.24570035934448242, + "rewards/accuracy_reward": 0.800000011920929, + "rewards/format_reward": 1.0, + "step": 1198, + "temporal_rewards": 0.5 + }, + { + "all_correct": 0.0, + "all_wrong": 0.0, + "completion_length": 385.82501220703125, + "epoch": 0.02278817827615699, + "grad_norm": 4.553674308518263, + "kl": 0.427734375, + "learning_rate": 9.987192231408851e-07, + "loss": 0.0171, + "reward": 1.947222113609314, + "reward_std": 0.2639230191707611, + "rewards/accuracy_reward": 0.8472222685813904, + "rewards/format_reward": 1.0, + "step": 1199, + "temporal_rewards": 0.5999999642372131 + }, + { + "all_correct": 0.0, + "all_wrong": 0.2, + "completion_length": 364.3000183105469, + "epoch": 0.02280718426304286, + "grad_norm": 3.1491021834644775, + "kl": 0.318359375, + "learning_rate": 9.987170867559924e-07, + "loss": 0.0127, + "reward": 1.329876184463501, + "reward_std": 0.15625979006290436, + "rewards/accuracy_reward": 0.23987610638141632, + "rewards/format_reward": 1.0, + "step": 1200, + "temporal_rewards": 0.699999988079071 + } + ], + "logging_steps": 1.0, + "max_steps": 52615, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}