{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7910349373764007, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.5, "completions/max_terminated_length": 582.5, "completions/mean_length": 274.4375, "completions/mean_terminated_length": 274.4375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.0013183915622940012, "grad_norm": 3.9803826808929443, "kl": 0.0, "learning_rate": 9.993403693931399e-07, "loss": -0.002, "num_tokens": 530179.0, "reward": 1.0078125, "reward_std": 0.46677708625793457, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.37497539073228836, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.22840170562267303, "rewards/multiturn_format_reward/mean": 0.78125, "rewards/multiturn_format_reward/std": 0.4128527194261551, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.5, "completions/max_terminated_length": 839.5, "completions/mean_length": 271.140625, "completions/mean_terminated_length": 271.140625, "completions/min_length": 97.5, "completions/min_terminated_length": 97.5, "epoch": 0.0026367831245880024, "grad_norm": 4.714284420013428, "kl": 0.00290679931640625, "learning_rate": 9.986807387862796e-07, "loss": -0.0244, "num_tokens": 1071931.0, "reward": 0.9609375, "reward_std": 0.5958000123500824, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.31679005175828934, "rewards/multiturn_format_reward/mean": 0.71875, "rewards/multiturn_format_reward/std": 0.45543521642684937, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.5, "completions/max_terminated_length": 671.5, "completions/mean_length": 233.796875, "completions/mean_terminated_length": 233.796875, "completions/min_length": 83.5, "completions/min_terminated_length": 83.5, "epoch": 0.003955174686882004, "grad_norm": 4.630187511444092, "kl": 0.0024871826171875, "learning_rate": 9.980211081794195e-07, "loss": 0.0108, "num_tokens": 1665785.0, "reward": 1.1171875, "reward_std": 0.6009760797023773, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 0.78125, "rewards/multiturn_format_reward/std": 0.4128527194261551, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 272.0625, "completions/mean_terminated_length": 272.0625, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.005273566249176005, "grad_norm": 142.42579650878906, "kl": 0.207427978515625, "learning_rate": 9.973614775725592e-07, "loss": -0.0458, "num_tokens": 2188762.0, "reward": 1.171875, "reward_std": 0.350565642118454, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.462014764547348, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.4299773871898651, "rewards/multiturn_format_reward/mean": 0.890625, "rewards/multiturn_format_reward/std": 0.3074183538556099, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.5, "completions/max_terminated_length": 816.5, "completions/mean_length": 268.78125, "completions/mean_terminated_length": 268.78125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.0065919578114700065, "grad_norm": 247196.234375, "kl": 446.00445556640625, "learning_rate": 9.967018469656991e-07, "loss": 2.2313, "num_tokens": 2757320.0, "reward": 1.3515625, "reward_std": 0.5034354627132416, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.49899089336395264, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 0.90625, "rewards/multiturn_format_reward/std": 0.2909727171063423, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 236.71875, "completions/mean_terminated_length": 236.71875, "completions/min_length": 67.5, "completions/min_terminated_length": 67.5, "epoch": 0.007910349373764008, "grad_norm": 2.1691019535064697, "kl": 0.0186767578125, "learning_rate": 9.96042216358839e-07, "loss": 0.0011, "num_tokens": 3325306.0, "reward": 1.34375, "reward_std": 0.49967344105243683, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 272.078125, "completions/mean_terminated_length": 272.078125, "completions/min_length": 96.5, "completions/min_terminated_length": 96.5, "epoch": 0.00922874093605801, "grad_norm": 3.264988422393799, "kl": 0.01812744140625, "learning_rate": 9.953825857519788e-07, "loss": -0.0526, "num_tokens": 3900763.0, "reward": 1.1953125, "reward_std": 0.39904333651065826, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.48721402883529663, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.40346992015838623, "rewards/multiturn_format_reward/mean": 0.90625, "rewards/multiturn_format_reward/std": 0.2909727171063423, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.5, "completions/max_terminated_length": 689.5, "completions/mean_length": 259.078125, "completions/mean_terminated_length": 259.078125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.01054713249835201, "grad_norm": 13.94677448272705, "kl": 0.06378173828125, "learning_rate": 9.947229551451187e-07, "loss": 0.0462, "num_tokens": 4419434.0, "reward": 1.109375, "reward_std": 0.3776468485593796, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41824956238269806, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.34293801337480545, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2563937231898308, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.5, "completions/max_terminated_length": 636.5, "completions/mean_length": 232.609375, "completions/mean_terminated_length": 232.609375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.011865524060646011, "grad_norm": 2.5805885791778564, "kl": 0.01422119140625, "learning_rate": 9.940633245382586e-07, "loss": -0.0058, "num_tokens": 4971733.0, "reward": 1.171875, "reward_std": 0.4036417454481125, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.378012090921402, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.5, "completions/max_terminated_length": 929.5, "completions/mean_length": 276.921875, "completions/mean_terminated_length": 276.921875, "completions/min_length": 72.5, "completions/min_terminated_length": 72.5, "epoch": 0.013183915622940013, "grad_norm": 1.2122479677200317, "kl": 0.01861572265625, "learning_rate": 9.934036939313983e-07, "loss": 0.002, "num_tokens": 5518004.0, "reward": 1.21875, "reward_std": 0.4054251164197922, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.47197872400283813, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.3378837928175926, "rewards/multiturn_format_reward/mean": 0.90625, "rewards/multiturn_format_reward/std": 0.2909727171063423, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 259.328125, "completions/mean_terminated_length": 259.328125, "completions/min_length": 141.5, "completions/min_terminated_length": 141.5, "epoch": 0.014502307185234015, "grad_norm": 2.037294864654541, "kl": 0.0635986328125, "learning_rate": 9.927440633245382e-07, "loss": -0.0026, "num_tokens": 6097533.0, "reward": 1.1796875, "reward_std": 0.5044101774692535, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.3964070826768875, "rewards/multiturn_format_reward/mean": 0.90625, "rewards/multiturn_format_reward/std": 0.2909727171063423, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 242.71875, "completions/mean_terminated_length": 242.71875, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "epoch": 0.015820698747528016, "grad_norm": 1.3885926008224487, "kl": 0.013031005859375, "learning_rate": 9.92084432717678e-07, "loss": -0.0165, "num_tokens": 6672991.0, "reward": 1.2890625, "reward_std": 0.32206132262945175, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.4824019521474838, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.40346992015838623, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.5, "completions/max_terminated_length": 549.5, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "epoch": 0.017139090309822018, "grad_norm": 2.450016975402832, "kl": 0.01483154296875, "learning_rate": 9.914248021108179e-07, "loss": -0.0322, "num_tokens": 7216201.0, "reward": 1.3125, "reward_std": 0.301506832242012, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.48946478962898254, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.41824956238269806, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.5, "completions/max_terminated_length": 665.5, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.01845748187211602, "grad_norm": 1.693463921546936, "kl": 0.0245361328125, "learning_rate": 9.907651715039578e-07, "loss": -0.0048, "num_tokens": 7714133.0, "reward": 1.1875, "reward_std": 0.2738431394100189, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.43038569390773773, "rewards/counterfactual_reasoning_reward/mean": 0.109375, "rewards/counterfactual_reasoning_reward/std": 0.31607766449451447, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 238.734375, "completions/mean_terminated_length": 238.734375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.01977587343441002, "grad_norm": 2.075230360031128, "kl": 0.02435302734375, "learning_rate": 9.901055408970977e-07, "loss": -0.0155, "num_tokens": 8227966.0, "reward": 1.21875, "reward_std": 0.3455280065536499, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.45178256928920746, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.33297405391931534, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 267.421875, "completions/mean_terminated_length": 267.421875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.02109426499670402, "grad_norm": 5.076559543609619, "kl": 0.05328369140625, "learning_rate": 9.894459102902374e-07, "loss": 0.0149, "num_tokens": 8779750.0, "reward": 1.25, "reward_std": 0.5101586878299713, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49899089336395264, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4399413466453552, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.02241265655899802, "grad_norm": 4.723637104034424, "kl": 0.439208984375, "learning_rate": 9.887862796833773e-07, "loss": 0.0051, "num_tokens": 9329895.0, "reward": 1.2578125, "reward_std": 0.38886311650276184, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.4199155569076538, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 217.3125, "completions/mean_terminated_length": 217.3125, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "epoch": 0.023731048121292023, "grad_norm": 5.470117092132568, "kl": 0.1859130859375, "learning_rate": 9.88126649076517e-07, "loss": -0.0186, "num_tokens": 9891991.0, "reward": 1.2421875, "reward_std": 0.24380210041999817, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.35245639085769653, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 264.328125, "completions/mean_terminated_length": 264.328125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.025049439683586024, "grad_norm": 1.529012680053711, "kl": 0.070556640625, "learning_rate": 9.87467018469657e-07, "loss": 0.0004, "num_tokens": 10450257.0, "reward": 1.1796875, "reward_std": 0.40981143712997437, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.42200562357902527, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.3083590194582939, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.5, "completions/max_terminated_length": 503.5, "completions/mean_length": 230.09375, "completions/mean_terminated_length": 230.09375, "completions/min_length": 82.5, "completions/min_terminated_length": 82.5, "epoch": 0.026367831245880026, "grad_norm": 3.028373956680298, "kl": 0.0443115234375, "learning_rate": 9.86807387862797e-07, "loss": -0.031, "num_tokens": 11068320.0, "reward": 1.3359375, "reward_std": 0.2834687978029251, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.4395582377910614, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.5, "completions/max_terminated_length": 442.5, "completions/mean_length": 226.140625, "completions/mean_terminated_length": 226.140625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.027686222808174028, "grad_norm": 1.0224933624267578, "kl": 0.044921875, "learning_rate": 9.861477572559366e-07, "loss": -0.0125, "num_tokens": 11625225.0, "reward": 1.140625, "reward_std": 0.2681869566440582, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.4395582377910614, "rewards/counterfactual_reasoning_reward/mean": 0.046875, "rewards/counterfactual_reasoning_reward/std": 0.21135568618774414, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.5, "completions/max_terminated_length": 381.5, "completions/mean_length": 221.859375, "completions/mean_terminated_length": 221.859375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.02900461437046803, "grad_norm": 1.6780363321304321, "kl": 0.0498046875, "learning_rate": 9.854881266490765e-07, "loss": -0.0115, "num_tokens": 12178235.0, "reward": 1.25, "reward_std": 0.2218562290072441, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.39400696754455566, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.5, "completions/max_terminated_length": 862.5, "completions/mean_length": 264.984375, "completions/mean_terminated_length": 264.984375, "completions/min_length": 141.5, "completions/min_terminated_length": 141.5, "epoch": 0.03032300593276203, "grad_norm": 2.74994158744812, "kl": 0.06298828125, "learning_rate": 9.848284960422162e-07, "loss": -0.0007, "num_tokens": 12705328.0, "reward": 1.109375, "reward_std": 0.2691454291343689, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.4299773871898651, "rewards/counterfactual_reasoning_reward/mean": 0.046875, "rewards/counterfactual_reasoning_reward/std": 0.21135568618774414, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.5, "completions/max_terminated_length": 820.5, "completions/mean_length": 263.71875, "completions/mean_terminated_length": 263.71875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.03164139749505603, "grad_norm": 1.7880964279174805, "kl": 0.061767578125, "learning_rate": 9.841688654353562e-07, "loss": -0.0173, "num_tokens": 13252560.0, "reward": 1.2734375, "reward_std": 0.35661637783050537, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.41824956238269806, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.5, "completions/max_terminated_length": 568.5, "completions/mean_length": 241.796875, "completions/mean_terminated_length": 241.796875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.03295978905735003, "grad_norm": 2.4859321117401123, "kl": 0.068115234375, "learning_rate": 9.83509234828496e-07, "loss": 0.0101, "num_tokens": 13790296.0, "reward": 1.296875, "reward_std": 0.2460189089179039, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.47197872400283813, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.40346992015838623, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.5, "completions/max_terminated_length": 673.5, "completions/mean_length": 268.296875, "completions/mean_terminated_length": 268.296875, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "epoch": 0.034278180619644036, "grad_norm": 1.7926748991012573, "kl": 0.073974609375, "learning_rate": 9.828496042216358e-07, "loss": -0.0035, "num_tokens": 14355345.0, "reward": 1.2578125, "reward_std": 0.3273041099309921, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.3580790013074875, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 252.296875, "completions/mean_terminated_length": 252.296875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.035596572181938034, "grad_norm": 3.178527593612671, "kl": 0.06494140625, "learning_rate": 9.821899736147757e-07, "loss": -0.0465, "num_tokens": 14914374.0, "reward": 1.2265625, "reward_std": 0.29877035319805145, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.3662842661142349, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.5, "completions/max_terminated_length": 447.5, "completions/mean_length": 236.4375, "completions/mean_terminated_length": 236.4375, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "epoch": 0.03691496374423204, "grad_norm": 1.6249388456344604, "kl": 0.0712890625, "learning_rate": 9.815303430079154e-07, "loss": -0.0035, "num_tokens": 15499051.0, "reward": 1.1640625, "reward_std": 0.27878718823194504, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.4299773871898651, "rewards/counterfactual_reasoning_reward/mean": 0.09375, "rewards/counterfactual_reasoning_reward/std": 0.2961445748806, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 228.453125, "completions/mean_terminated_length": 228.453125, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.03823335530652604, "grad_norm": 1.9672752618789673, "kl": 0.077880859375, "learning_rate": 9.808707124010553e-07, "loss": 0.0004, "num_tokens": 16006560.0, "reward": 1.3359375, "reward_std": 0.3543919622898102, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44547125697135925, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.5, "completions/max_terminated_length": 595.5, "completions/mean_length": 229.09375, "completions/mean_terminated_length": 229.09375, "completions/min_length": 140.5, "completions/min_terminated_length": 140.5, "epoch": 0.03955174686882004, "grad_norm": 2.1592164039611816, "kl": 0.08544921875, "learning_rate": 9.802110817941953e-07, "loss": -0.0035, "num_tokens": 16576394.0, "reward": 1.3046875, "reward_std": 0.4169527292251587, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45128606259822845, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.5, "completions/max_terminated_length": 514.5, "completions/mean_length": 234.9375, "completions/mean_terminated_length": 234.9375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.04087013843111404, "grad_norm": 2.7155117988586426, "kl": 0.09326171875, "learning_rate": 9.79551451187335e-07, "loss": 0.0386, "num_tokens": 17106906.0, "reward": 1.3515625, "reward_std": 0.38354596495628357, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.48139922320842743, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.45178256928920746, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 208.1875, "completions/mean_terminated_length": 208.1875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.04218852999340804, "grad_norm": 1.1984411478042603, "kl": 0.091796875, "learning_rate": 9.788918205804749e-07, "loss": -0.0025, "num_tokens": 17715718.0, "reward": 1.375, "reward_std": 0.3442307263612747, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.5, "completions/max_terminated_length": 537.5, "completions/mean_length": 210.734375, "completions/mean_terminated_length": 210.734375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.043506921555702044, "grad_norm": 3.9346837997436523, "kl": 0.140869140625, "learning_rate": 9.782321899736148e-07, "loss": 0.0027, "num_tokens": 18287547.0, "reward": 1.2578125, "reward_std": 0.39340461790561676, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.41824956238269806, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.5, "completions/max_terminated_length": 578.5, "completions/mean_length": 235.640625, "completions/mean_terminated_length": 235.640625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.04482531311799604, "grad_norm": 4.986603736877441, "kl": 0.1416015625, "learning_rate": 9.775725593667545e-07, "loss": 0.0085, "num_tokens": 18854946.0, "reward": 1.3046875, "reward_std": 0.3679187297821045, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.425730362534523, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 186.8125, "completions/min_length": 60.5, "completions/min_terminated_length": 60.5, "epoch": 0.04614370468029005, "grad_norm": 1.51068913936615, "kl": 0.154541015625, "learning_rate": 9.769129287598944e-07, "loss": 0.0154, "num_tokens": 19401590.0, "reward": 1.375, "reward_std": 0.3107884153723717, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.49899089336395264, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4339464604854584, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 222.078125, "completions/mean_terminated_length": 222.078125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.047462096242584045, "grad_norm": 4.239743709564209, "kl": 0.095458984375, "learning_rate": 9.762532981530342e-07, "loss": -0.0366, "num_tokens": 19959524.0, "reward": 1.2265625, "reward_std": 0.25684621185064316, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.43038569390773773, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.376473993062973, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 225.453125, "completions/mean_terminated_length": 225.453125, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.04878048780487805, "grad_norm": 2.3430044651031494, "kl": 0.107421875, "learning_rate": 9.75593667546174e-07, "loss": 0.0044, "num_tokens": 20540448.0, "reward": 1.1796875, "reward_std": 0.4133179932832718, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4659053534269333, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.33297405391931534, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.5, "completions/max_terminated_length": 412.5, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.05009887936717205, "grad_norm": 1.610964298248291, "kl": 0.13232421875, "learning_rate": 9.74934036939314e-07, "loss": -0.0433, "num_tokens": 21109076.0, "reward": 1.171875, "reward_std": 0.24831003695726395, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44547125697135925, "rewards/counterfactual_reasoning_reward/mean": 0.109375, "rewards/counterfactual_reasoning_reward/std": 0.31607766449451447, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 225.53125, "completions/mean_terminated_length": 225.53125, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.051417270929466054, "grad_norm": 13.313704490661621, "kl": 0.859375, "learning_rate": 9.74274406332454e-07, "loss": 0.0473, "num_tokens": 21661423.0, "reward": 1.1875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.46125002205371857, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.34635117650032043, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.5, "completions/max_terminated_length": 435.5, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 180.0625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.05273566249176005, "grad_norm": 53.44929885864258, "kl": 3.12548828125, "learning_rate": 9.736147757255936e-07, "loss": 0.0069, "num_tokens": 22220228.0, "reward": 1.1875, "reward_std": 0.3758036643266678, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4659053534269333, "rewards/counterfactual_reasoning_reward/mean": 0.125, "rewards/counterfactual_reasoning_reward/std": 0.33601075410842896, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.5, "completions/max_terminated_length": 350.5, "completions/mean_length": 186.765625, "completions/mean_terminated_length": 186.765625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.05405405405405406, "grad_norm": 1.8444370031356812, "kl": 0.145751953125, "learning_rate": 9.729551451187335e-07, "loss": -0.0081, "num_tokens": 22785681.0, "reward": 1.2578125, "reward_std": 0.40958209335803986, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.45227913558483124, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.3964070826768875, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 81.5, "completions/min_terminated_length": 81.5, "epoch": 0.055372445616348055, "grad_norm": 3.3442482948303223, "kl": 0.38671875, "learning_rate": 9.722955145118733e-07, "loss": 0.0166, "num_tokens": 23342088.0, "reward": 1.390625, "reward_std": 0.3317541033029556, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4339464604854584, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.5, "completions/max_terminated_length": 320.5, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05669083717864205, "grad_norm": 369.068359375, "kl": 11.43896484375, "learning_rate": 9.716358839050132e-07, "loss": 0.073, "num_tokens": 23896568.0, "reward": 1.3046875, "reward_std": 0.3497766852378845, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.4824019521474838, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.43840841948986053, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 207.859375, "completions/mean_terminated_length": 207.859375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.05800922874093606, "grad_norm": 1.5121619701385498, "kl": 0.125, "learning_rate": 9.70976253298153e-07, "loss": 0.0016, "num_tokens": 24419615.0, "reward": 1.296875, "reward_std": 0.3379869610071182, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.4266805946826935, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.5, "completions/max_terminated_length": 421.5, "completions/mean_length": 181.484375, "completions/mean_terminated_length": 181.484375, "completions/min_length": 78.5, "completions/min_terminated_length": 78.5, "epoch": 0.05932762030323006, "grad_norm": 2.6547884941101074, "kl": 0.16259765625, "learning_rate": 9.703166226912928e-07, "loss": -0.009, "num_tokens": 24980216.0, "reward": 1.3125, "reward_std": 0.4518508315086365, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.4266805946826935, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.5, "completions/max_terminated_length": 274.5, "completions/mean_length": 183.921875, "completions/mean_terminated_length": 183.921875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.06064601186552406, "grad_norm": 1.6770036220550537, "kl": 0.1298828125, "learning_rate": 9.696569920844327e-07, "loss": 0.0133, "num_tokens": 25532187.0, "reward": 1.390625, "reward_std": 0.33824336528778076, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.4825586974620819, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4442135691642761, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.5, "completions/max_terminated_length": 536.5, "completions/mean_length": 189.640625, "completions/mean_terminated_length": 189.640625, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.06196440342781806, "grad_norm": 1.7867424488067627, "kl": 0.1337890625, "learning_rate": 9.689973614775724e-07, "loss": 0.0007, "num_tokens": 26083349.0, "reward": 1.46875, "reward_std": 0.37642186880111694, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.5, "completions/max_terminated_length": 434.5, "completions/mean_length": 194.609375, "completions/mean_terminated_length": 194.609375, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "epoch": 0.06328279499011207, "grad_norm": 2.135788679122925, "kl": 0.15380859375, "learning_rate": 9.683377308707124e-07, "loss": -0.0149, "num_tokens": 26624262.0, "reward": 1.2890625, "reward_std": 0.43295419216156006, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4399413466453552, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 191.609375, "completions/mean_terminated_length": 191.609375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.06460118655240607, "grad_norm": 1.8953243494033813, "kl": 0.126220703125, "learning_rate": 9.676781002638523e-07, "loss": -0.0453, "num_tokens": 27130084.0, "reward": 1.421875, "reward_std": 0.3005431592464447, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49993492662906647, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45543521642684937, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 191.140625, "completions/mean_terminated_length": 191.140625, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "epoch": 0.06591957811470006, "grad_norm": 1.4142102003097534, "kl": 0.1357421875, "learning_rate": 9.67018469656992e-07, "loss": 0.0007, "num_tokens": 27686490.0, "reward": 1.3203125, "reward_std": 0.42507344484329224, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.48946478962898254, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.4395582377910614, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 176.40625, "completions/mean_terminated_length": 176.40625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.06723796967699407, "grad_norm": 2.368410587310791, "kl": 0.1396484375, "learning_rate": 9.66358839050132e-07, "loss": 0.0202, "num_tokens": 28244255.0, "reward": 1.4140625, "reward_std": 0.27513836324214935, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.48721402883529663, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.45028693974018097, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 189.078125, "completions/mean_terminated_length": 189.078125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.06855636123928807, "grad_norm": 8.37457275390625, "kl": 1.01318359375, "learning_rate": 9.656992084432716e-07, "loss": 0.0051, "num_tokens": 28761175.0, "reward": 1.390625, "reward_std": 0.38176748156547546, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.5, "completions/max_terminated_length": 474.5, "completions/mean_length": 181.34375, "completions/mean_terminated_length": 181.34375, "completions/min_length": 96.5, "completions/min_terminated_length": 96.5, "epoch": 0.06987475280158208, "grad_norm": 2.5676705837249756, "kl": 0.1494140625, "learning_rate": 9.650395778364115e-07, "loss": -0.0159, "num_tokens": 29322228.0, "reward": 1.3203125, "reward_std": 0.36082665622234344, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.5, "completions/max_terminated_length": 349.5, "completions/mean_length": 167.65625, "completions/mean_terminated_length": 167.65625, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "epoch": 0.07119314436387607, "grad_norm": 1.6922078132629395, "kl": 0.21337890625, "learning_rate": 9.643799472295515e-07, "loss": 0.0011, "num_tokens": 29857134.0, "reward": 1.4453125, "reward_std": 0.4183811843395233, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48040975630283356, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 193.578125, "completions/mean_terminated_length": 193.578125, "completions/min_length": 91.5, "completions/min_terminated_length": 91.5, "epoch": 0.07251153592617007, "grad_norm": 1.943649172782898, "kl": 0.14208984375, "learning_rate": 9.637203166226912e-07, "loss": -0.0286, "num_tokens": 30393157.0, "reward": 1.234375, "reward_std": 0.326050728559494, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.48721402883529663, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40828560292720795, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.07382992748846408, "grad_norm": 1.8586387634277344, "kl": 0.14892578125, "learning_rate": 9.63060686015831e-07, "loss": -0.0295, "num_tokens": 30944997.0, "reward": 1.234375, "reward_std": 0.3193943649530411, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.109375, "rewards/counterfactual_reasoning_reward/std": 0.31607766449451447, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.07514831905075807, "grad_norm": 1.9542683362960815, "kl": 0.17431640625, "learning_rate": 9.62401055408971e-07, "loss": 0.0399, "num_tokens": 31506070.0, "reward": 1.625, "reward_std": 0.3348398357629776, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.44547125697135925, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.49527959525585175, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 189.65625, "completions/mean_terminated_length": 189.65625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.07646671061305207, "grad_norm": 6.074581623077393, "kl": 0.24462890625, "learning_rate": 9.617414248021107e-07, "loss": 0.012, "num_tokens": 32037541.0, "reward": 1.1796875, "reward_std": 0.34712791442871094, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.45178256928920746, "rewards/counterfactual_reasoning_reward/mean": 0.125, "rewards/counterfactual_reasoning_reward/std": 0.3212462291121483, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.5, "completions/max_terminated_length": 697.5, "completions/mean_length": 194.203125, "completions/mean_terminated_length": 194.203125, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.07778510217534608, "grad_norm": 1.9350131750106812, "kl": 0.17724609375, "learning_rate": 9.610817941952506e-07, "loss": -0.0079, "num_tokens": 32587201.0, "reward": 1.1640625, "reward_std": 0.372851625084877, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.462014764547348, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.40346992015838623, "rewards/multiturn_format_reward/mean": 0.890625, "rewards/multiturn_format_reward/std": 0.31607766449451447, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.5, "completions/max_terminated_length": 310.5, "completions/mean_length": 167.484375, "completions/mean_terminated_length": 167.484375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.07910349373764008, "grad_norm": 2.485581159591675, "kl": 0.15869140625, "learning_rate": 9.604221635883904e-07, "loss": 0.0008, "num_tokens": 33146509.0, "reward": 1.1875, "reward_std": 0.39617881178855896, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49527959525585175, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40442168712615967, "rewards/multiturn_format_reward/mean": 0.875, "rewards/multiturn_format_reward/std": 0.33601075410842896, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.5, "completions/max_terminated_length": 534.5, "completions/mean_length": 196.203125, "completions/mean_terminated_length": 196.203125, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.08042188529993408, "grad_norm": 2.723259687423706, "kl": 0.14501953125, "learning_rate": 9.597625329815303e-07, "loss": -0.0393, "num_tokens": 33715783.0, "reward": 1.171875, "reward_std": 0.3805767893791199, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.09375, "rewards/counterfactual_reasoning_reward/std": 0.27283935993909836, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.2364606335759163, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 153.296875, "completions/mean_terminated_length": 153.296875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.08174027686222808, "grad_norm": 1.8293803930282593, "kl": 0.22119140625, "learning_rate": 9.591029023746702e-07, "loss": -0.0282, "num_tokens": 34237544.0, "reward": 1.4609375, "reward_std": 0.3387679308652878, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.5, "completions/max_terminated_length": 389.5, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 99.5, "completions/min_terminated_length": 99.5, "epoch": 0.08305866842452209, "grad_norm": 3.4177629947662354, "kl": 0.17041015625, "learning_rate": 9.584432717678101e-07, "loss": 0.0087, "num_tokens": 34770698.0, "reward": 1.4140625, "reward_std": 0.3969632536172867, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.495430126786232, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 178.28125, "completions/mean_terminated_length": 178.28125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.08437705998681608, "grad_norm": 2.242076873779297, "kl": 0.1669921875, "learning_rate": 9.577836411609498e-07, "loss": 0.0008, "num_tokens": 35321566.0, "reward": 1.453125, "reward_std": 0.3653489500284195, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.08569545154911008, "grad_norm": 1.3877933025360107, "kl": 0.18408203125, "learning_rate": 9.571240105540898e-07, "loss": 0.0038, "num_tokens": 35884378.0, "reward": 1.3359375, "reward_std": 0.32178717851638794, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45128606259822845, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.08701384311140409, "grad_norm": 2.5920510292053223, "kl": 0.17431640625, "learning_rate": 9.564643799472295e-07, "loss": 0.0009, "num_tokens": 36456916.0, "reward": 1.4296875, "reward_std": 0.46222594380378723, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 205.40625, "completions/mean_terminated_length": 205.40625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.08833223467369809, "grad_norm": 657.5914306640625, "kl": 47.078125, "learning_rate": 9.558047493403694e-07, "loss": 0.2603, "num_tokens": 36992654.0, "reward": 1.0, "reward_std": 0.3237132579088211, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33252330124378204, "rewards/counterfactual_reasoning_reward/mean": 0.03125, "rewards/counterfactual_reasoning_reward/std": 0.1767766922712326, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 188.21875, "completions/mean_terminated_length": 188.21875, "completions/min_length": 100.5, "completions/min_terminated_length": 100.5, "epoch": 0.08965062623599208, "grad_norm": 2.4956626892089844, "kl": 0.48974609375, "learning_rate": 9.551451187335093e-07, "loss": 0.022, "num_tokens": 37514222.0, "reward": 1.2421875, "reward_std": 0.3903527110815048, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.39445772767066956, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.2364606335759163, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.5, "completions/max_terminated_length": 365.5, "completions/mean_length": 157.84375, "completions/mean_terminated_length": 157.84375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.09096901779828609, "grad_norm": 2.098158836364746, "kl": 0.181640625, "learning_rate": 9.54485488126649e-07, "loss": -0.0147, "num_tokens": 38059192.0, "reward": 1.3125, "reward_std": 0.4038945585489273, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4337434321641922, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 91.5, "completions/min_terminated_length": 91.5, "epoch": 0.0922874093605801, "grad_norm": 1.980433464050293, "kl": 0.193359375, "learning_rate": 9.53825857519789e-07, "loss": -0.0029, "num_tokens": 38601027.0, "reward": 1.359375, "reward_std": 0.36802828311920166, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.43840841948986053, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.5, "completions/max_terminated_length": 423.5, "completions/mean_length": 161.71875, "completions/mean_terminated_length": 161.71875, "completions/min_length": 98.5, "completions/min_terminated_length": 98.5, "epoch": 0.0936058009228741, "grad_norm": 2.9033100605010986, "kl": 0.197265625, "learning_rate": 9.531662269129286e-07, "loss": -0.0283, "num_tokens": 39171865.0, "reward": 1.2890625, "reward_std": 0.4322565943002701, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 157.5625, "completions/min_length": 104.5, "completions/min_terminated_length": 104.5, "epoch": 0.09492419248516809, "grad_norm": 1.6745362281799316, "kl": 0.21923828125, "learning_rate": 9.525065963060686e-07, "loss": 0.0021, "num_tokens": 39711078.0, "reward": 1.3359375, "reward_std": 0.3509994447231293, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.3642466887831688, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.5, "completions/max_terminated_length": 297.5, "completions/mean_length": 174.265625, "completions/mean_terminated_length": 174.265625, "completions/min_length": 109.5, "completions/min_terminated_length": 109.5, "epoch": 0.0962425840474621, "grad_norm": 1.0311698913574219, "kl": 0.17626953125, "learning_rate": 9.518469656992084e-07, "loss": 0.0048, "num_tokens": 40242966.0, "reward": 1.1875, "reward_std": 0.2444262057542801, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.4339464604854584, "rewards/counterfactual_reasoning_reward/mean": 0.125, "rewards/counterfactual_reasoning_reward/std": 0.3212462291121483, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.5, "completions/max_terminated_length": 433.5, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 85.5, "completions/min_terminated_length": 85.5, "epoch": 0.0975609756097561, "grad_norm": 3.056715488433838, "kl": 0.20703125, "learning_rate": 9.511873350923483e-07, "loss": 0.0001, "num_tokens": 40796920.0, "reward": 1.34375, "reward_std": 0.37382712215185165, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45128606259822845, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 177.96875, "completions/mean_terminated_length": 177.96875, "completions/min_length": 103.5, "completions/min_terminated_length": 103.5, "epoch": 0.09887936717205009, "grad_norm": 41.732666015625, "kl": 2.5458984375, "learning_rate": 9.505277044854881e-07, "loss": 0.0127, "num_tokens": 41359213.0, "reward": 1.375, "reward_std": 0.36260873079299927, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.44777433574199677, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 150.828125, "completions/mean_terminated_length": 150.828125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.1001977587343441, "grad_norm": 1.838882327079773, "kl": 0.2060546875, "learning_rate": 9.498680738786279e-07, "loss": 0.001, "num_tokens": 41912512.0, "reward": 1.421875, "reward_std": 0.4327230453491211, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.45178256928920746, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 172.34375, "completions/mean_terminated_length": 172.34375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1015161502966381, "grad_norm": 4.647863388061523, "kl": 0.18896484375, "learning_rate": 9.492084432717677e-07, "loss": -0.0225, "num_tokens": 42454885.0, "reward": 1.1953125, "reward_std": 0.36529654264450073, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.4559413939714432, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.376473993062973, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 158.03125, "completions/mean_terminated_length": 158.03125, "completions/min_length": 99.5, "completions/min_terminated_length": 99.5, "epoch": 0.10283454185893211, "grad_norm": 1.9620180130004883, "kl": 0.28125, "learning_rate": 9.485488126649076e-07, "loss": 0.02, "num_tokens": 42992044.0, "reward": 1.21875, "reward_std": 0.4943290650844574, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.43840841948986053, "rewards/multiturn_format_reward/mean": 0.90625, "rewards/multiturn_format_reward/std": 0.2961445748806, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 156.578125, "completions/mean_terminated_length": 156.578125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.1041529334212261, "grad_norm": 1.869486689567566, "kl": 0.22021484375, "learning_rate": 9.478891820580475e-07, "loss": -0.0175, "num_tokens": 43514544.0, "reward": 1.2890625, "reward_std": 0.3031915947794914, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.3662842661142349, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.5, "completions/max_terminated_length": 372.5, "completions/mean_length": 170.46875, "completions/mean_terminated_length": 170.46875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.1054713249835201, "grad_norm": 11.947239875793457, "kl": 0.22802734375, "learning_rate": 9.472295514511873e-07, "loss": -0.0272, "num_tokens": 44064951.0, "reward": 1.265625, "reward_std": 0.21929628774523735, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4709290862083435, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.41824956238269806, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.5, "completions/max_terminated_length": 764.5, "completions/mean_length": 192.15625, "completions/mean_terminated_length": 192.15625, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "epoch": 0.10678971654581411, "grad_norm": 2.767385244369507, "kl": 0.19873046875, "learning_rate": 9.465699208443272e-07, "loss": -0.0195, "num_tokens": 44644568.0, "reward": 1.375, "reward_std": 0.33226732909679413, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.5, "completions/max_terminated_length": 338.5, "completions/mean_length": 182.59375, "completions/mean_terminated_length": 182.59375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.10810810810810811, "grad_norm": 2.077484369277954, "kl": 0.203125, "learning_rate": 9.459102902374669e-07, "loss": 0.001, "num_tokens": 45193709.0, "reward": 1.3515625, "reward_std": 0.34031008183956146, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45128606259822845, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 181.53125, "completions/mean_terminated_length": 181.53125, "completions/min_length": 93.5, "completions/min_terminated_length": 93.5, "epoch": 0.1094264996704021, "grad_norm": 2.115658760070801, "kl": 0.1962890625, "learning_rate": 9.452506596306067e-07, "loss": 0.001, "num_tokens": 45755817.0, "reward": 1.2890625, "reward_std": 0.4008282870054245, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4739709198474884, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.4175008237361908, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 156.359375, "completions/mean_terminated_length": 156.359375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.11074489123269611, "grad_norm": 2.8160693645477295, "kl": 0.22412109375, "learning_rate": 9.445910290237467e-07, "loss": 0.0011, "num_tokens": 46293580.0, "reward": 1.4296875, "reward_std": 0.452579602599144, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.5, "completions/max_terminated_length": 300.5, "completions/mean_length": 151.828125, "completions/mean_terminated_length": 151.828125, "completions/min_length": 101.5, "completions/min_terminated_length": 101.5, "epoch": 0.11206328279499012, "grad_norm": 2.063642740249634, "kl": 0.236328125, "learning_rate": 9.439313984168865e-07, "loss": 0.007, "num_tokens": 46812677.0, "reward": 1.390625, "reward_std": 0.3552626073360443, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.45227913558483124, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.5, "completions/max_terminated_length": 383.5, "completions/mean_length": 167.984375, "completions/mean_terminated_length": 167.984375, "completions/min_length": 104.5, "completions/min_terminated_length": 104.5, "epoch": 0.1133816743572841, "grad_norm": 2.148953914642334, "kl": 0.20703125, "learning_rate": 9.432717678100264e-07, "loss": 0.0206, "num_tokens": 47347191.0, "reward": 1.2421875, "reward_std": 0.4054103195667267, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.3965577781200409, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.5, "completions/max_terminated_length": 297.5, "completions/mean_length": 161.734375, "completions/mean_terminated_length": 161.734375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.11470006591957811, "grad_norm": 2.2662434577941895, "kl": 0.20947265625, "learning_rate": 9.426121372031662e-07, "loss": 0.001, "num_tokens": 47885175.0, "reward": 1.40625, "reward_std": 0.38426627218723297, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.5, "completions/max_terminated_length": 420.5, "completions/mean_length": 166.046875, "completions/mean_terminated_length": 166.046875, "completions/min_length": 100.5, "completions/min_terminated_length": 100.5, "epoch": 0.11601845748187212, "grad_norm": 1.5629639625549316, "kl": 0.2080078125, "learning_rate": 9.41952506596306e-07, "loss": -0.0439, "num_tokens": 48451401.0, "reward": 1.296875, "reward_std": 0.30405670404434204, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40442168712615967, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 178.65625, "completions/mean_terminated_length": 178.65625, "completions/min_length": 90.5, "completions/min_terminated_length": 90.5, "epoch": 0.11733684904416612, "grad_norm": 2.569065570831299, "kl": 0.2646484375, "learning_rate": 9.412928759894458e-07, "loss": 0.0062, "num_tokens": 48998024.0, "reward": 1.296875, "reward_std": 0.4080469310283661, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.5, "completions/max_terminated_length": 254.5, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 55.5, "completions/min_terminated_length": 55.5, "epoch": 0.11865524060646011, "grad_norm": 4.5748724937438965, "kl": 0.21630859375, "learning_rate": 9.406332453825857e-07, "loss": 0.006, "num_tokens": 49558261.0, "reward": 1.421875, "reward_std": 0.26423706114292145, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.39445772767066956, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.401575967669487, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.11997363216875412, "grad_norm": 2.1579906940460205, "kl": 0.2548828125, "learning_rate": 9.399736147757256e-07, "loss": 0.056, "num_tokens": 50123023.0, "reward": 1.28125, "reward_std": 0.21878967434167862, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4364590644836426, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.3403963968157768, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 144.765625, "completions/mean_terminated_length": 144.765625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.12129202373104812, "grad_norm": 1.993112564086914, "kl": 0.24072265625, "learning_rate": 9.393139841688654e-07, "loss": 0.0012, "num_tokens": 50661699.0, "reward": 1.3984375, "reward_std": 0.3709433525800705, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.4825586974620819, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4020725339651108, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 145.453125, "completions/mean_terminated_length": 145.453125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.12261041529334213, "grad_norm": 1.7775267362594604, "kl": 0.21142578125, "learning_rate": 9.386543535620053e-07, "loss": 0.003, "num_tokens": 51226884.0, "reward": 1.640625, "reward_std": 0.35847391188144684, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.43038569390773773, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.5, "completions/max_terminated_length": 295.5, "completions/mean_length": 148.921875, "completions/mean_terminated_length": 148.921875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.12392880685563612, "grad_norm": 5.47065544128418, "kl": 0.2412109375, "learning_rate": 9.37994722955145e-07, "loss": -0.0252, "num_tokens": 51768161.0, "reward": 1.3828125, "reward_std": 0.28044888377189636, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.504016101360321, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 139.015625, "completions/mean_terminated_length": 139.015625, "completions/min_length": 91.5, "completions/min_terminated_length": 91.5, "epoch": 0.12524719841793014, "grad_norm": 2.923825740814209, "kl": 0.2421875, "learning_rate": 9.373350923482848e-07, "loss": -0.0154, "num_tokens": 52282322.0, "reward": 1.453125, "reward_std": 0.33986949920654297, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.4299773871898651, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4395582377910614, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.5, "completions/max_terminated_length": 370.5, "completions/mean_length": 147.234375, "completions/mean_terminated_length": 147.234375, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.12656558998022413, "grad_norm": 32.3944091796875, "kl": 0.26171875, "learning_rate": 9.366754617414248e-07, "loss": 0.0228, "num_tokens": 52818061.0, "reward": 1.3125, "reward_std": 0.2620321437716484, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.41824956238269806, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 151.96875, "completions/mean_terminated_length": 151.96875, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "epoch": 0.12788398154251812, "grad_norm": 2.4956164360046387, "kl": 0.2509765625, "learning_rate": 9.360158311345646e-07, "loss": -0.0075, "num_tokens": 53355045.0, "reward": 1.4140625, "reward_std": 0.3327432721853256, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.4175008237361908, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.5, "completions/max_terminated_length": 265.5, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 97.5, "completions/min_terminated_length": 97.5, "epoch": 0.12920237310481214, "grad_norm": 4.393935203552246, "kl": 0.2294921875, "learning_rate": 9.353562005277045e-07, "loss": 0.0099, "num_tokens": 53904487.0, "reward": 1.40625, "reward_std": 0.31193146109580994, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4442135691642761, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.5, "completions/max_terminated_length": 326.5, "completions/mean_length": 150.6875, "completions/mean_terminated_length": 150.6875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.13052076466710613, "grad_norm": 2.7752931118011475, "kl": 0.255859375, "learning_rate": 9.346965699208443e-07, "loss": -0.0202, "num_tokens": 54484053.0, "reward": 1.4140625, "reward_std": 0.2552312836050987, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47789715230464935, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 148.015625, "completions/mean_terminated_length": 148.015625, "completions/min_length": 95.5, "completions/min_terminated_length": 95.5, "epoch": 0.13183915622940012, "grad_norm": 4.477982997894287, "kl": 0.2109375, "learning_rate": 9.340369393139841e-07, "loss": 0.0118, "num_tokens": 55016688.0, "reward": 1.4921875, "reward_std": 0.3883073627948761, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.5, "completions/max_terminated_length": 287.5, "completions/mean_length": 171.875, "completions/mean_terminated_length": 171.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.13315754779169414, "grad_norm": 1.6916027069091797, "kl": 0.2109375, "learning_rate": 9.33377308707124e-07, "loss": -0.0243, "num_tokens": 55576740.0, "reward": 1.1796875, "reward_std": 0.24377765506505966, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45128606259822845, "rewards/counterfactual_reasoning_reward/mean": 0.109375, "rewards/counterfactual_reasoning_reward/std": 0.28666723519563675, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.5, "completions/max_terminated_length": 414.5, "completions/mean_length": 150.484375, "completions/mean_terminated_length": 150.484375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.13447593935398813, "grad_norm": 1.9229685068130493, "kl": 0.2890625, "learning_rate": 9.327176781002638e-07, "loss": -0.0073, "num_tokens": 56142418.0, "reward": 1.3046875, "reward_std": 0.42660292983055115, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.47789715230464935, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.43038569390773773, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 138.140625, "completions/mean_terminated_length": 138.140625, "completions/min_length": 71.5, "completions/min_terminated_length": 71.5, "epoch": 0.13579433091628212, "grad_norm": 2.4934961795806885, "kl": 0.2275390625, "learning_rate": 9.320580474934037e-07, "loss": 0.0158, "num_tokens": 56696134.0, "reward": 1.4453125, "reward_std": 0.3444022089242935, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.4128527194261551, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.5, "completions/max_terminated_length": 234.5, "completions/mean_length": 137.171875, "completions/mean_terminated_length": 137.171875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.13711272247857614, "grad_norm": 1.6206505298614502, "kl": 0.2607421875, "learning_rate": 9.313984168865435e-07, "loss": -0.0378, "num_tokens": 57261306.0, "reward": 1.515625, "reward_std": 0.18604277074337006, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 147.328125, "completions/mean_terminated_length": 147.328125, "completions/min_length": 91.5, "completions/min_terminated_length": 91.5, "epoch": 0.13843111404087013, "grad_norm": 1.8797717094421387, "kl": 0.27587890625, "learning_rate": 9.307387862796834e-07, "loss": 0.0024, "num_tokens": 57830124.0, "reward": 1.4921875, "reward_std": 0.3822478652000427, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.5, "completions/max_terminated_length": 273.5, "completions/mean_length": 138.1875, "completions/mean_terminated_length": 138.1875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.13974950560316415, "grad_norm": 2.7589833736419678, "kl": 0.24853515625, "learning_rate": 9.300791556728231e-07, "loss": -0.0251, "num_tokens": 58375184.0, "reward": 1.2890625, "reward_std": 0.3103053718805313, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.3680429607629776, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 142.359375, "completions/mean_terminated_length": 142.359375, "completions/min_length": 81.5, "completions/min_terminated_length": 81.5, "epoch": 0.14106789716545814, "grad_norm": 1.6963226795196533, "kl": 0.29248046875, "learning_rate": 9.294195250659629e-07, "loss": 0.0249, "num_tokens": 58949303.0, "reward": 1.3515625, "reward_std": 0.3363552838563919, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49527959525585175, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45543521642684937, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.5, "completions/max_terminated_length": 225.5, "completions/mean_length": 134.90625, "completions/mean_terminated_length": 134.90625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.14238628872775214, "grad_norm": 4.325846195220947, "kl": 0.2568359375, "learning_rate": 9.287598944591029e-07, "loss": 0.0111, "num_tokens": 59513014.0, "reward": 1.3828125, "reward_std": 0.353815421462059, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4709290862083435, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 140.328125, "completions/mean_terminated_length": 140.328125, "completions/min_length": 77.5, "completions/min_terminated_length": 77.5, "epoch": 0.14370468029004616, "grad_norm": 2.0975656509399414, "kl": 0.25048828125, "learning_rate": 9.281002638522427e-07, "loss": 0.0013, "num_tokens": 60063889.0, "reward": 1.4609375, "reward_std": 0.3775136321783066, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 150.671875, "completions/mean_terminated_length": 150.671875, "completions/min_length": 74.5, "completions/min_terminated_length": 74.5, "epoch": 0.14502307185234015, "grad_norm": 3.872466802597046, "kl": 0.2548828125, "learning_rate": 9.274406332453826e-07, "loss": -0.0026, "num_tokens": 60624348.0, "reward": 1.40625, "reward_std": 0.22775823436677456, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.469681054353714, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47197872400283813, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 143.890625, "completions/mean_terminated_length": 143.890625, "completions/min_length": 79.5, "completions/min_terminated_length": 79.5, "epoch": 0.14634146341463414, "grad_norm": 1.6275092363357544, "kl": 0.248046875, "learning_rate": 9.267810026385224e-07, "loss": -0.0105, "num_tokens": 61200154.0, "reward": 1.28125, "reward_std": 0.30286262929439545, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.378012090921402, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 150.84375, "completions/mean_terminated_length": 150.84375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.14765985497692816, "grad_norm": 1.7320324182510376, "kl": 0.2490234375, "learning_rate": 9.261213720316622e-07, "loss": 0.0012, "num_tokens": 61744728.0, "reward": 1.453125, "reward_std": 0.32766495645046234, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.5, "completions/max_terminated_length": 257.5, "completions/mean_length": 156.890625, "completions/mean_terminated_length": 156.890625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.14897824653922215, "grad_norm": 2.0359268188476562, "kl": 0.24853515625, "learning_rate": 9.25461741424802e-07, "loss": 0.0012, "num_tokens": 62310831.0, "reward": 1.265625, "reward_std": 0.33568963408470154, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.3827299028635025, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 156.921875, "completions/mean_terminated_length": 156.921875, "completions/min_length": 78.5, "completions/min_terminated_length": 78.5, "epoch": 0.15029663810151614, "grad_norm": 1.8079997301101685, "kl": 0.2783203125, "learning_rate": 9.248021108179419e-07, "loss": -0.0015, "num_tokens": 62869627.0, "reward": 1.5234375, "reward_std": 0.4489366114139557, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.469681054353714, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 145.40625, "completions/mean_terminated_length": 145.40625, "completions/min_length": 100.5, "completions/min_terminated_length": 100.5, "epoch": 0.15161502966381016, "grad_norm": 2.559492826461792, "kl": 0.22607421875, "learning_rate": 9.241424802110818e-07, "loss": 0.0148, "num_tokens": 63421447.0, "reward": 1.4375, "reward_std": 0.2738732397556305, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.5, "completions/max_terminated_length": 288.5, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 75.5, "completions/min_terminated_length": 75.5, "epoch": 0.15293342122610415, "grad_norm": 2.4074440002441406, "kl": 0.27099609375, "learning_rate": 9.234828496042216e-07, "loss": -0.0016, "num_tokens": 63978361.0, "reward": 1.421875, "reward_std": 0.20788131654262543, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.5, "completions/max_terminated_length": 351.5, "completions/mean_length": 180.03125, "completions/mean_terminated_length": 180.03125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.15425181278839814, "grad_norm": 1.6552907228469849, "kl": 0.33251953125, "learning_rate": 9.228232189973615e-07, "loss": -0.0003, "num_tokens": 64530674.0, "reward": 1.3359375, "reward_std": 0.2735481858253479, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.38935163617134094, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.5, "completions/max_terminated_length": 248.5, "completions/mean_length": 148.78125, "completions/mean_terminated_length": 148.78125, "completions/min_length": 68.5, "completions/min_terminated_length": 68.5, "epoch": 0.15557020435069216, "grad_norm": 3.195235013961792, "kl": 0.3232421875, "learning_rate": 9.221635883905012e-07, "loss": -0.0013, "num_tokens": 65041400.0, "reward": 1.40625, "reward_std": 0.2692670002579689, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4399413466453552, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 162.28125, "completions/mean_terminated_length": 162.28125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.15688859591298615, "grad_norm": 1.224960446357727, "kl": 0.23974609375, "learning_rate": 9.21503957783641e-07, "loss": -0.0095, "num_tokens": 65598079.0, "reward": 1.2890625, "reward_std": 0.2273416668176651, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.376473993062973, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 159.96875, "completions/mean_terminated_length": 159.96875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.15820698747528017, "grad_norm": 2.1439692974090576, "kl": 0.314453125, "learning_rate": 9.20844327176781e-07, "loss": 0.0094, "num_tokens": 66089822.0, "reward": 1.3828125, "reward_std": 0.22926432639360428, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.45227913558483124, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.15952537903757416, "grad_norm": 1.2554386854171753, "kl": 0.25732421875, "learning_rate": 9.201846965699208e-07, "loss": 0.0013, "num_tokens": 66616168.0, "reward": 1.40625, "reward_std": 0.2863292396068573, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45543521642684937, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 164.890625, "completions/mean_terminated_length": 164.890625, "completions/min_length": 102.5, "completions/min_terminated_length": 102.5, "epoch": 0.16084377059986815, "grad_norm": 1.9963996410369873, "kl": 0.2607421875, "learning_rate": 9.195250659630607e-07, "loss": 0.0013, "num_tokens": 67149149.0, "reward": 1.2734375, "reward_std": 0.3347322940826416, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4739709198474884, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.32385288923978806, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 165.546875, "completions/mean_terminated_length": 165.546875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.16216216216216217, "grad_norm": 1.811282992362976, "kl": 0.23828125, "learning_rate": 9.188654353562005e-07, "loss": -0.0066, "num_tokens": 67710783.0, "reward": 1.5078125, "reward_std": 0.4414493590593338, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.5034956932067871, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.5, "completions/max_terminated_length": 356.5, "completions/mean_length": 168.703125, "completions/mean_terminated_length": 168.703125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.16348055372445616, "grad_norm": 1.5715442895889282, "kl": 0.2255859375, "learning_rate": 9.182058047493403e-07, "loss": -0.0204, "num_tokens": 68216821.0, "reward": 1.4609375, "reward_std": 0.23485340178012848, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 145.71875, "completions/mean_terminated_length": 145.71875, "completions/min_length": 102.5, "completions/min_terminated_length": 102.5, "epoch": 0.16479894528675015, "grad_norm": 8.180221557617188, "kl": 0.6767578125, "learning_rate": 9.175461741424802e-07, "loss": 0.0327, "num_tokens": 68770783.0, "reward": 1.328125, "reward_std": 0.38627080619335175, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.5, "completions/max_terminated_length": 257.5, "completions/mean_length": 165.421875, "completions/mean_terminated_length": 165.421875, "completions/min_length": 87.5, "completions/min_terminated_length": 87.5, "epoch": 0.16611733684904417, "grad_norm": 1.1742225885391235, "kl": 0.236328125, "learning_rate": 9.1688654353562e-07, "loss": 0.0022, "num_tokens": 69327036.0, "reward": 1.4609375, "reward_std": 0.1938823163509369, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.46125002205371857, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.41394005715847015, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 169.90625, "completions/mean_terminated_length": 169.90625, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.16743572841133816, "grad_norm": 11.997087478637695, "kl": 0.22265625, "learning_rate": 9.162269129287599e-07, "loss": -0.0301, "num_tokens": 69862025.0, "reward": 1.6484375, "reward_std": 0.2757682651281357, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4337434321641922, "rewards/counterfactual_reasoning_reward/mean": 0.546875, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 160.90625, "completions/mean_terminated_length": 160.90625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.16875411997363216, "grad_norm": 2.9977526664733887, "kl": 0.22314453125, "learning_rate": 9.155672823218997e-07, "loss": 0.0011, "num_tokens": 70413992.0, "reward": 1.359375, "reward_std": 0.3529205620288849, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.3964070826768875, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 150.109375, "completions/mean_terminated_length": 150.109375, "completions/min_length": 73.5, "completions/min_terminated_length": 73.5, "epoch": 0.17007251153592617, "grad_norm": 2.070298194885254, "kl": 0.26806640625, "learning_rate": 9.149076517150396e-07, "loss": -0.0094, "num_tokens": 70972329.0, "reward": 1.4453125, "reward_std": 0.2773820757865906, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.5, "completions/max_terminated_length": 291.5, "completions/mean_length": 164.515625, "completions/mean_terminated_length": 164.515625, "completions/min_length": 96.5, "completions/min_terminated_length": 96.5, "epoch": 0.17139090309822017, "grad_norm": 2.0071654319763184, "kl": 0.23046875, "learning_rate": 9.142480211081793e-07, "loss": 0.0012, "num_tokens": 71521867.0, "reward": 1.3125, "reward_std": 0.3999403864145279, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.5, "completions/max_terminated_length": 351.5, "completions/mean_length": 183.171875, "completions/mean_terminated_length": 183.171875, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "epoch": 0.17270929466051418, "grad_norm": 1.7305384874343872, "kl": 0.251953125, "learning_rate": 9.135883905013191e-07, "loss": 0.0335, "num_tokens": 72068399.0, "reward": 1.2578125, "reward_std": 0.16200191527605057, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.3879760503768921, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 156.984375, "completions/mean_terminated_length": 156.984375, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "epoch": 0.17402768622280818, "grad_norm": 5.096155166625977, "kl": 0.2216796875, "learning_rate": 9.129287598944591e-07, "loss": -0.0165, "num_tokens": 72606107.0, "reward": 1.2734375, "reward_std": 0.2569843828678131, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.41824956238269806, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 166.96875, "completions/mean_terminated_length": 166.96875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.17534607778510217, "grad_norm": 1.6386364698410034, "kl": 0.2587890625, "learning_rate": 9.122691292875989e-07, "loss": 0.0111, "num_tokens": 73163814.0, "reward": 1.4609375, "reward_std": 0.31999707967042923, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.45227913558483124, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 175.046875, "completions/mean_terminated_length": 175.046875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.17666446934739619, "grad_norm": 1.0852595567703247, "kl": 0.22265625, "learning_rate": 9.116094986807388e-07, "loss": 0.0001, "num_tokens": 73679342.0, "reward": 1.2890625, "reward_std": 0.22324800491333008, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.38353683054447174, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.24593468010425568, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.5, "completions/max_terminated_length": 323.5, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.17798286090969018, "grad_norm": 1.7038899660110474, "kl": 0.20654296875, "learning_rate": 9.109498680738786e-07, "loss": -0.0156, "num_tokens": 74238935.0, "reward": 1.390625, "reward_std": 0.3882310390472412, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.48139922320842743, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.45227913558483124, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.5, "completions/max_terminated_length": 337.5, "completions/mean_length": 181.390625, "completions/mean_terminated_length": 181.390625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.17930125247198417, "grad_norm": 1.85861337184906, "kl": 0.2509765625, "learning_rate": 9.102902374670183e-07, "loss": 0.0139, "num_tokens": 74784916.0, "reward": 1.46875, "reward_std": 0.29167112708091736, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4824019521474838, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.1806196440342782, "grad_norm": 2.8721487522125244, "kl": 0.29931640625, "learning_rate": 9.096306068601583e-07, "loss": 0.0015, "num_tokens": 75326660.0, "reward": 1.4921875, "reward_std": 0.346216082572937, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 200.765625, "completions/mean_terminated_length": 200.765625, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.18193803559657218, "grad_norm": 1.7294589281082153, "kl": 0.23095703125, "learning_rate": 9.089709762532981e-07, "loss": -0.0096, "num_tokens": 75868151.0, "reward": 1.5, "reward_std": 0.38455937802791595, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 187.640625, "completions/mean_terminated_length": 187.640625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.18325642715886617, "grad_norm": 1.2517303228378296, "kl": 0.2392578125, "learning_rate": 9.08311345646438e-07, "loss": -0.0232, "num_tokens": 76393021.0, "reward": 1.4453125, "reward_std": 0.37452907860279083, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 205.03125, "completions/mean_terminated_length": 205.03125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1845748187211602, "grad_norm": 1.4534850120544434, "kl": 0.22705078125, "learning_rate": 9.076517150395778e-07, "loss": 0.0343, "num_tokens": 76936933.0, "reward": 1.3984375, "reward_std": 0.36516137421131134, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.5, "completions/max_terminated_length": 415.5, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 108.5, "completions/min_terminated_length": 108.5, "epoch": 0.18589321028345418, "grad_norm": 1.5091770887374878, "kl": 0.33642578125, "learning_rate": 9.069920844327177e-07, "loss": -0.0257, "num_tokens": 77469982.0, "reward": 1.40625, "reward_std": 0.3921046406030655, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49527959525585175, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.46125002205371857, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 197.15625, "completions/mean_terminated_length": 197.15625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1872116018457482, "grad_norm": 2.4705917835235596, "kl": 0.234375, "learning_rate": 9.063324538258574e-07, "loss": -0.0115, "num_tokens": 78013377.0, "reward": 1.3203125, "reward_std": 0.36527004837989807, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.46125002205371857, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.1885299934080422, "grad_norm": 1.8946279287338257, "kl": 0.2333984375, "learning_rate": 9.056728232189973e-07, "loss": 0.0285, "num_tokens": 78597602.0, "reward": 1.546875, "reward_std": 0.2971753776073456, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4659053534269333, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.5, "completions/max_terminated_length": 413.5, "completions/mean_length": 193.828125, "completions/mean_terminated_length": 193.828125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.18984838497033618, "grad_norm": 1.1943597793579102, "kl": 0.23486328125, "learning_rate": 9.050131926121372e-07, "loss": 0.0227, "num_tokens": 79126277.0, "reward": 1.4921875, "reward_std": 0.16588576138019562, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.469681054353714, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.5, "completions/max_terminated_length": 362.5, "completions/mean_length": 181.46875, "completions/mean_terminated_length": 181.46875, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "epoch": 0.1911667765326302, "grad_norm": 1.6528741121292114, "kl": 0.25390625, "learning_rate": 9.04353562005277e-07, "loss": 0.0013, "num_tokens": 79661842.0, "reward": 1.5625, "reward_std": 0.3215447664260864, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.5, "completions/max_terminated_length": 303.5, "completions/mean_length": 166.984375, "completions/mean_terminated_length": 166.984375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1924851680949242, "grad_norm": 1.4309096336364746, "kl": 0.22900390625, "learning_rate": 9.036939313984169e-07, "loss": 0.0011, "num_tokens": 80210823.0, "reward": 1.546875, "reward_std": 0.3511880785226822, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.495430126786232, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 183.234375, "completions/mean_terminated_length": 183.234375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.19380355965721818, "grad_norm": 1.469364047050476, "kl": 0.29345703125, "learning_rate": 9.030343007915567e-07, "loss": 0.0015, "num_tokens": 80783434.0, "reward": 1.3515625, "reward_std": 0.42973293364048004, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 186.890625, "completions/mean_terminated_length": 186.890625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1951219512195122, "grad_norm": 1.1849377155303955, "kl": 0.24462890625, "learning_rate": 9.023746701846964e-07, "loss": -0.0076, "num_tokens": 81341667.0, "reward": 1.4609375, "reward_std": 0.27174006402492523, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47789715230464935, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.5, "completions/max_terminated_length": 295.5, "completions/mean_length": 178.203125, "completions/mean_terminated_length": 178.203125, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.1964403427818062, "grad_norm": 8.118152618408203, "kl": 1.310546875, "learning_rate": 9.017150395778364e-07, "loss": -0.0013, "num_tokens": 81873396.0, "reward": 1.3046875, "reward_std": 0.21693426929414272, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.19775873434410018, "grad_norm": 2.6720142364501953, "kl": 0.32958984375, "learning_rate": 9.010554089709762e-07, "loss": 0.0016, "num_tokens": 82419880.0, "reward": 1.484375, "reward_std": 0.38933973014354706, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 190.46875, "completions/mean_terminated_length": 190.46875, "completions/min_length": 118.5, "completions/min_terminated_length": 118.5, "epoch": 0.1990771259063942, "grad_norm": 1.2013498544692993, "kl": 0.20263671875, "learning_rate": 9.003957783641161e-07, "loss": 0.001, "num_tokens": 82959793.0, "reward": 1.3046875, "reward_std": 0.37839697301387787, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40442168712615967, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 185.640625, "completions/mean_terminated_length": 185.640625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2003955174686882, "grad_norm": 2.065361738204956, "kl": 0.2041015625, "learning_rate": 8.997361477572559e-07, "loss": 0.0, "num_tokens": 83508787.0, "reward": 1.203125, "reward_std": 0.2776031717658043, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.43845126032829285, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.334323026239872, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.16800537705421448, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.5, "completions/max_terminated_length": 479.5, "completions/mean_length": 210.1875, "completions/mean_terminated_length": 210.1875, "completions/min_length": 115.5, "completions/min_terminated_length": 115.5, "epoch": 0.2017139090309822, "grad_norm": 3.078693389892578, "kl": 0.2822265625, "learning_rate": 8.990765171503958e-07, "loss": -0.0054, "num_tokens": 84053560.0, "reward": 1.21875, "reward_std": 0.37172043323516846, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.125, "rewards/counterfactual_reasoning_reward/std": 0.33252330124378204, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 202.53125, "completions/mean_terminated_length": 202.53125, "completions/min_length": 140.5, "completions/min_terminated_length": 140.5, "epoch": 0.2030323005932762, "grad_norm": 1.5648541450500488, "kl": 0.21337890625, "learning_rate": 8.984168865435355e-07, "loss": -0.0038, "num_tokens": 84592126.0, "reward": 1.3984375, "reward_std": 0.3227352648973465, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.46946612000465393, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.5, "completions/max_terminated_length": 436.5, "completions/mean_length": 190.671875, "completions/mean_terminated_length": 190.671875, "completions/min_length": 133.5, "completions/min_terminated_length": 133.5, "epoch": 0.2043506921555702, "grad_norm": 7.055335998535156, "kl": 0.236328125, "learning_rate": 8.977572559366754e-07, "loss": 0.0275, "num_tokens": 85129388.0, "reward": 1.328125, "reward_std": 0.3497494161128998, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.4266805946826935, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.44777433574199677, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 188.734375, "completions/mean_terminated_length": 188.734375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.20566908371786422, "grad_norm": 3.8892452716827393, "kl": 0.25244140625, "learning_rate": 8.970976253298153e-07, "loss": 0.0315, "num_tokens": 85698555.0, "reward": 1.1953125, "reward_std": 0.3627253919839859, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.469681054353714, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.3827299028635025, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 208.078125, "completions/mean_terminated_length": 208.078125, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "epoch": 0.2069874752801582, "grad_norm": 1.0977294445037842, "kl": 0.22314453125, "learning_rate": 8.964379947229551e-07, "loss": 0.006, "num_tokens": 86278480.0, "reward": 1.3359375, "reward_std": 0.2811931222677231, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.3680429607629776, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2083058668424522, "grad_norm": 3.33601450920105, "kl": 0.259765625, "learning_rate": 8.95778364116095e-07, "loss": 0.0013, "num_tokens": 86857415.0, "reward": 1.40625, "reward_std": 0.3400811702013016, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4339464604854584, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 216.640625, "completions/mean_terminated_length": 216.640625, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "epoch": 0.20962425840474622, "grad_norm": 1.5048441886901855, "kl": 0.21337890625, "learning_rate": 8.951187335092348e-07, "loss": -0.0107, "num_tokens": 87419560.0, "reward": 1.4609375, "reward_std": 0.3317965269088745, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 220.6875, "completions/mean_terminated_length": 220.6875, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.2109426499670402, "grad_norm": 1.5122101306915283, "kl": 0.24462890625, "learning_rate": 8.944591029023745e-07, "loss": 0.0374, "num_tokens": 87977858.0, "reward": 1.2734375, "reward_std": 0.34534919261932373, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4395582377910614, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.41394005715847015, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.2364606335759163, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "epoch": 0.2122610415293342, "grad_norm": 2.241800308227539, "kl": 0.20654296875, "learning_rate": 8.937994722955145e-07, "loss": 0.001, "num_tokens": 88521041.0, "reward": 1.4375, "reward_std": 0.3699793219566345, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48040975630283356, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 224.3125, "completions/mean_terminated_length": 224.3125, "completions/min_length": 120.5, "completions/min_terminated_length": 120.5, "epoch": 0.21357943309162822, "grad_norm": 35.36426544189453, "kl": 3.08642578125, "learning_rate": 8.931398416886543e-07, "loss": 0.0184, "num_tokens": 89084569.0, "reward": 1.3828125, "reward_std": 0.365848183631897, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 212.09375, "completions/mean_terminated_length": 212.09375, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "epoch": 0.2148978246539222, "grad_norm": 3.4261221885681152, "kl": 0.23779296875, "learning_rate": 8.924802110817942e-07, "loss": 0.01, "num_tokens": 89669267.0, "reward": 1.3359375, "reward_std": 0.40647201240062714, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45543521642684937, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.5, "completions/max_terminated_length": 435.5, "completions/mean_length": 240.953125, "completions/mean_terminated_length": 240.953125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.21621621621621623, "grad_norm": 3.4475231170654297, "kl": 0.21435546875, "learning_rate": 8.91820580474934e-07, "loss": -0.0087, "num_tokens": 90186564.0, "reward": 1.25, "reward_std": 0.40746088325977325, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.4299773871898651, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 223.859375, "completions/mean_terminated_length": 223.859375, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.21753460777851022, "grad_norm": 1.3252679109573364, "kl": 0.23291015625, "learning_rate": 8.911609498680739e-07, "loss": 0.0168, "num_tokens": 90759522.0, "reward": 1.3671875, "reward_std": 0.32107701897621155, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 232.078125, "completions/mean_terminated_length": 232.078125, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "epoch": 0.2188529993408042, "grad_norm": 2.1762688159942627, "kl": 0.22119140625, "learning_rate": 8.905013192612136e-07, "loss": -0.0087, "num_tokens": 91331034.0, "reward": 1.4296875, "reward_std": 0.39385056495666504, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 242.234375, "completions/mean_terminated_length": 242.234375, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "epoch": 0.22017139090309823, "grad_norm": 3.937960386276245, "kl": 0.20556640625, "learning_rate": 8.898416886543535e-07, "loss": 0.001, "num_tokens": 91901498.0, "reward": 1.265625, "reward_std": 0.3281840980052948, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4339464604854584, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.376473993062973, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 197.3125, "completions/mean_terminated_length": 197.3125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.22148978246539222, "grad_norm": 2.260918617248535, "kl": 0.2119140625, "learning_rate": 8.891820580474934e-07, "loss": -0.0263, "num_tokens": 92472824.0, "reward": 1.4375, "reward_std": 0.2346404492855072, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.40928472578525543, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 196.828125, "completions/mean_terminated_length": 196.828125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2228081740276862, "grad_norm": 2.0171120166778564, "kl": 0.18896484375, "learning_rate": 8.885224274406332e-07, "loss": 0.0517, "num_tokens": 93024481.0, "reward": 1.5390625, "reward_std": 0.2725583165884018, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.5075039267539978, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.5, "completions/max_terminated_length": 313.5, "completions/mean_length": 201.3125, "completions/mean_terminated_length": 201.3125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.22412656558998023, "grad_norm": 1.9011871814727783, "kl": 0.2314453125, "learning_rate": 8.878627968337731e-07, "loss": 0.0148, "num_tokens": 93581971.0, "reward": 1.3828125, "reward_std": 0.4296386241912842, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.43795469403266907, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "epoch": 0.22544495715227422, "grad_norm": 1.3240770101547241, "kl": 0.15478515625, "learning_rate": 8.872031662269129e-07, "loss": 0.0047, "num_tokens": 94108407.0, "reward": 1.421875, "reward_std": 0.24131912738084793, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5080004930496216, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47197872400283813, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.5, "completions/max_terminated_length": 380.5, "completions/mean_length": 203.703125, "completions/mean_terminated_length": 203.703125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.2267633487145682, "grad_norm": 2.188588857650757, "kl": 0.14599609375, "learning_rate": 8.865435356200526e-07, "loss": -0.0422, "num_tokens": 94690738.0, "reward": 1.40625, "reward_std": 0.21183805912733078, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5075039267539978, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.46125002205371857, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.5, "completions/max_terminated_length": 427.5, "completions/mean_length": 202.90625, "completions/mean_terminated_length": 202.90625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.22808174027686223, "grad_norm": 1.4665859937667847, "kl": 0.13671875, "learning_rate": 8.858839050131926e-07, "loss": 0.0007, "num_tokens": 95260495.0, "reward": 1.515625, "reward_std": 0.39025117456912994, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47197872400283813, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.47197872400283813, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 196.28125, "completions/mean_terminated_length": 196.28125, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "epoch": 0.22940013183915622, "grad_norm": 1.6812487840652466, "kl": 0.14794921875, "learning_rate": 8.852242744063324e-07, "loss": -0.0481, "num_tokens": 95783008.0, "reward": 1.546875, "reward_std": 0.29614754021167755, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.401575967669487, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 204.71875, "completions/mean_terminated_length": 204.71875, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "epoch": 0.23071852340145024, "grad_norm": 1.0249260663986206, "kl": 0.1455078125, "learning_rate": 8.845646437994723e-07, "loss": -0.0022, "num_tokens": 96318118.0, "reward": 1.359375, "reward_std": 0.2857973203063011, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.4825586974620819, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.38935163617134094, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.5, "completions/max_terminated_length": 282.5, "completions/mean_length": 193.328125, "completions/mean_terminated_length": 193.328125, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "epoch": 0.23203691496374423, "grad_norm": 1.6255261898040771, "kl": 0.15478515625, "learning_rate": 8.839050131926121e-07, "loss": -0.0129, "num_tokens": 96860732.0, "reward": 1.4609375, "reward_std": 0.3648538738489151, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 196.328125, "completions/mean_terminated_length": 196.328125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.23335530652603823, "grad_norm": 1.3528594970703125, "kl": 0.14892578125, "learning_rate": 8.83245382585752e-07, "loss": 0.0047, "num_tokens": 97385004.0, "reward": 1.5078125, "reward_std": 0.261700764298439, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.5, "completions/max_terminated_length": 351.5, "completions/mean_length": 197.078125, "completions/mean_terminated_length": 197.078125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.23467369808833224, "grad_norm": 2.8546056747436523, "kl": 0.17724609375, "learning_rate": 8.825857519788917e-07, "loss": -0.0401, "num_tokens": 97972271.0, "reward": 1.296875, "reward_std": 0.3537324219942093, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44547125697135925, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.5, "completions/max_terminated_length": 286.5, "completions/mean_length": 180.40625, "completions/mean_terminated_length": 180.40625, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "epoch": 0.23599208965062624, "grad_norm": 3.726440191268921, "kl": 0.15478515625, "learning_rate": 8.819261213720316e-07, "loss": 0.0008, "num_tokens": 98522376.0, "reward": 1.3828125, "reward_std": 0.4538841098546982, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.44547125697135925, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.43795469403266907, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.24593468010425568, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.5, "completions/max_terminated_length": 326.5, "completions/mean_length": 194.140625, "completions/mean_terminated_length": 194.140625, "completions/min_length": 140.5, "completions/min_terminated_length": 140.5, "epoch": 0.23731048121292023, "grad_norm": 1.4958515167236328, "kl": 0.15234375, "learning_rate": 8.812664907651715e-07, "loss": -0.01, "num_tokens": 99052745.0, "reward": 1.4765625, "reward_std": 0.29508669674396515, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.5, "completions/max_terminated_length": 412.5, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.23862887277521425, "grad_norm": 2.6482203006744385, "kl": 0.1689453125, "learning_rate": 8.806068601583113e-07, "loss": 0.0008, "num_tokens": 99607830.0, "reward": 1.2421875, "reward_std": 0.4510084539651871, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49527959525585175, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.38935163617134094, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.5, "completions/max_terminated_length": 326.5, "completions/mean_length": 177.640625, "completions/mean_terminated_length": 177.640625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.23994726433750824, "grad_norm": 1.107133388519287, "kl": 0.15625, "learning_rate": 8.799472295514512e-07, "loss": 0.0096, "num_tokens": 100140190.0, "reward": 1.4921875, "reward_std": 0.2748822569847107, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48721402883529663, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.5, "completions/max_terminated_length": 313.5, "completions/mean_length": 183.53125, "completions/mean_terminated_length": 183.53125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.24126565589980223, "grad_norm": 1.6874104738235474, "kl": 0.16357421875, "learning_rate": 8.79287598944591e-07, "loss": -0.006, "num_tokens": 100696872.0, "reward": 1.5078125, "reward_std": 0.30022794008255005, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.37246278673410416, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.41394005715847015, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.24258404746209625, "grad_norm": 1.5990190505981445, "kl": 0.18603515625, "learning_rate": 8.786279683377307e-07, "loss": -0.0079, "num_tokens": 101236010.0, "reward": 1.3828125, "reward_std": 0.33912965655326843, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.45028693974018097, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.5, "completions/max_terminated_length": 267.5, "completions/mean_length": 168.03125, "completions/mean_terminated_length": 168.03125, "completions/min_length": 113.5, "completions/min_terminated_length": 113.5, "epoch": 0.24390243902439024, "grad_norm": 2.2649242877960205, "kl": 0.162109375, "learning_rate": 8.779683377308707e-07, "loss": 0.0008, "num_tokens": 101785600.0, "reward": 1.3125, "reward_std": 0.345254123210907, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.48946478962898254, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.425730362534523, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.5, "completions/max_terminated_length": 263.5, "completions/mean_length": 168.78125, "completions/mean_terminated_length": 168.78125, "completions/min_length": 123.5, "completions/min_terminated_length": 123.5, "epoch": 0.24522083058668426, "grad_norm": 2.6250767707824707, "kl": 0.18212890625, "learning_rate": 8.773087071240105e-07, "loss": 0.0146, "num_tokens": 102317997.0, "reward": 1.4453125, "reward_std": 0.3608371168375015, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.47789715230464935, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.5, "completions/max_terminated_length": 256.5, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.24653922214897825, "grad_norm": 3.063750982284546, "kl": 0.18896484375, "learning_rate": 8.766490765171504e-07, "loss": -0.0254, "num_tokens": 102888468.0, "reward": 1.28125, "reward_std": 0.27193254232406616, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.3965577781200409, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 169.3125, "completions/mean_terminated_length": 169.3125, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.24785761371127224, "grad_norm": 1.775459885597229, "kl": 0.1826171875, "learning_rate": 8.759894459102902e-07, "loss": -0.0323, "num_tokens": 103429546.0, "reward": 1.25, "reward_std": 0.23438066989183426, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.45178256928920746, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.24127934873104095, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 160.703125, "completions/mean_terminated_length": 160.703125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.24917600527356626, "grad_norm": 1.7786389589309692, "kl": 0.2041015625, "learning_rate": 8.753298153034301e-07, "loss": 0.002, "num_tokens": 103994782.0, "reward": 1.546875, "reward_std": 0.3282313793897629, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 170.4375, "completions/mean_terminated_length": 170.4375, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "epoch": 0.2504943968358603, "grad_norm": 1.3539693355560303, "kl": 0.18505859375, "learning_rate": 8.746701846965698e-07, "loss": 0.0009, "num_tokens": 104545200.0, "reward": 1.3984375, "reward_std": 0.2925562858581543, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4339464604854584, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.5, "completions/max_terminated_length": 245.5, "completions/mean_length": 162.921875, "completions/mean_terminated_length": 162.921875, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.25181278839815424, "grad_norm": 1.5081756114959717, "kl": 0.22705078125, "learning_rate": 8.740105540897097e-07, "loss": 0.0011, "num_tokens": 105078059.0, "reward": 1.5, "reward_std": 0.37494590878486633, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4825586974620819, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.5, "completions/max_terminated_length": 270.5, "completions/mean_length": 161.15625, "completions/mean_terminated_length": 161.15625, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.25313117996044826, "grad_norm": 1.9885047674179077, "kl": 0.1982421875, "learning_rate": 8.733509234828496e-07, "loss": -0.0097, "num_tokens": 105613466.0, "reward": 1.4140625, "reward_std": 0.2857416793704033, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49899089336395264, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.40928472578525543, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.5, "completions/max_terminated_length": 211.5, "completions/mean_length": 154.390625, "completions/mean_terminated_length": 154.390625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2544495715227423, "grad_norm": 1.7447974681854248, "kl": 0.19677734375, "learning_rate": 8.726912928759894e-07, "loss": 0.0264, "num_tokens": 106165098.0, "reward": 1.3671875, "reward_std": 0.29712240397930145, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49527959525585175, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.25576796308503624, "grad_norm": 1.1314243078231812, "kl": 0.2021484375, "learning_rate": 8.720316622691293e-07, "loss": 0.0186, "num_tokens": 106695485.0, "reward": 1.671875, "reward_std": 0.2668311595916748, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4299773871898651, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.5, "completions/max_terminated_length": 222.5, "completions/mean_length": 152.515625, "completions/mean_terminated_length": 152.515625, "completions/min_length": 109.5, "completions/min_terminated_length": 109.5, "epoch": 0.25708635464733026, "grad_norm": 2.050349235534668, "kl": 0.189453125, "learning_rate": 8.713720316622691e-07, "loss": 0.0166, "num_tokens": 107250822.0, "reward": 1.5, "reward_std": 0.29741112887859344, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4734743535518646, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.5, "completions/max_terminated_length": 301.5, "completions/mean_length": 166.203125, "completions/mean_terminated_length": 166.203125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2584047462096243, "grad_norm": 1.790379524230957, "kl": 0.1865234375, "learning_rate": 8.707124010554088e-07, "loss": -0.0039, "num_tokens": 107796259.0, "reward": 1.34375, "reward_std": 0.2437184453010559, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45128606259822845, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 155.4375, "completions/mean_terminated_length": 155.4375, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.25972313777191824, "grad_norm": 3.445435047149658, "kl": 0.58544921875, "learning_rate": 8.700527704485488e-07, "loss": 0.0029, "num_tokens": 108356070.0, "reward": 1.3671875, "reward_std": 0.4247415065765381, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.401575967669487, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.5, "completions/max_terminated_length": 240.5, "completions/mean_length": 154.765625, "completions/mean_terminated_length": 154.765625, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "epoch": 0.26104152933421226, "grad_norm": 1.8542872667312622, "kl": 0.2119140625, "learning_rate": 8.693931398416886e-07, "loss": 0.0059, "num_tokens": 108905882.0, "reward": 1.2734375, "reward_std": 0.2364109754562378, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.48721402883529663, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.3965577781200409, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 161.296875, "completions/mean_terminated_length": 161.296875, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.2623599208965063, "grad_norm": 3.172243356704712, "kl": 0.19287109375, "learning_rate": 8.687335092348285e-07, "loss": -0.0029, "num_tokens": 109436361.0, "reward": 1.390625, "reward_std": 0.31162063777446747, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.37647102028131485, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.5, "completions/max_terminated_length": 202.5, "completions/mean_length": 158.90625, "completions/mean_terminated_length": 158.90625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.26367831245880025, "grad_norm": 1.714583158493042, "kl": 0.16552734375, "learning_rate": 8.680738786279683e-07, "loss": 0.0125, "num_tokens": 109950038.0, "reward": 1.3046875, "reward_std": 0.31260083615779877, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49899089336395264, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40828560292720795, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.5, "completions/max_terminated_length": 233.5, "completions/mean_length": 161.046875, "completions/mean_terminated_length": 161.046875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.26499670402109426, "grad_norm": 1.6712524890899658, "kl": 0.2080078125, "learning_rate": 8.674142480211082e-07, "loss": 0.001, "num_tokens": 110512391.0, "reward": 1.4375, "reward_std": 0.3116406500339508, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.4734743535518646, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2663150955833883, "grad_norm": 1.869868278503418, "kl": 0.3095703125, "learning_rate": 8.667546174142479e-07, "loss": 0.0055, "num_tokens": 111076904.0, "reward": 1.2890625, "reward_std": 0.3471610099077225, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.46125002205371857, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.3879760503768921, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 153.140625, "completions/mean_terminated_length": 153.140625, "completions/min_length": 101.5, "completions/min_terminated_length": 101.5, "epoch": 0.26763348714568225, "grad_norm": 2.092139959335327, "kl": 0.22998046875, "learning_rate": 8.660949868073878e-07, "loss": -0.0125, "num_tokens": 111629976.0, "reward": 1.359375, "reward_std": 0.2686549127101898, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.43038569390773773, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 152.8125, "completions/mean_terminated_length": 152.8125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.26895187870797627, "grad_norm": 1.927712321281433, "kl": 0.24560546875, "learning_rate": 8.654353562005277e-07, "loss": 0.0198, "num_tokens": 112199161.0, "reward": 1.4609375, "reward_std": 0.30613429844379425, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 191.3125, "completions/mean_terminated_length": 191.3125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.2702702702702703, "grad_norm": 2.1045477390289307, "kl": 0.3662109375, "learning_rate": 8.647757255936675e-07, "loss": -0.0196, "num_tokens": 112732415.0, "reward": 1.484375, "reward_std": 0.27253682166337967, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.504016101360321, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 145.671875, "completions/mean_terminated_length": 145.671875, "completions/min_length": 96.5, "completions/min_terminated_length": 96.5, "epoch": 0.27158866183256425, "grad_norm": 1.8391852378845215, "kl": 0.1953125, "learning_rate": 8.641160949868074e-07, "loss": 0.001, "num_tokens": 113277687.0, "reward": 1.671875, "reward_std": 0.4089687466621399, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3689020276069641, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.49993492662906647, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 161.265625, "completions/mean_terminated_length": 161.265625, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.27290705339485827, "grad_norm": 3.9655251502990723, "kl": 0.2939453125, "learning_rate": 8.634564643799472e-07, "loss": 0.02, "num_tokens": 113857230.0, "reward": 1.359375, "reward_std": 0.35159172117710114, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.4199155569076538, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.5, "completions/max_terminated_length": 217.5, "completions/mean_length": 153.921875, "completions/mean_terminated_length": 153.921875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2742254449571523, "grad_norm": 1.9164501428604126, "kl": 0.193359375, "learning_rate": 8.627968337730869e-07, "loss": 0.0078, "num_tokens": 114434246.0, "reward": 1.5, "reward_std": 0.2733023911714554, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4339464604854584, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.425730362534523, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 150.75, "completions/mean_terminated_length": 150.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.27554383651944625, "grad_norm": 6.350325584411621, "kl": 0.1572265625, "learning_rate": 8.621372031662269e-07, "loss": -0.0158, "num_tokens": 114969811.0, "reward": 1.3515625, "reward_std": 0.2755580097436905, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4337434321641922, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 150.234375, "completions/mean_terminated_length": 150.234375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.27686222808174027, "grad_norm": 5.82377290725708, "kl": 0.193359375, "learning_rate": 8.614775725593667e-07, "loss": 0.0273, "num_tokens": 115508900.0, "reward": 1.46875, "reward_std": 0.2672116681933403, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.49899089336395264, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.5, "completions/max_terminated_length": 268.5, "completions/mean_length": 161.859375, "completions/mean_terminated_length": 161.859375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.2781806196440343, "grad_norm": 1.8840621709823608, "kl": 0.17626953125, "learning_rate": 8.608179419525066e-07, "loss": -0.006, "num_tokens": 116052700.0, "reward": 1.390625, "reward_std": 0.38004428148269653, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5075039267539978, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.44777433574199677, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.5, "completions/max_terminated_length": 248.5, "completions/mean_length": 154.109375, "completions/mean_terminated_length": 154.109375, "completions/min_length": 109.5, "completions/min_terminated_length": 109.5, "epoch": 0.2794990112063283, "grad_norm": 3.165605068206787, "kl": 0.177734375, "learning_rate": 8.601583113456464e-07, "loss": 0.0009, "num_tokens": 116593219.0, "reward": 1.4140625, "reward_std": 0.36822986602783203, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 152.234375, "completions/mean_terminated_length": 152.234375, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "epoch": 0.28081740276862227, "grad_norm": 1.2701598405838013, "kl": 0.17578125, "learning_rate": 8.594986807387863e-07, "loss": 0.0009, "num_tokens": 117176512.0, "reward": 1.421875, "reward_std": 0.3409455418586731, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.4635103940963745, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 152.5625, "completions/mean_terminated_length": 152.5625, "completions/min_length": 104.5, "completions/min_terminated_length": 104.5, "epoch": 0.2821357943309163, "grad_norm": 1.525876522064209, "kl": 0.1884765625, "learning_rate": 8.58839050131926e-07, "loss": 0.0029, "num_tokens": 117752104.0, "reward": 1.359375, "reward_std": 0.30304722487926483, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4442135691642761, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 148.203125, "completions/mean_terminated_length": 148.203125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.2834541858932103, "grad_norm": 1.6029776334762573, "kl": 0.18017578125, "learning_rate": 8.581794195250659e-07, "loss": -0.0245, "num_tokens": 118302959.0, "reward": 1.4296875, "reward_std": 0.31402209401130676, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 160.3125, "completions/mean_terminated_length": 160.3125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.28477257745550427, "grad_norm": 2.766634941101074, "kl": 0.20263671875, "learning_rate": 8.575197889182058e-07, "loss": 0.0157, "num_tokens": 118838742.0, "reward": 1.359375, "reward_std": 0.23934779316186905, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.40346992015838623, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 148.0625, "completions/mean_terminated_length": 148.0625, "completions/min_length": 109.5, "completions/min_terminated_length": 109.5, "epoch": 0.2860909690177983, "grad_norm": 1.2723420858383179, "kl": 0.17578125, "learning_rate": 8.568601583113456e-07, "loss": -0.0216, "num_tokens": 119378256.0, "reward": 1.515625, "reward_std": 0.22960031032562256, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.4442135691642761, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.5, "completions/max_terminated_length": 282.5, "completions/mean_length": 157.578125, "completions/mean_terminated_length": 157.578125, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.2874093605800923, "grad_norm": 1.684826374053955, "kl": 0.31884765625, "learning_rate": 8.562005277044855e-07, "loss": 0.0133, "num_tokens": 119911810.0, "reward": 1.40625, "reward_std": 0.2898322641849518, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.44837237894535065, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4364590644836426, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.5, "completions/max_terminated_length": 223.5, "completions/mean_length": 162.59375, "completions/mean_terminated_length": 162.59375, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.2887277521423863, "grad_norm": 2.3830795288085938, "kl": 0.18017578125, "learning_rate": 8.555408970976253e-07, "loss": 0.0282, "num_tokens": 120436644.0, "reward": 1.3984375, "reward_std": 0.26562613248825073, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4339464604854584, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.5, "completions/max_terminated_length": 207.5, "completions/mean_length": 153.53125, "completions/mean_terminated_length": 153.53125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2900461437046803, "grad_norm": 1.5542792081832886, "kl": 0.17724609375, "learning_rate": 8.54881266490765e-07, "loss": 0.0097, "num_tokens": 120983336.0, "reward": 1.5859375, "reward_std": 0.3157659024000168, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.44547125697135925, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.49993492662906647, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 150.828125, "completions/mean_terminated_length": 150.828125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.2913645352669743, "grad_norm": 1.6478992700576782, "kl": 0.18408203125, "learning_rate": 8.54221635883905e-07, "loss": 0.0009, "num_tokens": 121535051.0, "reward": 1.515625, "reward_std": 0.3657534569501877, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.48139922320842743, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.5, "completions/max_terminated_length": 262.5, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2926829268292683, "grad_norm": 1.4304864406585693, "kl": 0.1767578125, "learning_rate": 8.535620052770448e-07, "loss": 0.0028, "num_tokens": 122078620.0, "reward": 1.5546875, "reward_std": 0.27309004217386246, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.44837237894535065, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.49186936020851135, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 154.359375, "completions/mean_terminated_length": 154.359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2940013183915623, "grad_norm": 1.441246509552002, "kl": 0.2236328125, "learning_rate": 8.529023746701847e-07, "loss": 0.005, "num_tokens": 122657867.0, "reward": 1.46875, "reward_std": 0.3934902548789978, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47789715230464935, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 149.390625, "completions/mean_terminated_length": 149.390625, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.2953197099538563, "grad_norm": 1.9029501676559448, "kl": 0.1611328125, "learning_rate": 8.522427440633245e-07, "loss": 0.0008, "num_tokens": 123210242.0, "reward": 1.578125, "reward_std": 0.3462071716785431, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 159.671875, "completions/mean_terminated_length": 159.671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.2966381015161503, "grad_norm": 2.259085178375244, "kl": 0.18603515625, "learning_rate": 8.515831134564644e-07, "loss": 0.0068, "num_tokens": 123744062.0, "reward": 1.375, "reward_std": 0.3309820592403412, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.3662842661142349, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 149.515625, "completions/mean_terminated_length": 149.515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.2979564930784443, "grad_norm": 2.289480686187744, "kl": 0.17822265625, "learning_rate": 8.509234828496041e-07, "loss": 0.0292, "num_tokens": 124274894.0, "reward": 1.4140625, "reward_std": 0.3192010372877121, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.5, "completions/max_terminated_length": 290.5, "completions/mean_length": 159.015625, "completions/mean_terminated_length": 159.015625, "completions/min_length": 101.5, "completions/min_terminated_length": 101.5, "epoch": 0.2992748846407383, "grad_norm": 1.844226360321045, "kl": 0.29248046875, "learning_rate": 8.50263852242744e-07, "loss": 0.0015, "num_tokens": 124863683.0, "reward": 1.4140625, "reward_std": 0.36280661821365356, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45543521642684937, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.5, "completions/max_terminated_length": 210.5, "completions/mean_length": 149.328125, "completions/mean_terminated_length": 149.328125, "completions/min_length": 103.5, "completions/min_terminated_length": 103.5, "epoch": 0.3005932762030323, "grad_norm": 1.7497187852859497, "kl": 0.34228515625, "learning_rate": 8.496042216358839e-07, "loss": 0.0095, "num_tokens": 125434445.0, "reward": 1.546875, "reward_std": 0.33576367795467377, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4364590644836426, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.5, "completions/max_terminated_length": 264.5, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3019116677653263, "grad_norm": 2.279448986053467, "kl": 0.28955078125, "learning_rate": 8.489445910290237e-07, "loss": 0.0063, "num_tokens": 125982759.0, "reward": 1.3125, "reward_std": 0.19918899983167648, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.43038569390773773, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.5, "completions/max_terminated_length": 271.5, "completions/mean_length": 161.796875, "completions/mean_terminated_length": 161.796875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3032300593276203, "grad_norm": 4.44816255569458, "kl": 0.3330078125, "learning_rate": 8.482849604221636e-07, "loss": -0.0042, "num_tokens": 126547657.0, "reward": 1.375, "reward_std": 0.34831857681274414, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47197872400283813, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 151.046875, "completions/mean_terminated_length": 151.046875, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.3045484508899143, "grad_norm": 1.7258495092391968, "kl": 0.31103515625, "learning_rate": 8.476253298153034e-07, "loss": -0.0063, "num_tokens": 127122825.0, "reward": 1.515625, "reward_std": 0.22095344215631485, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 170.421875, "completions/mean_terminated_length": 170.421875, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.3058668424522083, "grad_norm": 1.6231685876846313, "kl": 0.23583984375, "learning_rate": 8.469656992084431e-07, "loss": 0.0109, "num_tokens": 127683104.0, "reward": 1.515625, "reward_std": 0.2640637904405594, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4659053534269333, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.5, "completions/max_terminated_length": 252.5, "completions/mean_length": 170.921875, "completions/mean_terminated_length": 170.921875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.3071852340145023, "grad_norm": 1.6449296474456787, "kl": 0.1923828125, "learning_rate": 8.463060686015831e-07, "loss": 0.0068, "num_tokens": 128193177.0, "reward": 1.3828125, "reward_std": 0.3156128600239754, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 149.46875, "completions/mean_terminated_length": 149.46875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.3085036255767963, "grad_norm": 2.03678035736084, "kl": 0.36279296875, "learning_rate": 8.456464379947229e-07, "loss": -0.0128, "num_tokens": 128783606.0, "reward": 1.609375, "reward_std": 0.3388998955488205, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.4442135691642761, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.507007360458374, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.5, "completions/max_terminated_length": 250.5, "completions/mean_length": 163.53125, "completions/mean_terminated_length": 163.53125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.3098220171390903, "grad_norm": 1.9481152296066284, "kl": 0.1552734375, "learning_rate": 8.449868073878628e-07, "loss": -0.0051, "num_tokens": 129298754.0, "reward": 1.390625, "reward_std": 0.447740375995636, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.5, "completions/max_terminated_length": 286.5, "completions/mean_length": 162.078125, "completions/mean_terminated_length": 162.078125, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.3111404087013843, "grad_norm": 1.5560412406921387, "kl": 0.2109375, "learning_rate": 8.443271767810026e-07, "loss": 0.005, "num_tokens": 129894008.0, "reward": 1.4296875, "reward_std": 0.247682586312294, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5075039267539978, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 157.5625, "completions/min_length": 103.5, "completions/min_terminated_length": 103.5, "epoch": 0.31245880026367834, "grad_norm": 8.236194610595703, "kl": 0.177734375, "learning_rate": 8.436675461741425e-07, "loss": 0.0214, "num_tokens": 130427692.0, "reward": 1.546875, "reward_std": 0.3038269877433777, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.5034956932067871, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 155.09375, "completions/mean_terminated_length": 155.09375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.3137771918259723, "grad_norm": 1.1533715724945068, "kl": 0.14599609375, "learning_rate": 8.430079155672822e-07, "loss": -0.0178, "num_tokens": 130967207.0, "reward": 1.5, "reward_std": 0.2814220190048218, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.5, "completions/max_terminated_length": 256.5, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.3150955833882663, "grad_norm": 2.3362338542938232, "kl": 0.173828125, "learning_rate": 8.423482849604221e-07, "loss": 0.0116, "num_tokens": 131520019.0, "reward": 1.3828125, "reward_std": 0.32495926320552826, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.5, "completions/max_terminated_length": 265.5, "completions/mean_length": 155.546875, "completions/mean_terminated_length": 155.546875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.31641397495056034, "grad_norm": 2.055114507675171, "kl": 0.16015625, "learning_rate": 8.41688654353562e-07, "loss": -0.0178, "num_tokens": 132089438.0, "reward": 1.46875, "reward_std": 0.19164105504751205, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.5, "completions/max_terminated_length": 248.5, "completions/mean_length": 166.5625, "completions/mean_terminated_length": 166.5625, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "epoch": 0.3177323665128543, "grad_norm": 7.263826847076416, "kl": 0.1484375, "learning_rate": 8.410290237467018e-07, "loss": 0.0007, "num_tokens": 132652001.0, "reward": 1.3125, "reward_std": 0.379679873585701, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.420013427734375, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.5, "completions/max_terminated_length": 278.5, "completions/mean_length": 164.3125, "completions/mean_terminated_length": 164.3125, "completions/min_length": 76.5, "completions/min_terminated_length": 76.5, "epoch": 0.3190507580751483, "grad_norm": 1.8368525505065918, "kl": 0.22607421875, "learning_rate": 8.403693931398417e-07, "loss": -0.0018, "num_tokens": 133206406.0, "reward": 1.46875, "reward_std": 0.2998022064566612, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4395582377910614, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.43038569390773773, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 161.9375, "completions/mean_terminated_length": 161.9375, "completions/min_length": 115.5, "completions/min_terminated_length": 115.5, "epoch": 0.32036914963744234, "grad_norm": 1.877377986907959, "kl": 0.19482421875, "learning_rate": 8.397097625329815e-07, "loss": 0.001, "num_tokens": 133781801.0, "reward": 1.390625, "reward_std": 0.3874116688966751, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.45543521642684937, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.43795469403266907, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 154.140625, "completions/mean_terminated_length": 154.140625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.3216875411997363, "grad_norm": 5.23651647567749, "kl": 0.16015625, "learning_rate": 8.390501319261212e-07, "loss": 0.0203, "num_tokens": 134332419.0, "reward": 1.53125, "reward_std": 0.3142581880092621, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4559413939714432, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.5, "completions/max_terminated_length": 253.5, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.3230059327620303, "grad_norm": 0.9616804718971252, "kl": 0.234375, "learning_rate": 8.383905013192612e-07, "loss": 0.0012, "num_tokens": 134880627.0, "reward": 1.5703125, "reward_std": 0.24541422724723816, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4364590644836426, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 161.359375, "completions/mean_terminated_length": 161.359375, "completions/min_length": 106.5, "completions/min_terminated_length": 106.5, "epoch": 0.32432432432432434, "grad_norm": 1.866129994392395, "kl": 0.4580078125, "learning_rate": 8.37730870712401e-07, "loss": 0.0023, "num_tokens": 135471133.0, "reward": 1.5078125, "reward_std": 0.3292084038257599, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.3689020201563835, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.5, "completions/max_terminated_length": 283.5, "completions/mean_length": 156.0, "completions/mean_terminated_length": 156.0, "completions/min_length": 109.5, "completions/min_terminated_length": 109.5, "epoch": 0.3256427158866183, "grad_norm": 2.0880014896392822, "kl": 0.18603515625, "learning_rate": 8.370712401055409e-07, "loss": 0.0097, "num_tokens": 136029145.0, "reward": 1.3515625, "reward_std": 0.433412566781044, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2563937231898308, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.5, "completions/max_terminated_length": 291.5, "completions/mean_length": 163.09375, "completions/mean_terminated_length": 163.09375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.3269611074489123, "grad_norm": 1.8495664596557617, "kl": 0.16845703125, "learning_rate": 8.364116094986807e-07, "loss": 0.0292, "num_tokens": 136618222.0, "reward": 1.59375, "reward_std": 0.2972627207636833, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.44547125697135925, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.5055117309093475, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.5, "completions/max_terminated_length": 388.5, "completions/mean_length": 171.765625, "completions/mean_terminated_length": 171.765625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.32827949901120634, "grad_norm": 2.30902361869812, "kl": 0.2431640625, "learning_rate": 8.357519788918205e-07, "loss": 0.0227, "num_tokens": 137153276.0, "reward": 1.2734375, "reward_std": 0.34316691756248474, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.40442168712615967, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.3975677341222763, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.5, "completions/max_terminated_length": 251.5, "completions/mean_length": 147.96875, "completions/mean_terminated_length": 147.96875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.3295978905735003, "grad_norm": 2.10632061958313, "kl": 0.1884765625, "learning_rate": 8.350923482849603e-07, "loss": -0.0078, "num_tokens": 137694257.0, "reward": 1.625, "reward_std": 0.40368136763572693, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4199155569076538, "rewards/counterfactual_reasoning_reward/mean": 0.546875, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 167.84375, "completions/mean_terminated_length": 167.84375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.3309162821357943, "grad_norm": 1.756796956062317, "kl": 0.1708984375, "learning_rate": 8.344327176781002e-07, "loss": 0.0116, "num_tokens": 138231899.0, "reward": 1.5078125, "reward_std": 0.2759730964899063, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47789715230464935, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 174.609375, "completions/mean_terminated_length": 174.609375, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "epoch": 0.33223467369808835, "grad_norm": 1.7278567552566528, "kl": 0.26708984375, "learning_rate": 8.337730870712401e-07, "loss": 0.0306, "num_tokens": 138763804.0, "reward": 1.3515625, "reward_std": 0.30070945620536804, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.504016101360321, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.4266805946826935, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 168.203125, "completions/mean_terminated_length": 168.203125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.3335530652603823, "grad_norm": 4.997721195220947, "kl": 0.18603515625, "learning_rate": 8.331134564643799e-07, "loss": -0.0069, "num_tokens": 139330284.0, "reward": 1.546875, "reward_std": 0.416938841342926, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.5, "completions/max_terminated_length": 297.5, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "epoch": 0.3348714568226763, "grad_norm": 1.743480920791626, "kl": 0.26025390625, "learning_rate": 8.324538258575198e-07, "loss": 0.012, "num_tokens": 139881964.0, "reward": 1.1875, "reward_std": 0.2735274061560631, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.3083590194582939, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 173.828125, "completions/mean_terminated_length": 173.828125, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "epoch": 0.33618984838497035, "grad_norm": 1.5121779441833496, "kl": 0.185546875, "learning_rate": 8.317941952506596e-07, "loss": -0.0127, "num_tokens": 140447467.0, "reward": 1.5859375, "reward_std": 0.20348840951919556, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.45227913558483124, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.4299773871898651, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.5, "completions/max_terminated_length": 267.5, "completions/mean_length": 172.265625, "completions/mean_terminated_length": 172.265625, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.3375082399472643, "grad_norm": 1.7585612535476685, "kl": 0.1904296875, "learning_rate": 8.311345646437993e-07, "loss": 0.0185, "num_tokens": 141005236.0, "reward": 1.5234375, "reward_std": 0.27515991032123566, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.43840841948986053, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.5, "completions/max_terminated_length": 340.5, "completions/mean_length": 185.84375, "completions/mean_terminated_length": 185.84375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.33882663150955833, "grad_norm": 2.5330657958984375, "kl": 0.20263671875, "learning_rate": 8.304749340369393e-07, "loss": -0.0039, "num_tokens": 141560020.0, "reward": 1.578125, "reward_std": 0.38286441564559937, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.43038569390773773, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.5055117309093475, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 177.359375, "completions/mean_terminated_length": 177.359375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.34014502307185235, "grad_norm": 37.61130142211914, "kl": 1.18408203125, "learning_rate": 8.298153034300791e-07, "loss": -0.0009, "num_tokens": 142157155.0, "reward": 1.40625, "reward_std": 0.20160073041915894, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.44837237894535065, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.43845126032829285, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.5, "completions/max_terminated_length": 353.5, "completions/mean_length": 196.234375, "completions/mean_terminated_length": 196.234375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.34146341463414637, "grad_norm": 2.4258625507354736, "kl": 0.28466796875, "learning_rate": 8.29155672823219e-07, "loss": 0.0004, "num_tokens": 142730238.0, "reward": 1.46875, "reward_std": 0.4564836174249649, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49899089336395264, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.5, "completions/max_terminated_length": 335.5, "completions/mean_length": 193.046875, "completions/mean_terminated_length": 193.046875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.34278180619644033, "grad_norm": 1.6254724264144897, "kl": 0.26025390625, "learning_rate": 8.284960422163588e-07, "loss": 0.0081, "num_tokens": 143291089.0, "reward": 1.4375, "reward_std": 0.29045480489730835, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.4734743535518646, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.5, "completions/max_terminated_length": 388.5, "completions/mean_length": 192.90625, "completions/mean_terminated_length": 192.90625, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "epoch": 0.34410019775873435, "grad_norm": 2.169651508331299, "kl": 0.3291015625, "learning_rate": 8.278364116094986e-07, "loss": -0.0101, "num_tokens": 143833067.0, "reward": 1.3359375, "reward_std": 0.2540917322039604, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.33297405391931534, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.2520080506801605, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.5, "completions/max_terminated_length": 268.5, "completions/mean_length": 175.8125, "completions/mean_terminated_length": 175.8125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.34541858932102837, "grad_norm": 2.47721791267395, "kl": 0.23193359375, "learning_rate": 8.271767810026385e-07, "loss": 0.0012, "num_tokens": 144349496.0, "reward": 1.421875, "reward_std": 0.39166946709156036, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.5, "completions/max_terminated_length": 304.5, "completions/mean_length": 182.53125, "completions/mean_terminated_length": 182.53125, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "epoch": 0.34673698088332233, "grad_norm": 1.9794095754623413, "kl": 0.30712890625, "learning_rate": 8.265171503957783e-07, "loss": 0.0035, "num_tokens": 144945592.0, "reward": 1.53125, "reward_std": 0.48354440927505493, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47789715230464935, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.5060082972049713, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 182.421875, "completions/mean_terminated_length": 182.421875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.34805537244561635, "grad_norm": 3.057155132293701, "kl": 0.3076171875, "learning_rate": 8.258575197889182e-07, "loss": -0.0112, "num_tokens": 145527241.0, "reward": 1.5234375, "reward_std": 0.4066152274608612, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4640069603919983, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 205.734375, "completions/mean_terminated_length": 205.734375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.34937376400791037, "grad_norm": 1.9991483688354492, "kl": 0.2041015625, "learning_rate": 8.25197889182058e-07, "loss": 0.001, "num_tokens": 146093397.0, "reward": 1.65625, "reward_std": 0.35040371119976044, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.39445772767066956, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.49899089336395264, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 191.6875, "completions/mean_terminated_length": 191.6875, "completions/min_length": 129.5, "completions/min_terminated_length": 129.5, "epoch": 0.35069215557020433, "grad_norm": 2.560934543609619, "kl": 0.58642578125, "learning_rate": 8.245382585751979e-07, "loss": 0.0244, "num_tokens": 146646647.0, "reward": 1.609375, "reward_std": 0.33800652623176575, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4266805946826935, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.495430126786232, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.5, "completions/max_terminated_length": 313.5, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.35201054713249835, "grad_norm": 1.4711872339248657, "kl": 0.298828125, "learning_rate": 8.238786279683377e-07, "loss": 0.023, "num_tokens": 147198265.0, "reward": 1.4765625, "reward_std": 0.3223053365945816, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.5, "completions/max_terminated_length": 344.5, "completions/mean_length": 204.0625, "completions/mean_terminated_length": 204.0625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.35332893869479237, "grad_norm": 3.2954726219177246, "kl": 0.19189453125, "learning_rate": 8.232189973614774e-07, "loss": 0.0176, "num_tokens": 147798386.0, "reward": 1.5859375, "reward_std": 0.2940710186958313, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.546875, "rewards/counterfactual_reasoning_reward/std": 0.5034956932067871, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 193.46875, "completions/mean_terminated_length": 193.46875, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.35464733025708634, "grad_norm": 1.991726040840149, "kl": 0.18701171875, "learning_rate": 8.225593667546174e-07, "loss": 0.0009, "num_tokens": 148340026.0, "reward": 1.5, "reward_std": 0.4347732365131378, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.5, "completions/max_terminated_length": 377.5, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 145.5, "completions/min_terminated_length": 145.5, "epoch": 0.35596572181938035, "grad_norm": 6.206878185272217, "kl": 0.2841796875, "learning_rate": 8.218997361477572e-07, "loss": -0.0044, "num_tokens": 148903115.0, "reward": 1.453125, "reward_std": 0.2512580156326294, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.4640069603919983, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 202.203125, "completions/mean_terminated_length": 202.203125, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "epoch": 0.3572841133816744, "grad_norm": 1.8537336587905884, "kl": 0.1953125, "learning_rate": 8.212401055408971e-07, "loss": -0.0088, "num_tokens": 149469593.0, "reward": 1.3671875, "reward_std": 0.43318524956703186, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5075039267539978, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45680341124534607, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.5, "completions/max_terminated_length": 341.5, "completions/mean_length": 199.84375, "completions/mean_terminated_length": 199.84375, "completions/min_length": 149.5, "completions/min_terminated_length": 149.5, "epoch": 0.35860250494396834, "grad_norm": 1.722954511642456, "kl": 0.21044921875, "learning_rate": 8.205804749340369e-07, "loss": 0.0089, "num_tokens": 150007770.0, "reward": 1.3984375, "reward_std": 0.4094388335943222, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44547125697135925, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 206.28125, "completions/mean_terminated_length": 206.28125, "completions/min_length": 129.5, "completions/min_terminated_length": 129.5, "epoch": 0.35992089650626236, "grad_norm": 1.829460620880127, "kl": 0.19677734375, "learning_rate": 8.199208443271767e-07, "loss": -0.0019, "num_tokens": 150609391.0, "reward": 1.6484375, "reward_std": 0.4093272089958191, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4337434321641922, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.49527959525585175, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.5, "completions/max_terminated_length": 514.5, "completions/mean_length": 207.9375, "completions/mean_terminated_length": 207.9375, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "epoch": 0.3612392880685564, "grad_norm": 1.8041356801986694, "kl": 0.19287109375, "learning_rate": 8.192612137203166e-07, "loss": 0.0068, "num_tokens": 151152864.0, "reward": 1.3515625, "reward_std": 0.37337036430835724, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.4825586974620819, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.4395582377910614, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 222.734375, "completions/mean_terminated_length": 222.734375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.36255767963085034, "grad_norm": 1.0388671159744263, "kl": 0.1865234375, "learning_rate": 8.186015831134564e-07, "loss": 0.0009, "num_tokens": 151722114.0, "reward": 1.4453125, "reward_std": 0.30771908164024353, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.45028693974018097, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.5, "completions/max_terminated_length": 359.5, "completions/mean_length": 197.453125, "completions/mean_terminated_length": 197.453125, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "epoch": 0.36387607119314436, "grad_norm": 2.933138608932495, "kl": 0.17578125, "learning_rate": 8.179419525065963e-07, "loss": -0.0118, "num_tokens": 152278261.0, "reward": 1.5390625, "reward_std": 0.37079615890979767, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47789715230464935, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.5, "completions/max_terminated_length": 353.5, "completions/mean_length": 214.484375, "completions/mean_terminated_length": 214.484375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.3651944627554384, "grad_norm": 1.3220118284225464, "kl": 0.22900390625, "learning_rate": 8.172823218997361e-07, "loss": 0.0011, "num_tokens": 152794877.0, "reward": 1.4296875, "reward_std": 0.351147785782814, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.43840841948986053, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 191.546875, "completions/mean_terminated_length": 191.546875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.36651285431773234, "grad_norm": 3.2457451820373535, "kl": 0.16748046875, "learning_rate": 8.16622691292876e-07, "loss": -0.0519, "num_tokens": 153323139.0, "reward": 1.65625, "reward_std": 0.23047573864459991, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4199155569076538, "rewards/counterfactual_reasoning_reward/mean": 0.546875, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 198.265625, "completions/mean_terminated_length": 198.265625, "completions/min_length": 133.5, "completions/min_terminated_length": 133.5, "epoch": 0.36783124588002636, "grad_norm": 1.509379267692566, "kl": 0.15966796875, "learning_rate": 8.159630606860158e-07, "loss": -0.0119, "num_tokens": 153879566.0, "reward": 1.453125, "reward_std": 0.3648359179496765, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 212.453125, "completions/mean_terminated_length": 212.453125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3691496374423204, "grad_norm": 2.2196407318115234, "kl": 0.20703125, "learning_rate": 8.153034300791555e-07, "loss": 0.0313, "num_tokens": 154408782.0, "reward": 1.3203125, "reward_std": 0.22208934277296066, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.3642466887831688, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.5, "completions/max_terminated_length": 456.5, "completions/mean_length": 230.453125, "completions/mean_terminated_length": 230.453125, "completions/min_length": 140.5, "completions/min_terminated_length": 140.5, "epoch": 0.3704680290046144, "grad_norm": 2.765371799468994, "kl": 0.2255859375, "learning_rate": 8.146437994722955e-07, "loss": 0.0002, "num_tokens": 154995239.0, "reward": 1.3671875, "reward_std": 0.4522506892681122, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.5, "completions/max_terminated_length": 463.5, "completions/mean_length": 204.40625, "completions/mean_terminated_length": 204.40625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.37178642056690836, "grad_norm": 1.7068382501602173, "kl": 0.17236328125, "learning_rate": 8.139841688654353e-07, "loss": -0.005, "num_tokens": 155569970.0, "reward": 1.609375, "reward_std": 0.40064050257205963, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.425730362534523, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "epoch": 0.3731048121292024, "grad_norm": 3.369981527328491, "kl": 0.46142578125, "learning_rate": 8.133245382585752e-07, "loss": 0.013, "num_tokens": 156126469.0, "reward": 1.453125, "reward_std": 0.4048271179199219, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 215.546875, "completions/mean_terminated_length": 215.546875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3744232036914964, "grad_norm": 1.5298014879226685, "kl": 0.173828125, "learning_rate": 8.12664907651715e-07, "loss": 0.0009, "num_tokens": 156681597.0, "reward": 1.4609375, "reward_std": 0.3770909607410431, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4640069603919983, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.5, "completions/max_terminated_length": 422.5, "completions/mean_length": 206.421875, "completions/mean_terminated_length": 206.421875, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "epoch": 0.37574159525379036, "grad_norm": 1.2755626440048218, "kl": 0.22802734375, "learning_rate": 8.120052770448548e-07, "loss": -0.0096, "num_tokens": 157225095.0, "reward": 1.2109375, "reward_std": 0.3133752718567848, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44837237894535065, "rewards/counterfactual_reasoning_reward/mean": 0.1875, "rewards/counterfactual_reasoning_reward/std": 0.3965577781200409, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.5, "completions/max_terminated_length": 386.5, "completions/mean_length": 205.953125, "completions/mean_terminated_length": 205.953125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3770599868160844, "grad_norm": 1.5457804203033447, "kl": 0.16650390625, "learning_rate": 8.113456464379947e-07, "loss": 0.0262, "num_tokens": 157783333.0, "reward": 1.5234375, "reward_std": 0.338888555765152, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4824019521474838, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.49993492662906647, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.5, "completions/max_terminated_length": 367.5, "completions/mean_length": 209.0625, "completions/mean_terminated_length": 209.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.3783783783783784, "grad_norm": 3.0198287963867188, "kl": 0.150390625, "learning_rate": 8.106860158311345e-07, "loss": -0.0158, "num_tokens": 158325595.0, "reward": 1.4375, "reward_std": 0.3784145414829254, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 186.734375, "completions/mean_terminated_length": 186.734375, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "epoch": 0.37969676994067236, "grad_norm": 2.071027994155884, "kl": 0.18115234375, "learning_rate": 8.100263852242744e-07, "loss": 0.0234, "num_tokens": 158856386.0, "reward": 1.5703125, "reward_std": 0.2884394899010658, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.5, "completions/max_terminated_length": 352.5, "completions/mean_length": 195.796875, "completions/mean_terminated_length": 195.796875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.3810151615029664, "grad_norm": 1.8931350708007812, "kl": 0.189453125, "learning_rate": 8.093667546174142e-07, "loss": -0.002, "num_tokens": 159415589.0, "reward": 1.5703125, "reward_std": 0.28556376695632935, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4364590644836426, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.5, "completions/max_terminated_length": 572.5, "completions/mean_length": 222.46875, "completions/mean_terminated_length": 222.46875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3823335530652604, "grad_norm": 1.4169689416885376, "kl": 0.208984375, "learning_rate": 8.087071240105541e-07, "loss": 0.0079, "num_tokens": 159980170.0, "reward": 1.5703125, "reward_std": 0.34547585994005203, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.48139922320842743, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.5075039267539978, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.38365194462755436, "grad_norm": 1.4200514554977417, "kl": 0.20849609375, "learning_rate": 8.080474934036939e-07, "loss": 0.0186, "num_tokens": 160577427.0, "reward": 1.3359375, "reward_std": 0.4011779725551605, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5080004930496216, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.5, "completions/max_terminated_length": 333.5, "completions/mean_length": 207.71875, "completions/mean_terminated_length": 207.71875, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "epoch": 0.3849703361898484, "grad_norm": 2.1430251598358154, "kl": 0.2392578125, "learning_rate": 8.073878627968337e-07, "loss": 0.0012, "num_tokens": 161155763.0, "reward": 1.578125, "reward_std": 0.3129945993423462, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.46125002205371857, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.5055117309093475, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 211.90625, "completions/mean_terminated_length": 211.90625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.3862887277521424, "grad_norm": 3.251171112060547, "kl": 0.21484375, "learning_rate": 8.067282321899736e-07, "loss": 0.0431, "num_tokens": 161689262.0, "reward": 1.453125, "reward_std": 0.33708515763282776, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 191.15625, "completions/mean_terminated_length": 191.15625, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "epoch": 0.38760711931443637, "grad_norm": 1.4685084819793701, "kl": 0.19287109375, "learning_rate": 8.060686015831134e-07, "loss": 0.001, "num_tokens": 162236621.0, "reward": 1.5703125, "reward_std": 0.3219379484653473, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4399413466453552, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.495430126786232, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.5, "completions/max_terminated_length": 386.5, "completions/mean_length": 206.140625, "completions/mean_terminated_length": 206.140625, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "epoch": 0.3889255108767304, "grad_norm": 1.2679147720336914, "kl": 0.17529296875, "learning_rate": 8.054089709762533e-07, "loss": 0.0009, "num_tokens": 162767280.0, "reward": 1.4765625, "reward_std": 0.1920287385582924, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48946478962898254, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 211.859375, "completions/mean_terminated_length": 211.859375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3902439024390244, "grad_norm": 1.5715464353561401, "kl": 0.2109375, "learning_rate": 8.047493403693931e-07, "loss": -0.0019, "num_tokens": 163307068.0, "reward": 1.59375, "reward_std": 0.2859852463006973, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4000803381204605, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.5055117309093475, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.5, "completions/max_terminated_length": 420.5, "completions/mean_length": 213.015625, "completions/mean_terminated_length": 213.015625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.39156229400131837, "grad_norm": 2.10971999168396, "kl": 0.18701171875, "learning_rate": 8.040897097625329e-07, "loss": -0.0059, "num_tokens": 163866498.0, "reward": 1.4453125, "reward_std": 0.29641158878803253, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.4709290862083435, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.43038569390773773, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.5, "completions/max_terminated_length": 327.5, "completions/mean_length": 195.890625, "completions/mean_terminated_length": 195.890625, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.3928806855636124, "grad_norm": 1.4583791494369507, "kl": 0.17138671875, "learning_rate": 8.034300791556728e-07, "loss": -0.0089, "num_tokens": 164409300.0, "reward": 1.375, "reward_std": 0.35668398439884186, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.5, "completions/max_terminated_length": 392.5, "completions/mean_length": 208.390625, "completions/mean_terminated_length": 208.390625, "completions/min_length": 133.5, "completions/min_terminated_length": 133.5, "epoch": 0.3941990771259064, "grad_norm": 1.2619853019714355, "kl": 0.17236328125, "learning_rate": 8.027704485488126e-07, "loss": 0.0009, "num_tokens": 164960916.0, "reward": 1.3984375, "reward_std": 0.33825021982192993, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.39445772767066956, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.4000803381204605, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.5, "completions/max_terminated_length": 332.5, "completions/mean_length": 213.609375, "completions/mean_terminated_length": 213.609375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.39551746868820037, "grad_norm": 2.0366742610931396, "kl": 0.1943359375, "learning_rate": 8.021108179419525e-07, "loss": 0.0088, "num_tokens": 165513613.0, "reward": 1.6328125, "reward_std": 0.33214858174324036, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4175008237361908, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.5, "completions/max_terminated_length": 454.5, "completions/mean_length": 203.421875, "completions/mean_terminated_length": 203.421875, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "epoch": 0.3968358602504944, "grad_norm": 1.44057035446167, "kl": 0.18505859375, "learning_rate": 8.014511873350923e-07, "loss": -0.0137, "num_tokens": 166069539.0, "reward": 1.53125, "reward_std": 0.3034716844558716, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49899089336395264, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.5, "completions/max_terminated_length": 331.5, "completions/mean_length": 201.71875, "completions/mean_terminated_length": 201.71875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3981542518127884, "grad_norm": 6.112383842468262, "kl": 0.173828125, "learning_rate": 8.007915567282322e-07, "loss": -0.0021, "num_tokens": 166592295.0, "reward": 1.453125, "reward_std": 0.21595829725265503, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.44777433574199677, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 207.890625, "completions/mean_terminated_length": 207.890625, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "epoch": 0.3994726433750824, "grad_norm": 2.1021111011505127, "kl": 0.3056640625, "learning_rate": 8.00131926121372e-07, "loss": 0.0152, "num_tokens": 167152095.0, "reward": 1.3671875, "reward_std": 0.2884810268878937, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49993492662906647, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.46125002205371857, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.5, "completions/max_terminated_length": 342.5, "completions/mean_length": 193.953125, "completions/mean_terminated_length": 193.953125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4007910349373764, "grad_norm": 6.859421730041504, "kl": 0.142578125, "learning_rate": 7.994722955145118e-07, "loss": 0.0007, "num_tokens": 167725504.0, "reward": 1.65625, "reward_std": 0.3867743909358978, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.39445772767066956, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.5060082972049713, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 205.640625, "completions/mean_terminated_length": 205.640625, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "epoch": 0.4021094264996704, "grad_norm": 1.3790535926818848, "kl": 0.20703125, "learning_rate": 7.988126649076517e-07, "loss": 0.0059, "num_tokens": 168284506.0, "reward": 1.3203125, "reward_std": 0.24427008628845215, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.3964070826768875, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.5, "completions/max_terminated_length": 488.5, "completions/mean_length": 208.578125, "completions/mean_terminated_length": 208.578125, "completions/min_length": 129.5, "completions/min_terminated_length": 129.5, "epoch": 0.4034278180619644, "grad_norm": 1.3498561382293701, "kl": 0.16650390625, "learning_rate": 7.981530343007915e-07, "loss": 0.0126, "num_tokens": 168802424.0, "reward": 1.1875, "reward_std": 0.2488291785120964, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.425730362534523, "rewards/counterfactual_reasoning_reward/mean": 0.125, "rewards/counterfactual_reasoning_reward/std": 0.33601075410842896, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 222.59375, "completions/mean_terminated_length": 222.59375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4047462096242584, "grad_norm": 1.6001856327056885, "kl": 0.17919921875, "learning_rate": 7.974934036939314e-07, "loss": 0.0009, "num_tokens": 169335348.0, "reward": 1.6875, "reward_std": 0.3800952434539795, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3965577781200409, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.5, "completions/max_terminated_length": 439.5, "completions/mean_length": 210.828125, "completions/mean_terminated_length": 210.828125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.4060646011865524, "grad_norm": 2.1112008094787598, "kl": 0.18017578125, "learning_rate": 7.968337730870712e-07, "loss": -0.0059, "num_tokens": 169892142.0, "reward": 1.3515625, "reward_std": 0.3241316229104996, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.504016101360321, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40442168712615967, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 208.890625, "completions/mean_terminated_length": 208.890625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.40738299274884643, "grad_norm": 1.8878083229064941, "kl": 0.3046875, "learning_rate": 7.96174142480211e-07, "loss": 0.0445, "num_tokens": 170440204.0, "reward": 1.2265625, "reward_std": 0.27383825182914734, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47789715230464935, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.34635117650032043, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.4087013843111404, "grad_norm": 1.6649792194366455, "kl": 0.26611328125, "learning_rate": 7.955145118733509e-07, "loss": -0.0065, "num_tokens": 171019134.0, "reward": 1.5546875, "reward_std": 0.33353038877248764, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.4100197758734344, "grad_norm": 1.807529330253601, "kl": 0.13916015625, "learning_rate": 7.948548812664907e-07, "loss": 0.0007, "num_tokens": 171602545.0, "reward": 1.453125, "reward_std": 0.35940586030483246, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 218.703125, "completions/mean_terminated_length": 218.703125, "completions/min_length": 148.5, "completions/min_terminated_length": 148.5, "epoch": 0.41133816743572843, "grad_norm": 1.8388562202453613, "kl": 0.1787109375, "learning_rate": 7.941952506596306e-07, "loss": 0.0273, "num_tokens": 172170083.0, "reward": 1.421875, "reward_std": 0.3046977072954178, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.5, "completions/max_terminated_length": 312.5, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "epoch": 0.4126565589980224, "grad_norm": 3.0080432891845703, "kl": 0.189453125, "learning_rate": 7.935356200527704e-07, "loss": 0.0273, "num_tokens": 172723796.0, "reward": 1.4765625, "reward_std": 0.31594114005565643, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.5, "completions/max_terminated_length": 388.5, "completions/mean_length": 219.265625, "completions/mean_terminated_length": 219.265625, "completions/min_length": 143.5, "completions/min_terminated_length": 143.5, "epoch": 0.4139749505603164, "grad_norm": 1.2776849269866943, "kl": 0.17578125, "learning_rate": 7.928759894459102e-07, "loss": 0.0009, "num_tokens": 173258909.0, "reward": 1.5234375, "reward_std": 0.3306838572025299, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4739709198474884, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.5, "completions/max_terminated_length": 585.5, "completions/mean_length": 223.15625, "completions/mean_terminated_length": 223.15625, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "epoch": 0.41529334212261043, "grad_norm": 2.168354034423828, "kl": 0.2099609375, "learning_rate": 7.922163588390501e-07, "loss": 0.0089, "num_tokens": 173805377.0, "reward": 1.453125, "reward_std": 0.30922409892082214, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.5, "completions/max_terminated_length": 427.5, "completions/mean_length": 217.453125, "completions/mean_terminated_length": 217.453125, "completions/min_length": 141.5, "completions/min_terminated_length": 141.5, "epoch": 0.4166117336849044, "grad_norm": 1.3840676546096802, "kl": 0.1650390625, "learning_rate": 7.915567282321899e-07, "loss": 0.0008, "num_tokens": 174358259.0, "reward": 1.5546875, "reward_std": 0.41334201395511627, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.41394005715847015, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.48040975630283356, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.5, "completions/max_terminated_length": 434.5, "completions/mean_length": 243.34375, "completions/mean_terminated_length": 243.34375, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "epoch": 0.4179301252471984, "grad_norm": 1.3121964931488037, "kl": 0.23974609375, "learning_rate": 7.908970976253298e-07, "loss": 0.0149, "num_tokens": 174875409.0, "reward": 1.3984375, "reward_std": 0.25090962648391724, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.420013427734375, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 223.734375, "completions/mean_terminated_length": 223.734375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.41924851680949243, "grad_norm": 3.0448079109191895, "kl": 0.181640625, "learning_rate": 7.902374670184696e-07, "loss": -0.005, "num_tokens": 175430361.0, "reward": 1.5625, "reward_std": 0.3993477374315262, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45680341124534607, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.504016101360321, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 222.828125, "completions/mean_terminated_length": 222.828125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.4205669083717864, "grad_norm": 2.738830804824829, "kl": 0.2490234375, "learning_rate": 7.895778364116095e-07, "loss": -0.0056, "num_tokens": 176009594.0, "reward": 1.3984375, "reward_std": 0.33694323897361755, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 196.53125, "completions/mean_terminated_length": 196.53125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.4218852999340804, "grad_norm": 1.705451250076294, "kl": 0.16650390625, "learning_rate": 7.889182058047493e-07, "loss": 0.0067, "num_tokens": 176546047.0, "reward": 1.5078125, "reward_std": 0.3415832817554474, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.504016101360321, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.5, "completions/max_terminated_length": 416.5, "completions/mean_length": 231.6875, "completions/mean_terminated_length": 231.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.42320369149637443, "grad_norm": 2.528270959854126, "kl": 0.3037109375, "learning_rate": 7.882585751978891e-07, "loss": -0.0073, "num_tokens": 177106090.0, "reward": 1.421875, "reward_std": 0.3440583050251007, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.4825586974620819, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.5, "completions/max_terminated_length": 503.5, "completions/mean_length": 242.359375, "completions/mean_terminated_length": 242.359375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.4245220830586684, "grad_norm": 1.3968405723571777, "kl": 0.16015625, "learning_rate": 7.87598944591029e-07, "loss": 0.0174, "num_tokens": 177684579.0, "reward": 1.25, "reward_std": 0.23535014688968658, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.462014764547348, "rewards/counterfactual_reasoning_reward/mean": 0.140625, "rewards/counterfactual_reasoning_reward/std": 0.3083590194582939, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.5, "completions/max_terminated_length": 406.5, "completions/mean_length": 217.34375, "completions/mean_terminated_length": 217.34375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.4258404746209624, "grad_norm": 1.6374986171722412, "kl": 0.15771484375, "learning_rate": 7.869393139841688e-07, "loss": 0.0008, "num_tokens": 178243657.0, "reward": 1.515625, "reward_std": 0.3822704404592514, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.420013427734375, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.5, "completions/max_terminated_length": 479.5, "completions/mean_length": 236.890625, "completions/mean_terminated_length": 236.890625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.42715886618325644, "grad_norm": 1.4865893125534058, "kl": 0.17431640625, "learning_rate": 7.862796833773087e-07, "loss": -0.0392, "num_tokens": 178787797.0, "reward": 1.3046875, "reward_std": 0.34504133462905884, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.40346992015838623, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.5, "completions/max_terminated_length": 526.5, "completions/mean_length": 240.4375, "completions/mean_terminated_length": 240.4375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.42847725774555045, "grad_norm": 1.0296603441238403, "kl": 0.1953125, "learning_rate": 7.856200527704485e-07, "loss": -0.0068, "num_tokens": 179360947.0, "reward": 1.46875, "reward_std": 0.2359210029244423, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.41394005715847015, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.43795469403266907, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 223.796875, "completions/mean_terminated_length": 223.796875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4297956493078444, "grad_norm": 2.453672409057617, "kl": 0.18505859375, "learning_rate": 7.849604221635883e-07, "loss": -0.0215, "num_tokens": 179930971.0, "reward": 1.6796875, "reward_std": 0.33977167308330536, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4337434321641922, "rewards/counterfactual_reasoning_reward/mean": 0.640625, "rewards/counterfactual_reasoning_reward/std": 0.48040975630283356, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.5, "completions/max_terminated_length": 581.5, "completions/mean_length": 241.3125, "completions/mean_terminated_length": 241.3125, "completions/min_length": 138.5, "completions/min_terminated_length": 138.5, "epoch": 0.43111404087013844, "grad_norm": 1.1915420293807983, "kl": 0.1796875, "learning_rate": 7.843007915567282e-07, "loss": 0.0097, "num_tokens": 180484086.0, "reward": 1.5390625, "reward_std": 0.22921262681484222, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 233.71875, "completions/mean_terminated_length": 233.71875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.43243243243243246, "grad_norm": 1.2385947704315186, "kl": 0.18310546875, "learning_rate": 7.83641160949868e-07, "loss": 0.0038, "num_tokens": 181057784.0, "reward": 1.296875, "reward_std": 0.26527372747659683, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.45227913558483124, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.3418920263648033, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.5, "completions/max_terminated_length": 516.5, "completions/mean_length": 239.96875, "completions/mean_terminated_length": 239.96875, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "epoch": 0.4337508239947264, "grad_norm": 1.5733782052993774, "kl": 0.18603515625, "learning_rate": 7.829815303430079e-07, "loss": 0.0332, "num_tokens": 181635969.0, "reward": 1.390625, "reward_std": 0.17481552809476852, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4595021605491638, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 120.5, "completions/min_terminated_length": 120.5, "epoch": 0.43506921555702044, "grad_norm": 2.268585205078125, "kl": 0.21435546875, "learning_rate": 7.823218997361477e-07, "loss": 0.0011, "num_tokens": 182215199.0, "reward": 1.4921875, "reward_std": 0.28575657308101654, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4595021605491638, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 243.71875, "completions/mean_terminated_length": 243.71875, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "epoch": 0.43638760711931446, "grad_norm": 1.3634920120239258, "kl": 0.138671875, "learning_rate": 7.816622691292876e-07, "loss": 0.0144, "num_tokens": 182768505.0, "reward": 1.5703125, "reward_std": 0.30077045410871506, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.43795469403266907, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 264.78125, "completions/mean_terminated_length": 264.78125, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "epoch": 0.4377059986816084, "grad_norm": 1.1219345331192017, "kl": 0.177734375, "learning_rate": 7.810026385224274e-07, "loss": 0.0107, "num_tokens": 183331008.0, "reward": 1.34375, "reward_std": 0.21382881700992584, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40828560292720795, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 244.828125, "completions/mean_terminated_length": 244.828125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.43902439024390244, "grad_norm": 1.2601338624954224, "kl": 0.15771484375, "learning_rate": 7.803430079155672e-07, "loss": -0.0041, "num_tokens": 183881732.0, "reward": 1.34375, "reward_std": 0.2858250066637993, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.5, "completions/max_terminated_length": 508.5, "completions/mean_length": 229.171875, "completions/mean_terminated_length": 229.171875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.44034278180619646, "grad_norm": 1.1783356666564941, "kl": 0.19580078125, "learning_rate": 7.796833773087071e-07, "loss": 0.0029, "num_tokens": 184458646.0, "reward": 1.515625, "reward_std": 0.29838570952415466, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.425730362534523, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.5, "completions/max_terminated_length": 480.5, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "epoch": 0.4416611733684904, "grad_norm": 1.3234957456588745, "kl": 0.15087890625, "learning_rate": 7.790237467018469e-07, "loss": 0.0125, "num_tokens": 185000405.0, "reward": 1.34375, "reward_std": 0.26237140595912933, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.4128527194261551, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.5, "completions/max_terminated_length": 367.5, "completions/mean_length": 208.203125, "completions/mean_terminated_length": 208.203125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.44297956493078444, "grad_norm": 1.7237800359725952, "kl": 0.1708984375, "learning_rate": 7.783641160949868e-07, "loss": 0.0009, "num_tokens": 185550197.0, "reward": 1.4375, "reward_std": 0.3555232882499695, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47197872400283813, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 188.640625, "completions/mean_terminated_length": 188.640625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.44429795649307846, "grad_norm": 1.5205352306365967, "kl": 0.14208984375, "learning_rate": 7.777044854881266e-07, "loss": 0.0281, "num_tokens": 186105568.0, "reward": 1.5859375, "reward_std": 0.2520020827651024, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47197872400283813, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.5, "completions/max_terminated_length": 331.5, "completions/mean_length": 188.84375, "completions/mean_terminated_length": 188.84375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4456163480553724, "grad_norm": 1.3839302062988281, "kl": 0.189453125, "learning_rate": 7.770448548812664e-07, "loss": 0.0185, "num_tokens": 186663442.0, "reward": 1.40625, "reward_std": 0.25989269465208054, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.5, "completions/max_terminated_length": 540.5, "completions/mean_length": 203.59375, "completions/mean_terminated_length": 203.59375, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "epoch": 0.44693473961766644, "grad_norm": 9.889595031738281, "kl": 0.1875, "learning_rate": 7.763852242744063e-07, "loss": -0.001, "num_tokens": 187166543.0, "reward": 1.3671875, "reward_std": 0.20627917349338531, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5075039267539978, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.425730362534523, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.5, "completions/max_terminated_length": 328.5, "completions/mean_length": 195.53125, "completions/mean_terminated_length": 195.53125, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "epoch": 0.44825313117996046, "grad_norm": 6.298471927642822, "kl": 0.17626953125, "learning_rate": 7.757255936675461e-07, "loss": -0.0079, "num_tokens": 187761752.0, "reward": 1.765625, "reward_std": 0.22059447318315506, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.3827299028635025, "rewards/counterfactual_reasoning_reward/mean": 0.703125, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 204.71875, "completions/mean_terminated_length": 204.71875, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.4495715227422544, "grad_norm": 3.63222336769104, "kl": 0.19384765625, "learning_rate": 7.75065963060686e-07, "loss": -0.0293, "num_tokens": 188339366.0, "reward": 1.453125, "reward_std": 0.2663671672344208, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.44777433574199677, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 207.015625, "completions/mean_terminated_length": 207.015625, "completions/min_length": 123.5, "completions/min_terminated_length": 123.5, "epoch": 0.45088991430454844, "grad_norm": 2.5461981296539307, "kl": 0.19970703125, "learning_rate": 7.744063324538258e-07, "loss": 0.0205, "num_tokens": 188861436.0, "reward": 1.1953125, "reward_std": 0.2721085250377655, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.4215090572834015, "rewards/counterfactual_reasoning_reward/mean": 0.09375, "rewards/counterfactual_reasoning_reward/std": 0.2961445748806, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.5, "completions/max_terminated_length": 372.5, "completions/mean_length": 185.859375, "completions/mean_terminated_length": 185.859375, "completions/min_length": 113.5, "completions/min_terminated_length": 113.5, "epoch": 0.45220830586684246, "grad_norm": 1.1723432540893555, "kl": 0.212890625, "learning_rate": 7.737467018469657e-07, "loss": -0.0302, "num_tokens": 189433155.0, "reward": 1.453125, "reward_std": 0.27856065332889557, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 205.640625, "completions/mean_terminated_length": 205.640625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4535266974291364, "grad_norm": 1.312558650970459, "kl": 0.1884765625, "learning_rate": 7.730870712401055e-07, "loss": 0.0009, "num_tokens": 189971104.0, "reward": 1.5390625, "reward_std": 0.3596854954957962, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.41394005715847015, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.4734743535518646, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.5, "completions/max_terminated_length": 373.5, "completions/mean_length": 182.859375, "completions/mean_terminated_length": 182.859375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.45484508899143045, "grad_norm": 3.815779685974121, "kl": 0.17578125, "learning_rate": 7.724274406332453e-07, "loss": 0.0136, "num_tokens": 190507291.0, "reward": 1.4765625, "reward_std": 0.1823110654950142, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.48190538585186005, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.5, "completions/max_terminated_length": 398.5, "completions/mean_length": 219.21875, "completions/mean_terminated_length": 219.21875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.45616348055372447, "grad_norm": 1.4453219175338745, "kl": 0.20068359375, "learning_rate": 7.717678100263852e-07, "loss": -0.0058, "num_tokens": 191099958.0, "reward": 1.296875, "reward_std": 0.26676009595394135, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.378012090921402, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.5, "completions/max_terminated_length": 484.5, "completions/mean_length": 212.71875, "completions/mean_terminated_length": 212.71875, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "epoch": 0.4574818721160185, "grad_norm": 2.500900983810425, "kl": 0.197265625, "learning_rate": 7.71108179419525e-07, "loss": -0.0342, "num_tokens": 191653820.0, "reward": 1.546875, "reward_std": 0.3488876447081566, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.469681054353714, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.5, "completions/max_terminated_length": 423.5, "completions/mean_length": 178.109375, "completions/mean_terminated_length": 178.109375, "completions/min_length": 118.5, "completions/min_terminated_length": 118.5, "epoch": 0.45880026367831245, "grad_norm": 5.47750997543335, "kl": 0.19287109375, "learning_rate": 7.704485488126649e-07, "loss": 0.001, "num_tokens": 192210714.0, "reward": 1.46875, "reward_std": 0.37546592950820923, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.5, "completions/max_terminated_length": 406.5, "completions/mean_length": 195.78125, "completions/mean_terminated_length": 195.78125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.46011865524060647, "grad_norm": 1.7408441305160522, "kl": 0.21728515625, "learning_rate": 7.697889182058047e-07, "loss": 0.0265, "num_tokens": 192780525.0, "reward": 1.2578125, "reward_std": 0.32169267535209656, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.425730362534523, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.5, "completions/max_terminated_length": 374.5, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 129.5, "completions/min_terminated_length": 129.5, "epoch": 0.4614370468029005, "grad_norm": 1.0022386312484741, "kl": 0.21240234375, "learning_rate": 7.691292875989445e-07, "loss": 0.0128, "num_tokens": 193331110.0, "reward": 1.4609375, "reward_std": 0.2243683859705925, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.3975677341222763, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.4175008237361908, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.5, "completions/max_terminated_length": 346.5, "completions/mean_length": 166.515625, "completions/mean_terminated_length": 166.515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.46275543836519445, "grad_norm": 4.099958419799805, "kl": 0.73095703125, "learning_rate": 7.684696569920844e-07, "loss": 0.0203, "num_tokens": 193852336.0, "reward": 1.46875, "reward_std": 0.30905766785144806, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4175008237361908, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4364590644836426, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.5, "completions/max_terminated_length": 368.5, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "epoch": 0.46407382992748847, "grad_norm": 1.9724088907241821, "kl": 0.1884765625, "learning_rate": 7.678100263852242e-07, "loss": -0.0069, "num_tokens": 194404829.0, "reward": 1.2421875, "reward_std": 0.19396990537643433, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4709290862083435, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.3827299028635025, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.5, "completions/max_terminated_length": 382.5, "completions/mean_length": 200.15625, "completions/mean_terminated_length": 200.15625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.4653922214897825, "grad_norm": 0.8456544280052185, "kl": 0.203125, "learning_rate": 7.671503957783641e-07, "loss": 0.001, "num_tokens": 194953399.0, "reward": 1.2734375, "reward_std": 0.31059296429157257, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.3827299028635025, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 210.125, "completions/mean_terminated_length": 210.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.46671061305207645, "grad_norm": 2.079271078109741, "kl": 0.22509765625, "learning_rate": 7.664907651715039e-07, "loss": 0.0548, "num_tokens": 195497182.0, "reward": 1.4609375, "reward_std": 0.24416711181402206, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.5, "completions/max_terminated_length": 333.5, "completions/mean_length": 184.46875, "completions/mean_terminated_length": 184.46875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.46802900461437047, "grad_norm": 1.382488489151001, "kl": 0.17431640625, "learning_rate": 7.658311345646438e-07, "loss": 0.0067, "num_tokens": 196047111.0, "reward": 1.328125, "reward_std": 0.33007654547691345, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.425730362534523, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.5, "completions/max_terminated_length": 298.5, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.4693473961766645, "grad_norm": 1.8674745559692383, "kl": 0.1904296875, "learning_rate": 7.651715039577836e-07, "loss": 0.001, "num_tokens": 196583516.0, "reward": 1.515625, "reward_std": 0.31871289014816284, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.5, "completions/max_terminated_length": 317.5, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.47066578773895845, "grad_norm": 1.5446696281433105, "kl": 0.2431640625, "learning_rate": 7.645118733509234e-07, "loss": -0.0007, "num_tokens": 197111166.0, "reward": 1.3671875, "reward_std": 0.29746749997138977, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.38935163617134094, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 168.328125, "completions/mean_terminated_length": 168.328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.47198417930125247, "grad_norm": 1.3544058799743652, "kl": 0.2587890625, "learning_rate": 7.638522427440633e-07, "loss": -0.0007, "num_tokens": 197682205.0, "reward": 1.625, "reward_std": 0.3317541107535362, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.4339464604854584, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.5, "completions/max_terminated_length": 528.5, "completions/mean_length": 206.109375, "completions/mean_terminated_length": 206.109375, "completions/min_length": 104.5, "completions/min_terminated_length": 104.5, "epoch": 0.4733025708635465, "grad_norm": 2.7651515007019043, "kl": 0.78125, "learning_rate": 7.631926121372031e-07, "loss": 0.0098, "num_tokens": 198246716.0, "reward": 1.3671875, "reward_std": 0.2597433179616928, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 173.078125, "completions/mean_terminated_length": 173.078125, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.47462096242584045, "grad_norm": 1.7829608917236328, "kl": 0.23779296875, "learning_rate": 7.62532981530343e-07, "loss": 0.0012, "num_tokens": 198813169.0, "reward": 1.59375, "reward_std": 0.3512505143880844, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.4339464604854584, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.5, "completions/max_terminated_length": 430.5, "completions/mean_length": 182.28125, "completions/mean_terminated_length": 182.28125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4759393539881345, "grad_norm": 1.797888994216919, "kl": 0.21875, "learning_rate": 7.618733509234828e-07, "loss": -0.0038, "num_tokens": 199362367.0, "reward": 1.578125, "reward_std": 0.41052503883838654, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4395582377910614, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 211.484375, "completions/mean_terminated_length": 211.484375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4772577455504285, "grad_norm": 1.4032807350158691, "kl": 0.22021484375, "learning_rate": 7.612137203166226e-07, "loss": 0.0353, "num_tokens": 199920865.0, "reward": 1.359375, "reward_std": 0.31017760932445526, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49527959525585175, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.46125002205371857, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 197.046875, "completions/mean_terminated_length": 197.046875, "completions/min_length": 108.5, "completions/min_terminated_length": 108.5, "epoch": 0.47857613711272246, "grad_norm": 1.3392174243927002, "kl": 0.2001953125, "learning_rate": 7.605540897097626e-07, "loss": -0.001, "num_tokens": 200455680.0, "reward": 1.5078125, "reward_std": 0.2830342948436737, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49899089336395264, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.5034956932067871, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.5, "completions/max_terminated_length": 361.5, "completions/mean_length": 189.5625, "completions/mean_terminated_length": 189.5625, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.4798945286750165, "grad_norm": 2.145650625228882, "kl": 0.1748046875, "learning_rate": 7.598944591029023e-07, "loss": -0.0001, "num_tokens": 201005494.0, "reward": 1.515625, "reward_std": 0.24672859907150269, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.49527959525585175, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 165.328125, "completions/mean_terminated_length": 165.328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4812129202373105, "grad_norm": 2.0097687244415283, "kl": 0.2294921875, "learning_rate": 7.592348284960422e-07, "loss": 0.0197, "num_tokens": 201524471.0, "reward": 1.3984375, "reward_std": 0.3014257550239563, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 185.09375, "completions/mean_terminated_length": 185.09375, "completions/min_length": 113.5, "completions/min_terminated_length": 113.5, "epoch": 0.48253131179960446, "grad_norm": 1.6746400594711304, "kl": 0.234375, "learning_rate": 7.58575197889182e-07, "loss": 0.0041, "num_tokens": 202075256.0, "reward": 1.4609375, "reward_std": 0.3553185760974884, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.48190538585186005, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.5, "completions/max_terminated_length": 253.5, "completions/mean_length": 154.921875, "completions/mean_terminated_length": 154.921875, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.4838497033618985, "grad_norm": 2.1746621131896973, "kl": 0.22802734375, "learning_rate": 7.579155672823219e-07, "loss": -0.0047, "num_tokens": 202588583.0, "reward": 1.578125, "reward_std": 0.2853103280067444, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.4825586974620819, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.5, "completions/max_terminated_length": 314.5, "completions/mean_length": 168.140625, "completions/mean_terminated_length": 168.140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.4851680949241925, "grad_norm": 1.618688941001892, "kl": 0.28271484375, "learning_rate": 7.572559366754617e-07, "loss": -0.0132, "num_tokens": 203172008.0, "reward": 1.5390625, "reward_std": 0.27959371358156204, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4739709198474884, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 159.671875, "completions/mean_terminated_length": 159.671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4864864864864865, "grad_norm": 1.5737465620040894, "kl": 0.25048828125, "learning_rate": 7.565963060686015e-07, "loss": 0.0013, "num_tokens": 203704409.0, "reward": 1.625, "reward_std": 0.3784441500902176, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.425730362534523, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 157.515625, "completions/mean_terminated_length": 157.515625, "completions/min_length": 98.5, "completions/min_terminated_length": 98.5, "epoch": 0.4878048780487805, "grad_norm": 2.1531553268432617, "kl": 0.421875, "learning_rate": 7.559366754617414e-07, "loss": 0.0207, "num_tokens": 204269391.0, "reward": 1.3203125, "reward_std": 0.42263369262218475, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.43840841948986053, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 159.96875, "completions/mean_terminated_length": 159.96875, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.4891232696110745, "grad_norm": 1.4517130851745605, "kl": 0.23291015625, "learning_rate": 7.552770448548812e-07, "loss": 0.0051, "num_tokens": 204812393.0, "reward": 1.40625, "reward_std": 0.32894107699394226, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 167.4375, "completions/mean_terminated_length": 167.4375, "completions/min_length": 93.5, "completions/min_terminated_length": 93.5, "epoch": 0.4904416611733685, "grad_norm": 2.119249105453491, "kl": 0.2724609375, "learning_rate": 7.546174142480211e-07, "loss": 0.0209, "num_tokens": 205366290.0, "reward": 1.296875, "reward_std": 0.24454617500305176, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.40346992015838623, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.5, "completions/max_terminated_length": 442.5, "completions/mean_length": 174.015625, "completions/mean_terminated_length": 174.015625, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "epoch": 0.4917600527356625, "grad_norm": 1.6805349588394165, "kl": 0.1806640625, "learning_rate": 7.539577836411609e-07, "loss": 0.0077, "num_tokens": 205886499.0, "reward": 1.4921875, "reward_std": 0.3523232042789459, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.49186936020851135, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.4930784442979565, "grad_norm": 1.4792063236236572, "kl": 0.25732421875, "learning_rate": 7.532981530343007e-07, "loss": 0.0062, "num_tokens": 206413743.0, "reward": 1.5234375, "reward_std": 0.29467204213142395, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.48190538585186005, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 161.53125, "completions/mean_terminated_length": 161.53125, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "epoch": 0.4943968358602505, "grad_norm": 344.5700988769531, "kl": 12.86328125, "learning_rate": 7.526385224274407e-07, "loss": 0.0642, "num_tokens": 206972636.0, "reward": 1.578125, "reward_std": 0.2946252375841141, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.43038569390773773, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.5034956932067871, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.5, "completions/max_terminated_length": 292.5, "completions/mean_length": 158.46875, "completions/mean_terminated_length": 158.46875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4957152274225445, "grad_norm": 2.8440568447113037, "kl": 0.36376953125, "learning_rate": 7.519788918205804e-07, "loss": 0.0018, "num_tokens": 207521518.0, "reward": 1.640625, "reward_std": 0.29658204317092896, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41824956238269806, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.49899089336395264, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.5, "completions/max_terminated_length": 209.5, "completions/mean_length": 151.140625, "completions/mean_terminated_length": 151.140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.4970336189848385, "grad_norm": 1.9395447969436646, "kl": 0.20947265625, "learning_rate": 7.513192612137203e-07, "loss": 0.0137, "num_tokens": 208052441.0, "reward": 1.4296875, "reward_std": 0.27177757024765015, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.45178256928920746, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 172.921875, "completions/mean_terminated_length": 172.921875, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "epoch": 0.4983520105471325, "grad_norm": 1.3854053020477295, "kl": 0.1962890625, "learning_rate": 7.506596306068601e-07, "loss": 0.001, "num_tokens": 208580513.0, "reward": 1.4609375, "reward_std": 0.293335422873497, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.4635103940963745, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 167.578125, "completions/mean_terminated_length": 167.578125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.4996704021094265, "grad_norm": 1.8146915435791016, "kl": 0.21728515625, "learning_rate": 7.5e-07, "loss": 0.0099, "num_tokens": 209149436.0, "reward": 1.609375, "reward_std": 0.27724190056324005, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.3378837928175926, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.4337434321641922, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 169.96875, "completions/mean_terminated_length": 169.96875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5009887936717206, "grad_norm": 1.5519425868988037, "kl": 0.1748046875, "learning_rate": 7.493403693931398e-07, "loss": 0.0018, "num_tokens": 209678242.0, "reward": 1.65625, "reward_std": 0.25566761940717697, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3584318831562996, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 150.078125, "completions/mean_terminated_length": 150.078125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5023071852340145, "grad_norm": 2.135385513305664, "kl": 0.2109375, "learning_rate": 7.486807387862796e-07, "loss": 0.0011, "num_tokens": 210214193.0, "reward": 1.3984375, "reward_std": 0.3974299728870392, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.4299773871898651, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.42200562357902527, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 161.609375, "completions/mean_terminated_length": 161.609375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.5036255767963085, "grad_norm": 1.3202711343765259, "kl": 0.20068359375, "learning_rate": 7.480211081794196e-07, "loss": 0.0059, "num_tokens": 210752211.0, "reward": 1.6015625, "reward_std": 0.3085828423500061, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.37497539073228836, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.40442168712615967, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 161.890625, "completions/mean_terminated_length": 161.890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5049439683586026, "grad_norm": 4.272532939910889, "kl": 0.2763671875, "learning_rate": 7.473614775725593e-07, "loss": 0.0024, "num_tokens": 211302245.0, "reward": 1.4921875, "reward_std": 0.36358577013015747, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48946478962898254, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 150.421875, "completions/mean_terminated_length": 150.421875, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.5062623599208965, "grad_norm": 3.1509361267089844, "kl": 0.1845703125, "learning_rate": 7.467018469656992e-07, "loss": 0.0244, "num_tokens": 211860314.0, "reward": 1.6953125, "reward_std": 0.3078947365283966, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.40928472578525543, "rewards/counterfactual_reasoning_reward/mean": 0.625, "rewards/counterfactual_reasoning_reward/std": 0.48190538585186005, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 170.09375, "completions/mean_terminated_length": 170.09375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5075807514831905, "grad_norm": 1.5550843477249146, "kl": 0.18310546875, "learning_rate": 7.46042216358839e-07, "loss": 0.0156, "num_tokens": 212386738.0, "reward": 1.375, "reward_std": 0.24573804438114166, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.504016101360321, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.5, "completions/max_terminated_length": 228.5, "completions/mean_length": 163.09375, "completions/mean_terminated_length": 163.09375, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "epoch": 0.5088991430454846, "grad_norm": 1.2955602407455444, "kl": 0.1669921875, "learning_rate": 7.453825857519788e-07, "loss": 0.0008, "num_tokens": 212935461.0, "reward": 1.4296875, "reward_std": 0.3167223334312439, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.44547125697135925, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.43795469403266907, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.5, "completions/max_terminated_length": 257.5, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5102175346077785, "grad_norm": 1.642083764076233, "kl": 0.17822265625, "learning_rate": 7.447229551451188e-07, "loss": 0.0107, "num_tokens": 213482885.0, "reward": 1.6015625, "reward_std": 0.19127750396728516, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.462014764547348, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5115359261700725, "grad_norm": 1.9828922748565674, "kl": 0.1806640625, "learning_rate": 7.440633245382586e-07, "loss": -0.0382, "num_tokens": 214021590.0, "reward": 1.4453125, "reward_std": 0.28840014338493347, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4364590644836426, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 157.28125, "completions/mean_terminated_length": 157.28125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5128543177323666, "grad_norm": 2.5036816596984863, "kl": 0.18212890625, "learning_rate": 7.434036939313984e-07, "loss": 0.0224, "num_tokens": 214592667.0, "reward": 1.5625, "reward_std": 0.31130756437778473, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.4442135691642761, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5141727092946605, "grad_norm": 14.70569896697998, "kl": 0.19287109375, "learning_rate": 7.427440633245382e-07, "loss": 0.0264, "num_tokens": 215136762.0, "reward": 1.4609375, "reward_std": 0.3367668688297272, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.5, "completions/max_terminated_length": 257.5, "completions/mean_length": 161.578125, "completions/mean_terminated_length": 161.578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.5154911008569545, "grad_norm": 3.0729470252990723, "kl": 0.23046875, "learning_rate": 7.42084432717678e-07, "loss": 0.0012, "num_tokens": 215694926.0, "reward": 1.53125, "reward_std": 0.36878904700279236, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.462014764547348, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4824019521474838, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 157.796875, "completions/mean_terminated_length": 157.796875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5168094924192486, "grad_norm": 3.2940866947174072, "kl": 0.41064453125, "learning_rate": 7.414248021108179e-07, "loss": 0.0021, "num_tokens": 216230250.0, "reward": 1.375, "reward_std": 0.33267800509929657, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49993492662906647, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4399413466453552, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.5, "completions/max_terminated_length": 277.5, "completions/mean_length": 159.609375, "completions/mean_terminated_length": 159.609375, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "epoch": 0.5181278839815425, "grad_norm": 1.654248833656311, "kl": 0.24755859375, "learning_rate": 7.407651715039578e-07, "loss": 0.0012, "num_tokens": 216789309.0, "reward": 1.578125, "reward_std": 0.3433515280485153, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.3423885926604271, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 150.515625, "completions/mean_terminated_length": 150.515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5194462755438365, "grad_norm": 1.6646603345870972, "kl": 0.21240234375, "learning_rate": 7.401055408970977e-07, "loss": 0.0011, "num_tokens": 217347611.0, "reward": 1.3359375, "reward_std": 0.3262050449848175, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5029991269111633, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.3642466887831688, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 149.1875, "completions/mean_terminated_length": 149.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.5207646671061306, "grad_norm": 1.1046637296676636, "kl": 0.1865234375, "learning_rate": 7.394459102902374e-07, "loss": 0.0009, "num_tokens": 217897620.0, "reward": 1.5234375, "reward_std": 0.3125290423631668, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47197872400283813, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 158.640625, "completions/mean_terminated_length": 158.640625, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.5220830586684245, "grad_norm": 1.6615976095199585, "kl": 0.1787109375, "learning_rate": 7.387862796833773e-07, "loss": -0.0069, "num_tokens": 218429648.0, "reward": 1.4765625, "reward_std": 0.275302529335022, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.47197872400283813, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.45178256928920746, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 153.828125, "completions/mean_terminated_length": 153.828125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.5234014502307185, "grad_norm": 1.3731876611709595, "kl": 0.22216796875, "learning_rate": 7.381266490765171e-07, "loss": 0.0011, "num_tokens": 218969254.0, "reward": 1.5703125, "reward_std": 0.3334064334630966, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.45178256928920746, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.5, "completions/max_terminated_length": 264.5, "completions/mean_length": 156.109375, "completions/mean_terminated_length": 156.109375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5247198417930126, "grad_norm": 6.019032955169678, "kl": 0.828125, "learning_rate": 7.374670184696569e-07, "loss": 0.0012, "num_tokens": 219523212.0, "reward": 1.5234375, "reward_std": 0.2970837652683258, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48946478962898254, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 157.6875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5260382333553065, "grad_norm": 2.6566803455352783, "kl": 0.22021484375, "learning_rate": 7.368073878627969e-07, "loss": 0.0011, "num_tokens": 220056348.0, "reward": 1.3984375, "reward_std": 0.3611602336168289, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.504016101360321, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.5, "completions/max_terminated_length": 215.5, "completions/mean_length": 152.8125, "completions/mean_terminated_length": 152.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5273566249176005, "grad_norm": 9.714990615844727, "kl": 0.17041015625, "learning_rate": 7.361477572559367e-07, "loss": 0.0145, "num_tokens": 220606248.0, "reward": 1.53125, "reward_std": 0.2542991414666176, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.5, "completions/max_terminated_length": 297.5, "completions/mean_length": 164.046875, "completions/mean_terminated_length": 164.046875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5286750164798946, "grad_norm": 2.278655767440796, "kl": 0.26416015625, "learning_rate": 7.354881266490765e-07, "loss": 0.0091, "num_tokens": 221169453.0, "reward": 1.6640625, "reward_std": 0.21214542537927628, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.3403963968157768, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.45128606259822845, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 150.765625, "completions/mean_terminated_length": 150.765625, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "epoch": 0.5299934080421885, "grad_norm": 1.4431713819503784, "kl": 0.228515625, "learning_rate": 7.348284960422163e-07, "loss": 0.008, "num_tokens": 221707202.0, "reward": 1.453125, "reward_std": 0.30943262577056885, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.5, "completions/max_terminated_length": 290.5, "completions/mean_length": 161.265625, "completions/mean_terminated_length": 161.265625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5313117996044825, "grad_norm": 2.049102306365967, "kl": 0.1962890625, "learning_rate": 7.341688654353561e-07, "loss": 0.0176, "num_tokens": 222279190.0, "reward": 1.46875, "reward_std": 0.21029303595423698, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49993492662906647, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.5, "completions/max_terminated_length": 282.5, "completions/mean_length": 170.34375, "completions/mean_terminated_length": 170.34375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5326301911667766, "grad_norm": 1.4652104377746582, "kl": 0.20947265625, "learning_rate": 7.33509234828496e-07, "loss": -0.0224, "num_tokens": 222814307.0, "reward": 1.3671875, "reward_std": 0.17407145351171494, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4364590644836426, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.5, "completions/max_terminated_length": 209.5, "completions/mean_length": 154.09375, "completions/mean_terminated_length": 154.09375, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.5339485827290705, "grad_norm": 1.576002836227417, "kl": 0.23828125, "learning_rate": 7.328496042216359e-07, "loss": -0.0037, "num_tokens": 223365856.0, "reward": 1.4296875, "reward_std": 0.2506173476576805, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.43840841948986053, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48040975630283356, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 165.515625, "completions/mean_terminated_length": 165.515625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5352669742913645, "grad_norm": 3.0153253078460693, "kl": 0.2099609375, "learning_rate": 7.321899736147758e-07, "loss": 0.0011, "num_tokens": 223873276.0, "reward": 1.4765625, "reward_std": 0.3395759016275406, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.5, "completions/max_terminated_length": 273.5, "completions/mean_length": 164.84375, "completions/mean_terminated_length": 164.84375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5365853658536586, "grad_norm": 1.377851128578186, "kl": 0.19189453125, "learning_rate": 7.315303430079155e-07, "loss": 0.0078, "num_tokens": 224399443.0, "reward": 1.5390625, "reward_std": 0.27497004717588425, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4825586974620819, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 79.5, "completions/min_terminated_length": 79.5, "epoch": 0.5379037574159525, "grad_norm": 1.9699807167053223, "kl": 0.228515625, "learning_rate": 7.308707124010554e-07, "loss": 0.0002, "num_tokens": 224952430.0, "reward": 1.453125, "reward_std": 0.4072958678007126, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.4364590644836426, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.5, "completions/max_terminated_length": 242.5, "completions/mean_length": 163.53125, "completions/mean_terminated_length": 163.53125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5392221489782465, "grad_norm": 1.4508167505264282, "kl": 0.2578125, "learning_rate": 7.302110817941952e-07, "loss": 0.0013, "num_tokens": 225529449.0, "reward": 1.6953125, "reward_std": 0.2833397686481476, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 0.3083590194582939, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 172.453125, "completions/mean_terminated_length": 172.453125, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "epoch": 0.5405405405405406, "grad_norm": 2.026759386062622, "kl": 0.2587890625, "learning_rate": 7.29551451187335e-07, "loss": -0.0153, "num_tokens": 226126322.0, "reward": 1.65625, "reward_std": 0.2298443838953972, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45543521642684937, "rewards/counterfactual_reasoning_reward/mean": 0.625, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.5, "completions/max_terminated_length": 240.5, "completions/mean_length": 164.109375, "completions/mean_terminated_length": 164.109375, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.5418589321028345, "grad_norm": 6.452300071716309, "kl": 0.73779296875, "learning_rate": 7.28891820580475e-07, "loss": 0.0105, "num_tokens": 226650851.0, "reward": 1.5078125, "reward_std": 0.26515478640794754, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4659053534269333, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.39400696754455566, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 169.734375, "completions/mean_terminated_length": 169.734375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.5431773236651285, "grad_norm": 1.5020703077316284, "kl": 0.25439453125, "learning_rate": 7.282321899736148e-07, "loss": -0.0173, "num_tokens": 227226796.0, "reward": 1.5390625, "reward_std": 0.2918113097548485, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.3584318831562996, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.46125002205371857, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 178.78125, "completions/mean_terminated_length": 178.78125, "completions/min_length": 123.5, "completions/min_terminated_length": 123.5, "epoch": 0.5444957152274226, "grad_norm": 15.091057777404785, "kl": 1.646484375, "learning_rate": 7.275725593667546e-07, "loss": -0.0083, "num_tokens": 227770736.0, "reward": 1.6015625, "reward_std": 0.32589787244796753, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4364590644836426, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.504016101360321, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 167.296875, "completions/mean_terminated_length": 167.296875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5458141067897165, "grad_norm": 2.5600860118865967, "kl": 0.67578125, "learning_rate": 7.269129287598944e-07, "loss": 0.0131, "num_tokens": 228330859.0, "reward": 1.546875, "reward_std": 0.18297028541564941, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 170.421875, "completions/mean_terminated_length": 170.421875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5471324983520105, "grad_norm": 1.516133427619934, "kl": 0.2109375, "learning_rate": 7.262532981530342e-07, "loss": 0.0138, "num_tokens": 228862986.0, "reward": 1.328125, "reward_std": 0.260620154440403, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.4299773871898651, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 180.3125, "completions/mean_terminated_length": 180.3125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.5484508899143046, "grad_norm": 2.0946853160858154, "kl": 0.1787109375, "learning_rate": 7.255936675461741e-07, "loss": 0.0028, "num_tokens": 229398431.0, "reward": 1.59375, "reward_std": 0.2842128723859787, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.39445772767066956, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.49993492662906647, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 183.96875, "completions/mean_terminated_length": 183.96875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.5497692814765985, "grad_norm": 2.296434164047241, "kl": 0.20751953125, "learning_rate": 7.24934036939314e-07, "loss": -0.0185, "num_tokens": 229925771.0, "reward": 1.328125, "reward_std": 0.28550924360752106, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4128527194261551, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.37497539073228836, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 188.171875, "completions/mean_terminated_length": 188.171875, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.5510876730388925, "grad_norm": 2.133105754852295, "kl": 0.21533203125, "learning_rate": 7.242744063324539e-07, "loss": 0.0187, "num_tokens": 230456138.0, "reward": 1.5234375, "reward_std": 0.33865831792354584, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.3642466887831688, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.39445772767066956, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.5, "completions/max_terminated_length": 286.5, "completions/mean_length": 172.703125, "completions/mean_terminated_length": 172.703125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.5524060646011866, "grad_norm": 3.3281021118164062, "kl": 0.3984375, "learning_rate": 7.236147757255936e-07, "loss": 0.002, "num_tokens": 230998889.0, "reward": 1.5078125, "reward_std": 0.362157940864563, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.45227913558483124, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.5, "completions/max_terminated_length": 306.5, "completions/mean_length": 165.84375, "completions/mean_terminated_length": 165.84375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5537244561634805, "grad_norm": 2.3881335258483887, "kl": 0.35009765625, "learning_rate": 7.229551451187335e-07, "loss": 0.0018, "num_tokens": 231535199.0, "reward": 1.5625, "reward_std": 0.33317649364471436, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4559413939714432, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.5, "completions/max_terminated_length": 318.5, "completions/mean_length": 183.59375, "completions/mean_terminated_length": 183.59375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5550428477257745, "grad_norm": 1.264930248260498, "kl": 0.17041015625, "learning_rate": 7.222955145118733e-07, "loss": -0.0158, "num_tokens": 232105978.0, "reward": 1.78125, "reward_std": 0.2098381221294403, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.23546454310417175, "rewards/counterfactual_reasoning_reward/mean": 0.71875, "rewards/counterfactual_reasoning_reward/std": 0.45128606259822845, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 178.71875, "completions/mean_terminated_length": 178.71875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5563612392880686, "grad_norm": 2.364734649658203, "kl": 0.2177734375, "learning_rate": 7.216358839050131e-07, "loss": 0.0011, "num_tokens": 232670522.0, "reward": 1.3984375, "reward_std": 0.43601636588573456, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.4635103940963745, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 198.390625, "completions/mean_terminated_length": 198.390625, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.5576796308503625, "grad_norm": 6.781421661376953, "kl": 0.27685546875, "learning_rate": 7.209762532981531e-07, "loss": 0.0014, "num_tokens": 233188884.0, "reward": 1.421875, "reward_std": 0.3981630504131317, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.48190538585186005, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 183.5625, "completions/mean_terminated_length": 183.5625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5589980224126566, "grad_norm": 8.477229118347168, "kl": 0.951171875, "learning_rate": 7.203166226912929e-07, "loss": 0.035, "num_tokens": 233727930.0, "reward": 1.5390625, "reward_std": 0.17407145351171494, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48946478962898254, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.495430126786232, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.5, "completions/max_terminated_length": 343.5, "completions/mean_length": 171.921875, "completions/mean_terminated_length": 171.921875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.5603164139749506, "grad_norm": 2.8852062225341797, "kl": 0.205078125, "learning_rate": 7.196569920844327e-07, "loss": 0.004, "num_tokens": 234227201.0, "reward": 1.5625, "reward_std": 0.3111915811896324, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4000803381204605, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.4299773871898651, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 183.765625, "completions/mean_terminated_length": 183.765625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5616348055372445, "grad_norm": 4.71922492980957, "kl": 0.21044921875, "learning_rate": 7.189973614775725e-07, "loss": -0.0243, "num_tokens": 234790664.0, "reward": 1.4921875, "reward_std": 0.31099456548690796, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.5060082972049713, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.5, "completions/max_terminated_length": 361.5, "completions/mean_length": 202.1875, "completions/mean_terminated_length": 202.1875, "completions/min_length": 123.5, "completions/min_terminated_length": 123.5, "epoch": 0.5629531970995386, "grad_norm": 1.64402437210083, "kl": 0.189453125, "learning_rate": 7.183377308707123e-07, "loss": 0.0009, "num_tokens": 235321887.0, "reward": 1.2421875, "reward_std": 0.2899155914783478, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.45178256928920746, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.3580790013074875, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.5, "completions/max_terminated_length": 296.5, "completions/mean_length": 179.0625, "completions/mean_terminated_length": 179.0625, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "epoch": 0.5642715886618326, "grad_norm": 1.4399155378341675, "kl": 0.1884765625, "learning_rate": 7.176781002638522e-07, "loss": -0.0078, "num_tokens": 235835419.0, "reward": 1.3828125, "reward_std": 0.3130585104227066, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.48721402883529663, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 176.671875, "completions/mean_terminated_length": 176.671875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5655899802241265, "grad_norm": 1.9121315479278564, "kl": 0.19384765625, "learning_rate": 7.170184696569921e-07, "loss": 0.0303, "num_tokens": 236390326.0, "reward": 1.40625, "reward_std": 0.2797553688287735, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.5, "completions/max_terminated_length": 294.5, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.5669083717864206, "grad_norm": 4.132734298706055, "kl": 0.27197265625, "learning_rate": 7.16358839050132e-07, "loss": -0.0094, "num_tokens": 236951992.0, "reward": 1.65625, "reward_std": 0.19078750908374786, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.3378837928175926, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.5, "completions/max_terminated_length": 294.5, "completions/mean_length": 171.140625, "completions/mean_terminated_length": 171.140625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5682267633487146, "grad_norm": 1.7538683414459229, "kl": 0.22412109375, "learning_rate": 7.156992084432717e-07, "loss": 0.0226, "num_tokens": 237478394.0, "reward": 1.3125, "reward_std": 0.2886401042342186, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4337434321641922, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.5, "completions/max_terminated_length": 278.5, "completions/mean_length": 172.296875, "completions/mean_terminated_length": 172.296875, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "epoch": 0.5695451549110085, "grad_norm": 1.4691345691680908, "kl": 0.26611328125, "learning_rate": 7.150395778364116e-07, "loss": -0.0055, "num_tokens": 238035425.0, "reward": 1.4765625, "reward_std": 0.2689310312271118, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.504016101360321, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 180.796875, "completions/mean_terminated_length": 180.796875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.5708635464733026, "grad_norm": 1.7854996919631958, "kl": 0.1962890625, "learning_rate": 7.143799472295514e-07, "loss": 0.0049, "num_tokens": 238553650.0, "reward": 1.3984375, "reward_std": 0.3729718327522278, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5055117309093475, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.4442135691642761, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 171.734375, "completions/mean_terminated_length": 171.734375, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.5721819380355966, "grad_norm": 1.9048700332641602, "kl": 0.18603515625, "learning_rate": 7.137203166226912e-07, "loss": 0.0048, "num_tokens": 239096802.0, "reward": 1.5, "reward_std": 0.3898492753505707, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.48139922320842743, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.49899089336395264, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.5, "completions/max_terminated_length": 322.5, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.5735003295978905, "grad_norm": 2.764127492904663, "kl": 0.4609375, "learning_rate": 7.130606860158312e-07, "loss": -0.0143, "num_tokens": 239652380.0, "reward": 1.1796875, "reward_std": 0.3490283638238907, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45680341124534607, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.378012090921402, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.5, "completions/max_terminated_length": 388.5, "completions/mean_length": 192.671875, "completions/mean_terminated_length": 192.671875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5748187211601846, "grad_norm": 4.632028579711914, "kl": 0.18115234375, "learning_rate": 7.12401055408971e-07, "loss": 0.0243, "num_tokens": 240212859.0, "reward": 1.5546875, "reward_std": 0.3312741816043854, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4595021605491638, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.5, "completions/max_terminated_length": 253.5, "completions/mean_length": 165.46875, "completions/mean_terminated_length": 165.46875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5761371127224786, "grad_norm": 2.038416624069214, "kl": 0.17431640625, "learning_rate": 7.117414248021108e-07, "loss": 0.0009, "num_tokens": 240732572.0, "reward": 1.515625, "reward_std": 0.44309380650520325, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48946478962898254, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 171.71875, "completions/mean_terminated_length": 171.71875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.5774555042847725, "grad_norm": 1.358697533607483, "kl": 0.18701171875, "learning_rate": 7.110817941952506e-07, "loss": 0.0009, "num_tokens": 241247483.0, "reward": 1.5703125, "reward_std": 0.30361463129520416, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4659053534269333, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.5, "completions/max_terminated_length": 320.5, "completions/mean_length": 178.359375, "completions/mean_terminated_length": 178.359375, "completions/min_length": 115.5, "completions/min_terminated_length": 115.5, "epoch": 0.5787738958470666, "grad_norm": 2.0144240856170654, "kl": 0.19580078125, "learning_rate": 7.104221635883904e-07, "loss": -0.0195, "num_tokens": 241769594.0, "reward": 1.515625, "reward_std": 0.29827095568180084, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.43845126032829285, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.462014764547348, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.5, "completions/max_terminated_length": 358.5, "completions/mean_length": 189.84375, "completions/mean_terminated_length": 189.84375, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.5800922874093606, "grad_norm": 11.391408920288086, "kl": 0.3828125, "learning_rate": 7.097625329815303e-07, "loss": -0.001, "num_tokens": 242337905.0, "reward": 1.3828125, "reward_std": 0.42900680005550385, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.43795469403266907, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.2364606335759163, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.5, "completions/max_terminated_length": 343.5, "completions/mean_length": 189.28125, "completions/mean_terminated_length": 189.28125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5814106789716545, "grad_norm": 1.5930767059326172, "kl": 0.3037109375, "learning_rate": 7.091029023746702e-07, "loss": 0.0015, "num_tokens": 242897162.0, "reward": 1.53125, "reward_std": 0.27609430253505707, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.45178256928920746, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 188.984375, "completions/mean_terminated_length": 188.984375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5827290705339486, "grad_norm": 3.7171518802642822, "kl": 0.15966796875, "learning_rate": 7.084432717678101e-07, "loss": 0.0028, "num_tokens": 243476525.0, "reward": 1.4296875, "reward_std": 0.34778669476509094, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45128606259822845, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.5, "completions/max_terminated_length": 246.5, "completions/mean_length": 182.984375, "completions/mean_terminated_length": 182.984375, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "epoch": 0.5840474620962426, "grad_norm": 1.6984212398529053, "kl": 0.1728515625, "learning_rate": 7.077836411609498e-07, "loss": -0.0196, "num_tokens": 244027339.0, "reward": 1.453125, "reward_std": 0.2428773045539856, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4595021605491638, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.5, "completions/max_terminated_length": 286.5, "completions/mean_length": 190.8125, "completions/mean_terminated_length": 190.8125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.5853658536585366, "grad_norm": 2.096066474914551, "kl": 0.22509765625, "learning_rate": 7.071240105540897e-07, "loss": 0.0294, "num_tokens": 244557690.0, "reward": 1.2734375, "reward_std": 0.3368266224861145, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.40346992015838623, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.253503680229187, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "epoch": 0.5866842452208306, "grad_norm": 1.1492843627929688, "kl": 0.173828125, "learning_rate": 7.064643799472295e-07, "loss": 0.0224, "num_tokens": 245115066.0, "reward": 1.4765625, "reward_std": 0.2639690265059471, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "epoch": 0.5880026367831246, "grad_norm": 2.568390130996704, "kl": 0.216796875, "learning_rate": 7.058047493403693e-07, "loss": 0.0216, "num_tokens": 245673147.0, "reward": 1.5703125, "reward_std": 0.26827648282051086, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.504016101360321, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "epoch": 0.5893210283454186, "grad_norm": 3.4815049171447754, "kl": 0.21826171875, "learning_rate": 7.051451187335093e-07, "loss": -0.0018, "num_tokens": 246233818.0, "reward": 1.359375, "reward_std": 0.2881552428007126, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4337434321641922, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 180.796875, "completions/mean_terminated_length": 180.796875, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.5906394199077126, "grad_norm": 1.7768040895462036, "kl": 0.224609375, "learning_rate": 7.044854881266491e-07, "loss": 0.0041, "num_tokens": 246783232.0, "reward": 1.6484375, "reward_std": 0.3828115463256836, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3879760503768921, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.5, "completions/max_terminated_length": 271.5, "completions/mean_length": 177.3125, "completions/mean_terminated_length": 177.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5919578114700066, "grad_norm": 1.4916081428527832, "kl": 0.24853515625, "learning_rate": 7.038258575197889e-07, "loss": -0.0144, "num_tokens": 247307986.0, "reward": 1.421875, "reward_std": 0.22707363218069077, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47789715230464935, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.5, "completions/max_terminated_length": 314.5, "completions/mean_length": 186.5625, "completions/mean_terminated_length": 186.5625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5932762030323006, "grad_norm": 2.868993043899536, "kl": 0.21435546875, "learning_rate": 7.031662269129287e-07, "loss": -0.0038, "num_tokens": 247847257.0, "reward": 1.578125, "reward_std": 0.3500043749809265, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4337434321641922, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.49993492662906647, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 185.4375, "completions/mean_terminated_length": 185.4375, "completions/min_length": 123.5, "completions/min_terminated_length": 123.5, "epoch": 0.5945945945945946, "grad_norm": 1.8558871746063232, "kl": 0.4921875, "learning_rate": 7.025065963060685e-07, "loss": 0.0025, "num_tokens": 248416176.0, "reward": 1.2578125, "reward_std": 0.40491151809692383, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.40346992015838623, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.5, "completions/max_terminated_length": 293.5, "completions/mean_length": 167.640625, "completions/mean_terminated_length": 167.640625, "completions/min_length": 115.5, "completions/min_terminated_length": 115.5, "epoch": 0.5959129861568886, "grad_norm": 1.9186317920684814, "kl": 0.26708984375, "learning_rate": 7.018469656992084e-07, "loss": 0.0013, "num_tokens": 248963346.0, "reward": 1.5625, "reward_std": 0.4136682152748108, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4595021605491638, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.5972313777191826, "grad_norm": 3.8574845790863037, "kl": 0.2998046875, "learning_rate": 7.011873350923483e-07, "loss": 0.0015, "num_tokens": 249517170.0, "reward": 1.5390625, "reward_std": 0.3381732255220413, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4559413939714432, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.48946478962898254, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 198.671875, "completions/mean_terminated_length": 198.671875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.5985497692814766, "grad_norm": 4.470089435577393, "kl": 0.20849609375, "learning_rate": 7.005277044854882e-07, "loss": 0.001, "num_tokens": 250026733.0, "reward": 1.3203125, "reward_std": 0.2767082527279854, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 0.4595021605491638, "rewards/counterfactual_reasoning_reward/mean": 0.234375, "rewards/counterfactual_reasoning_reward/std": 0.37246278673410416, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.5, "completions/max_terminated_length": 303.5, "completions/mean_length": 179.671875, "completions/mean_terminated_length": 179.671875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5998681608437706, "grad_norm": 1.7488855123519897, "kl": 0.328125, "learning_rate": 6.998680738786279e-07, "loss": 0.0016, "num_tokens": 250572766.0, "reward": 1.390625, "reward_std": 0.30664125084877014, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.16800537705421448, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 174.6875, "completions/mean_terminated_length": 174.6875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.6011865524060646, "grad_norm": 1.4399681091308594, "kl": 0.22021484375, "learning_rate": 6.992084432717677e-07, "loss": -0.0116, "num_tokens": 251133589.0, "reward": 1.5859375, "reward_std": 0.41356024146080017, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45128606259822845, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.5075039267539978, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 177.203125, "completions/mean_terminated_length": 177.203125, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "epoch": 0.6025049439683586, "grad_norm": 1.508872628211975, "kl": 0.244140625, "learning_rate": 6.985488126649076e-07, "loss": -0.0037, "num_tokens": 251691126.0, "reward": 1.4609375, "reward_std": 0.2722841799259186, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.4399413466453552, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4199155569076538, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 184.0625, "completions/mean_terminated_length": 184.0625, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "epoch": 0.6038233355306526, "grad_norm": 1.127957820892334, "kl": 0.22314453125, "learning_rate": 6.978891820580474e-07, "loss": 0.0246, "num_tokens": 252228138.0, "reward": 1.515625, "reward_std": 0.1666998788714409, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.495430126786232, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 186.34375, "completions/mean_terminated_length": 186.34375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6051417270929466, "grad_norm": 1.8690898418426514, "kl": 0.21826171875, "learning_rate": 6.972295514511874e-07, "loss": -0.0145, "num_tokens": 252815893.0, "reward": 1.59375, "reward_std": 0.2557336688041687, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.495430126786232, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.5, "completions/max_terminated_length": 313.5, "completions/mean_length": 174.578125, "completions/mean_terminated_length": 174.578125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.6064601186552406, "grad_norm": 2.2248799800872803, "kl": 0.1748046875, "learning_rate": 6.965699208443272e-07, "loss": 0.0048, "num_tokens": 253388927.0, "reward": 1.6015625, "reward_std": 0.2996871769428253, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.45028693974018097, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.5075039267539978, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.5, "completions/max_terminated_length": 307.5, "completions/mean_length": 181.984375, "completions/mean_terminated_length": 181.984375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.6077785102175346, "grad_norm": 1.3644791841506958, "kl": 0.212890625, "learning_rate": 6.95910290237467e-07, "loss": 0.0011, "num_tokens": 253969828.0, "reward": 1.3671875, "reward_std": 0.32485102117061615, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5075039267539978, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45680341124534607, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 199.234375, "completions/mean_terminated_length": 199.234375, "completions/min_length": 92.5, "completions/min_terminated_length": 92.5, "epoch": 0.6090969017798286, "grad_norm": 1.6425762176513672, "kl": 0.23193359375, "learning_rate": 6.952506596306068e-07, "loss": -0.0193, "num_tokens": 254491521.0, "reward": 1.515625, "reward_std": 0.2875763475894928, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.5, "completions/max_terminated_length": 294.5, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.6104152933421226, "grad_norm": 2.6358377933502197, "kl": 0.20458984375, "learning_rate": 6.945910290237466e-07, "loss": -0.0009, "num_tokens": 255052221.0, "reward": 1.609375, "reward_std": 0.29552366584539413, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.3423885926604271, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.3680429607629776, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.5, "completions/max_terminated_length": 285.5, "completions/mean_length": 169.8125, "completions/mean_terminated_length": 169.8125, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "epoch": 0.6117336849044166, "grad_norm": 2.2124719619750977, "kl": 0.189453125, "learning_rate": 6.939313984168865e-07, "loss": -0.0039, "num_tokens": 255610760.0, "reward": 1.515625, "reward_std": 0.26527373492717743, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.47197872400283813, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.5, "completions/max_terminated_length": 317.5, "completions/mean_length": 181.96875, "completions/mean_terminated_length": 181.96875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6130520764667106, "grad_norm": 3.895132303237915, "kl": 0.16748046875, "learning_rate": 6.932717678100264e-07, "loss": -0.0128, "num_tokens": 256126403.0, "reward": 1.375, "reward_std": 0.290683776140213, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.43840841948986053, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.5, "completions/max_terminated_length": 259.5, "completions/mean_length": 167.4375, "completions/mean_terminated_length": 167.4375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6143704680290046, "grad_norm": 2.285045623779297, "kl": 0.19287109375, "learning_rate": 6.926121372031663e-07, "loss": -0.0, "num_tokens": 256660026.0, "reward": 1.4921875, "reward_std": 0.3420127332210541, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49993492662906647, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.5, "completions/max_terminated_length": 340.5, "completions/mean_length": 183.796875, "completions/mean_terminated_length": 183.796875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6156888595912986, "grad_norm": 2.3875491619110107, "kl": 0.2138671875, "learning_rate": 6.91952506596306e-07, "loss": 0.0011, "num_tokens": 257179161.0, "reward": 1.5, "reward_std": 0.41180071234703064, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49527959525585175, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.5, "completions/max_terminated_length": 303.5, "completions/mean_length": 174.703125, "completions/mean_terminated_length": 174.703125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6170072511535926, "grad_norm": 1.928825855255127, "kl": 0.20263671875, "learning_rate": 6.912928759894458e-07, "loss": -0.0048, "num_tokens": 257709941.0, "reward": 1.4375, "reward_std": 0.37161463499069214, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 166.796875, "completions/mean_terminated_length": 166.796875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.6183256427158866, "grad_norm": 2.6674082279205322, "kl": 0.22265625, "learning_rate": 6.906332453825857e-07, "loss": -0.0311, "num_tokens": 258286537.0, "reward": 1.5078125, "reward_std": 0.2445988953113556, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 167.046875, "completions/mean_terminated_length": 167.046875, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "epoch": 0.6196440342781806, "grad_norm": 3.6040778160095215, "kl": 0.326171875, "learning_rate": 6.899736147757255e-07, "loss": -0.013, "num_tokens": 258851663.0, "reward": 1.3671875, "reward_std": 0.39258521795272827, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.5, "completions/max_terminated_length": 320.5, "completions/mean_length": 188.078125, "completions/mean_terminated_length": 188.078125, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "epoch": 0.6209624258404747, "grad_norm": 1.93013334274292, "kl": 0.240234375, "learning_rate": 6.893139841688655e-07, "loss": 0.0012, "num_tokens": 259393045.0, "reward": 1.328125, "reward_std": 0.3434976637363434, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.47789715230464935, "rewards/counterfactual_reasoning_reward/mean": 0.21875, "rewards/counterfactual_reasoning_reward/std": 0.4128527194261551, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.5, "completions/max_terminated_length": 293.5, "completions/mean_length": 168.96875, "completions/mean_terminated_length": 168.96875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6222808174027686, "grad_norm": 1.3996636867523193, "kl": 0.2705078125, "learning_rate": 6.886543535620053e-07, "loss": 0.0014, "num_tokens": 259971948.0, "reward": 1.609375, "reward_std": 0.3177255392074585, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4020725339651108, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6235992089650626, "grad_norm": 4.633606433868408, "kl": 0.24267578125, "learning_rate": 6.879947229551451e-07, "loss": 0.0012, "num_tokens": 260519082.0, "reward": 1.4765625, "reward_std": 0.3503444939851761, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.46946612000465393, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.5, "completions/max_terminated_length": 254.5, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 115.5, "completions/min_terminated_length": 115.5, "epoch": 0.6249176005273567, "grad_norm": 1.9170297384262085, "kl": 0.23681640625, "learning_rate": 6.873350923482849e-07, "loss": 0.0012, "num_tokens": 261042479.0, "reward": 1.421875, "reward_std": 0.3212074786424637, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47197872400283813, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 192.359375, "completions/mean_terminated_length": 192.359375, "completions/min_length": 118.5, "completions/min_terminated_length": 118.5, "epoch": 0.6262359920896506, "grad_norm": 1.5204129219055176, "kl": 0.3193359375, "learning_rate": 6.866754617414247e-07, "loss": 0.0319, "num_tokens": 261607111.0, "reward": 1.3515625, "reward_std": 0.2896379381418228, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.3680429607629776, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.37696758657693863, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.5, "completions/max_terminated_length": 329.5, "completions/mean_length": 183.21875, "completions/mean_terminated_length": 183.21875, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "epoch": 0.6275543836519446, "grad_norm": 1.0835617780685425, "kl": 0.23876953125, "learning_rate": 6.860158311345646e-07, "loss": -0.0037, "num_tokens": 262180367.0, "reward": 1.71875, "reward_std": 0.295247346162796, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.2100067138671875, "rewards/counterfactual_reasoning_reward/mean": 0.546875, "rewards/counterfactual_reasoning_reward/std": 0.5034956932067871, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 180.140625, "completions/mean_terminated_length": 180.140625, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "epoch": 0.6288727752142387, "grad_norm": 1.7496694326400757, "kl": 0.2783203125, "learning_rate": 6.853562005277045e-07, "loss": 0.0024, "num_tokens": 262711748.0, "reward": 1.4375, "reward_std": 0.32145532965660095, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.46125002205371857, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 118.5, "completions/min_terminated_length": 118.5, "epoch": 0.6301911667765326, "grad_norm": 7.14608907699585, "kl": 0.22509765625, "learning_rate": 6.846965699208444e-07, "loss": -0.0077, "num_tokens": 263226692.0, "reward": 1.4453125, "reward_std": 0.3332912474870682, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.48190538585186005, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 180.21875, "completions/mean_terminated_length": 180.21875, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.6315095583388266, "grad_norm": 3.2791290283203125, "kl": 0.23974609375, "learning_rate": 6.840369393139841e-07, "loss": 0.0207, "num_tokens": 263772305.0, "reward": 1.484375, "reward_std": 0.37323006987571716, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.5, "completions/max_terminated_length": 279.5, "completions/mean_length": 176.015625, "completions/mean_terminated_length": 176.015625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.6328279499011207, "grad_norm": 9.808655738830566, "kl": 0.32763671875, "learning_rate": 6.833773087071239e-07, "loss": -0.013, "num_tokens": 264281387.0, "reward": 1.4609375, "reward_std": 0.3021458834409714, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4739709198474884, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.5, "completions/max_terminated_length": 271.5, "completions/mean_length": 170.3125, "completions/mean_terminated_length": 170.3125, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.6341463414634146, "grad_norm": 2.6715574264526367, "kl": 0.509765625, "learning_rate": 6.827176781002638e-07, "loss": 0.0026, "num_tokens": 264810094.0, "reward": 1.375, "reward_std": 0.3859531134366989, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49993492662906647, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.24593468010425568, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 184.609375, "completions/mean_terminated_length": 184.609375, "completions/min_length": 124.5, "completions/min_terminated_length": 124.5, "epoch": 0.6354647330257086, "grad_norm": 2.2205841541290283, "kl": 0.35791015625, "learning_rate": 6.820580474934036e-07, "loss": 0.0203, "num_tokens": 265343804.0, "reward": 1.515625, "reward_std": 0.2503412440419197, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.48040975630283356, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 177.546875, "completions/mean_terminated_length": 177.546875, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "epoch": 0.6367831245880027, "grad_norm": 2.5849595069885254, "kl": 0.27001953125, "learning_rate": 6.813984168865436e-07, "loss": 0.017, "num_tokens": 265915610.0, "reward": 1.6484375, "reward_std": 0.33187469840049744, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.3964070826768875, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.5, "completions/max_terminated_length": 321.5, "completions/mean_length": 181.203125, "completions/mean_terminated_length": 181.203125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.6381015161502966, "grad_norm": 2.229753017425537, "kl": 0.53125, "learning_rate": 6.807387862796834e-07, "loss": -0.0354, "num_tokens": 266490742.0, "reward": 1.4609375, "reward_std": 0.35806506872177124, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4364590644836426, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.43795469403266907, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.5, "completions/max_terminated_length": 270.5, "completions/mean_length": 175.671875, "completions/mean_terminated_length": 175.671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.6394199077125906, "grad_norm": 1.847619891166687, "kl": 0.35009765625, "learning_rate": 6.800791556728232e-07, "loss": 0.0018, "num_tokens": 267039573.0, "reward": 1.4453125, "reward_std": 0.3678963631391525, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.45028693974018097, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.5, "completions/max_terminated_length": 285.5, "completions/mean_length": 181.8125, "completions/mean_terminated_length": 181.8125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.6407382992748847, "grad_norm": 4.268972396850586, "kl": 0.28515625, "learning_rate": 6.79419525065963e-07, "loss": 0.0366, "num_tokens": 267587559.0, "reward": 1.4375, "reward_std": 0.33631250262260437, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.47789715230464935, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4595021605491638, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 171.03125, "completions/mean_terminated_length": 171.03125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.6420566908371786, "grad_norm": 2.887223243713379, "kl": 0.37353515625, "learning_rate": 6.787598944591028e-07, "loss": 0.0067, "num_tokens": 268109924.0, "reward": 1.59375, "reward_std": 0.30585649609565735, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4215090572834015, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.5, "completions/max_terminated_length": 311.5, "completions/mean_length": 190.8125, "completions/mean_terminated_length": 190.8125, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.6433750823994726, "grad_norm": 1.708441138267517, "kl": 0.294921875, "learning_rate": 6.781002638522427e-07, "loss": 0.0015, "num_tokens": 268679602.0, "reward": 1.4765625, "reward_std": 0.32903049886226654, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 191.265625, "completions/mean_terminated_length": 191.265625, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "epoch": 0.6446934739617667, "grad_norm": 1.2935012578964233, "kl": 0.24365234375, "learning_rate": 6.774406332453826e-07, "loss": 0.0012, "num_tokens": 269204783.0, "reward": 1.4140625, "reward_std": 0.3203144669532776, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.5, "completions/max_terminated_length": 306.5, "completions/mean_length": 194.046875, "completions/mean_terminated_length": 194.046875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6460118655240606, "grad_norm": 1.5360445976257324, "kl": 0.1845703125, "learning_rate": 6.767810026385225e-07, "loss": 0.0009, "num_tokens": 269755898.0, "reward": 1.4765625, "reward_std": 0.33258767426013947, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4640069603919983, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.48139922320842743, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 194.265625, "completions/mean_terminated_length": 194.265625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6473302570863546, "grad_norm": 3.4695653915405273, "kl": 0.3994140625, "learning_rate": 6.761213720316622e-07, "loss": 0.002, "num_tokens": 270322703.0, "reward": 1.515625, "reward_std": 0.40298840403556824, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.45028693974018097, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4739709198474884, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "epoch": 0.6486486486486487, "grad_norm": 2.510535478591919, "kl": 0.24560546875, "learning_rate": 6.75461741424802e-07, "loss": 0.0012, "num_tokens": 270874234.0, "reward": 1.5546875, "reward_std": 0.35310834646224976, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.43038569390773773, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.48190538585186005, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 181.484375, "completions/mean_terminated_length": 181.484375, "completions/min_length": 122.5, "completions/min_terminated_length": 122.5, "epoch": 0.6499670402109426, "grad_norm": 4.487628936767578, "kl": 0.2568359375, "learning_rate": 6.748021108179419e-07, "loss": -0.0261, "num_tokens": 271425718.0, "reward": 1.6171875, "reward_std": 0.32174310088157654, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45543521642684937, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.5, "completions/max_terminated_length": 290.5, "completions/mean_length": 202.515625, "completions/mean_terminated_length": 202.515625, "completions/min_length": 147.5, "completions/min_terminated_length": 147.5, "epoch": 0.6512854317732366, "grad_norm": 1.6426961421966553, "kl": 0.220703125, "learning_rate": 6.741424802110817e-07, "loss": 0.0226, "num_tokens": 271964811.0, "reward": 1.6328125, "reward_std": 0.2498924881219864, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.31679005175828934, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 186.984375, "completions/mean_terminated_length": 186.984375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6526038233355307, "grad_norm": 1.2822444438934326, "kl": 0.244140625, "learning_rate": 6.734828496042217e-07, "loss": 0.0002, "num_tokens": 272519406.0, "reward": 1.4453125, "reward_std": 0.2703554183244705, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.5, "completions/max_terminated_length": 264.5, "completions/mean_length": 196.671875, "completions/mean_terminated_length": 196.671875, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "epoch": 0.6539222148978246, "grad_norm": 1.5976475477218628, "kl": 0.30517578125, "learning_rate": 6.728232189973615e-07, "loss": 0.0045, "num_tokens": 273070743.0, "reward": 1.4765625, "reward_std": 0.3046937882900238, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.5, "completions/max_terminated_length": 306.5, "completions/mean_length": 197.046875, "completions/mean_terminated_length": 197.046875, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "epoch": 0.6552406064601186, "grad_norm": 1.5392166376113892, "kl": 0.2734375, "learning_rate": 6.721635883905013e-07, "loss": -0.0055, "num_tokens": 273625389.0, "reward": 1.6015625, "reward_std": 0.24574294686317444, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4199155569076538, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 202.34375, "completions/mean_terminated_length": 202.34375, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "epoch": 0.6565589980224127, "grad_norm": 1.1978362798690796, "kl": 0.18505859375, "learning_rate": 6.715039577836411e-07, "loss": 0.0068, "num_tokens": 274159958.0, "reward": 1.5625, "reward_std": 0.33430835604667664, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.46125002205371857, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.5, "completions/max_terminated_length": 333.5, "completions/mean_length": 192.015625, "completions/mean_terminated_length": 192.015625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6578773895847067, "grad_norm": 0.9920403957366943, "kl": 0.2177734375, "learning_rate": 6.708443271767809e-07, "loss": 0.003, "num_tokens": 274700982.0, "reward": 1.5859375, "reward_std": 0.18666287511587143, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.45227913558483124, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 195.046875, "completions/mean_terminated_length": 195.046875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6591957811470006, "grad_norm": 3.150437593460083, "kl": 0.3271484375, "learning_rate": 6.701846965699208e-07, "loss": 0.0153, "num_tokens": 275266479.0, "reward": 1.5703125, "reward_std": 0.3730771690607071, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45128606259822845, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "epoch": 0.6605141727092947, "grad_norm": 4.8466644287109375, "kl": 0.6328125, "learning_rate": 6.695250659630607e-07, "loss": 0.0032, "num_tokens": 275839235.0, "reward": 1.421875, "reward_std": 0.41514208912849426, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.5, "completions/max_terminated_length": 327.5, "completions/mean_length": 195.703125, "completions/mean_terminated_length": 195.703125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.6618325642715887, "grad_norm": 1.6105886697769165, "kl": 0.30078125, "learning_rate": 6.688654353562006e-07, "loss": -0.0092, "num_tokens": 276388092.0, "reward": 1.21875, "reward_std": 0.38499633967876434, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.3689020276069641, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 201.4375, "completions/mean_terminated_length": 201.4375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6631509558338826, "grad_norm": 8.178218841552734, "kl": 0.2490234375, "learning_rate": 6.682058047493403e-07, "loss": -0.0163, "num_tokens": 276931083.0, "reward": 1.5703125, "reward_std": 0.25120531022548676, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.4128527194261551, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 186.625, "completions/mean_terminated_length": 186.625, "completions/min_length": 148.5, "completions/min_terminated_length": 148.5, "epoch": 0.6644693473961767, "grad_norm": 1.728042483329773, "kl": 0.216796875, "learning_rate": 6.675461741424801e-07, "loss": -0.0194, "num_tokens": 277502299.0, "reward": 1.65625, "reward_std": 0.27271444350481033, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.609375, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.5, "completions/max_terminated_length": 321.5, "completions/mean_length": 194.984375, "completions/mean_terminated_length": 194.984375, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "epoch": 0.6657877389584707, "grad_norm": 2.563209056854248, "kl": 0.24072265625, "learning_rate": 6.6688654353562e-07, "loss": 0.0022, "num_tokens": 278018778.0, "reward": 1.3515625, "reward_std": 0.29343587160110474, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.1480722874403, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.5, "completions/max_terminated_length": 290.5, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "epoch": 0.6671061305207646, "grad_norm": 1.996938705444336, "kl": 0.37353515625, "learning_rate": 6.662269129287598e-07, "loss": 0.0331, "num_tokens": 278584716.0, "reward": 1.609375, "reward_std": 0.39886149764060974, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.420013427734375, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.49899089336395264, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 202.53125, "completions/mean_terminated_length": 202.53125, "completions/min_length": 140.5, "completions/min_terminated_length": 140.5, "epoch": 0.6684245220830587, "grad_norm": 3.078827142715454, "kl": 0.25146484375, "learning_rate": 6.655672823218998e-07, "loss": -0.0046, "num_tokens": 279152920.0, "reward": 1.4765625, "reward_std": 0.3767327517271042, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.5, "completions/max_terminated_length": 335.5, "completions/mean_length": 196.1875, "completions/mean_terminated_length": 196.1875, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "epoch": 0.6697429136453527, "grad_norm": 3.879014253616333, "kl": 0.25, "learning_rate": 6.649076517150396e-07, "loss": 0.0013, "num_tokens": 279686665.0, "reward": 1.2890625, "reward_std": 0.42719706892967224, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.41394005715847015, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.18445101380348206, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 199.484375, "completions/mean_terminated_length": 199.484375, "completions/min_length": 134.5, "completions/min_terminated_length": 134.5, "epoch": 0.6710613052076466, "grad_norm": 3.6641976833343506, "kl": 0.39990234375, "learning_rate": 6.642480211081794e-07, "loss": 0.0362, "num_tokens": 280248547.0, "reward": 1.4765625, "reward_std": 0.2761355936527252, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 187.4375, "completions/mean_terminated_length": 187.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6723796967699407, "grad_norm": 4.185696601867676, "kl": 0.28173828125, "learning_rate": 6.635883905013192e-07, "loss": 0.0043, "num_tokens": 280820714.0, "reward": 1.390625, "reward_std": 0.34346452355384827, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.5, "completions/max_terminated_length": 363.5, "completions/mean_length": 203.46875, "completions/mean_terminated_length": 203.46875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6736980883322347, "grad_norm": 3.056499481201172, "kl": 0.2041015625, "learning_rate": 6.62928759894459e-07, "loss": 0.0059, "num_tokens": 281374612.0, "reward": 1.546875, "reward_std": 0.3287336230278015, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.5, "completions/max_terminated_length": 276.5, "completions/mean_length": 184.5625, "completions/mean_terminated_length": 184.5625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.6750164798945286, "grad_norm": 1.8957206010818481, "kl": 0.3466796875, "learning_rate": 6.62269129287599e-07, "loss": -0.01, "num_tokens": 281918223.0, "reward": 1.359375, "reward_std": 0.43043912947177887, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.507007360458374, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.45680341124534607, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 194.859375, "completions/mean_terminated_length": 194.859375, "completions/min_length": 143.5, "completions/min_terminated_length": 143.5, "epoch": 0.6763348714568227, "grad_norm": 1.565251350402832, "kl": 0.19921875, "learning_rate": 6.616094986807388e-07, "loss": -0.0049, "num_tokens": 282457140.0, "reward": 1.5703125, "reward_std": 0.3581793010234833, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.4442135691642761, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.5, "completions/max_terminated_length": 243.5, "completions/mean_length": 187.046875, "completions/mean_terminated_length": 187.046875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6776532630191167, "grad_norm": 2.93910813331604, "kl": 0.255859375, "learning_rate": 6.609498680738787e-07, "loss": 0.0013, "num_tokens": 283000201.0, "reward": 1.5625, "reward_std": 0.3267960846424103, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41824956238269806, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 185.796875, "completions/mean_terminated_length": 185.796875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6789716545814107, "grad_norm": 1.4944239854812622, "kl": 0.2470703125, "learning_rate": 6.602902374670184e-07, "loss": 0.0256, "num_tokens": 283537224.0, "reward": 1.625, "reward_std": 0.2166680172085762, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.45028693974018097, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.5, "completions/max_terminated_length": 306.5, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6802900461437047, "grad_norm": 1.4646445512771606, "kl": 0.23828125, "learning_rate": 6.596306068601582e-07, "loss": 0.0012, "num_tokens": 284080422.0, "reward": 1.5078125, "reward_std": 0.31997618079185486, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47789715230464935, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.5, "completions/max_terminated_length": 244.5, "completions/mean_length": 179.515625, "completions/mean_terminated_length": 179.515625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.6816084377059987, "grad_norm": 3.4005255699157715, "kl": 0.17626953125, "learning_rate": 6.589709762532981e-07, "loss": -0.0196, "num_tokens": 284619184.0, "reward": 1.65625, "reward_std": 0.17289285361766815, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.37647102028131485, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.46946612000465393, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.5, "completions/max_terminated_length": 245.5, "completions/mean_length": 176.9375, "completions/mean_terminated_length": 176.9375, "completions/min_length": 123.5, "completions/min_terminated_length": 123.5, "epoch": 0.6829268292682927, "grad_norm": 1.7594887018203735, "kl": 0.4150390625, "learning_rate": 6.58311345646438e-07, "loss": 0.0001, "num_tokens": 285185340.0, "reward": 1.3984375, "reward_std": 0.2630535662174225, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.47789715230464935, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.45178256928920746, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.5, "completions/max_terminated_length": 286.5, "completions/mean_length": 203.78125, "completions/mean_terminated_length": 203.78125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6842452208305867, "grad_norm": 1.5557811260223389, "kl": 0.2705078125, "learning_rate": 6.576517150395779e-07, "loss": -0.0231, "num_tokens": 285677499.0, "reward": 1.328125, "reward_std": 0.22152934968471527, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.3423885926604271, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.5, "completions/max_terminated_length": 260.5, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.6855636123928807, "grad_norm": 2.507319688796997, "kl": 0.177734375, "learning_rate": 6.569920844327177e-07, "loss": 0.0136, "num_tokens": 286212581.0, "reward": 1.5703125, "reward_std": 0.36960369348526, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.5080004930496216, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.5, "completions/max_terminated_length": 267.5, "completions/mean_length": 183.890625, "completions/mean_terminated_length": 183.890625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.6868820039551747, "grad_norm": 2.6003432273864746, "kl": 0.21728515625, "learning_rate": 6.563324538258574e-07, "loss": 0.0011, "num_tokens": 286776745.0, "reward": 1.6171875, "reward_std": 0.33651866018772125, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4299773871898651, "rewards/counterfactual_reasoning_reward/mean": 0.46875, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.5, "completions/max_terminated_length": 243.5, "completions/mean_length": 166.59375, "completions/mean_terminated_length": 166.59375, "completions/min_length": 77.5, "completions/min_terminated_length": 77.5, "epoch": 0.6882003955174687, "grad_norm": 11.587615966796875, "kl": 0.28955078125, "learning_rate": 6.556728232189973e-07, "loss": 0.0063, "num_tokens": 287304015.0, "reward": 1.6015625, "reward_std": 0.22839760035276413, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.24127934873104095, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.48946478962898254, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.5, "completions/max_terminated_length": 314.5, "completions/mean_length": 193.09375, "completions/mean_terminated_length": 193.09375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.6895187870797627, "grad_norm": 1.4816704988479614, "kl": 0.2021484375, "learning_rate": 6.550131926121371e-07, "loss": 0.002, "num_tokens": 287887065.0, "reward": 1.359375, "reward_std": 0.2491578906774521, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4337434321641922, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 178.0625, "completions/mean_terminated_length": 178.0625, "completions/min_length": 132.5, "completions/min_terminated_length": 132.5, "epoch": 0.6908371786420567, "grad_norm": 2.558044195175171, "kl": 0.21044921875, "learning_rate": 6.54353562005277e-07, "loss": 0.0011, "num_tokens": 288437815.0, "reward": 1.4765625, "reward_std": 0.39782536029815674, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.48946478962898254, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 178.359375, "completions/mean_terminated_length": 178.359375, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "epoch": 0.6921555702043507, "grad_norm": 2.4954590797424316, "kl": 0.2177734375, "learning_rate": 6.536939313984169e-07, "loss": -0.0057, "num_tokens": 289020923.0, "reward": 1.5703125, "reward_std": 0.3616732209920883, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4595021605491638, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.5075039267539978, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.5, "completions/max_terminated_length": 285.5, "completions/mean_length": 179.859375, "completions/mean_terminated_length": 179.859375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.6934739617666447, "grad_norm": 1.5468690395355225, "kl": 0.26806640625, "learning_rate": 6.530343007915568e-07, "loss": -0.0172, "num_tokens": 289598139.0, "reward": 1.40625, "reward_std": 0.3560083508491516, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.462014764547348, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 178.828125, "completions/mean_terminated_length": 178.828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.6947923533289387, "grad_norm": 1.8073817491531372, "kl": 0.21875, "learning_rate": 6.523746701846965e-07, "loss": -0.0106, "num_tokens": 290141751.0, "reward": 1.6015625, "reward_std": 0.31965357065200806, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4299773871898651, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 192.21875, "completions/mean_terminated_length": 192.21875, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.6961107448912327, "grad_norm": 1.4081957340240479, "kl": 0.22900390625, "learning_rate": 6.517150395778363e-07, "loss": -0.0096, "num_tokens": 290719560.0, "reward": 1.5, "reward_std": 0.30325107276439667, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.46946612000465393, "rewards/counterfactual_reasoning_reward/mean": 0.453125, "rewards/counterfactual_reasoning_reward/std": 0.4743363857269287, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 184.921875, "completions/mean_terminated_length": 184.921875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6974291364535267, "grad_norm": 1.315429925918579, "kl": 0.24072265625, "learning_rate": 6.510554089709762e-07, "loss": 0.0012, "num_tokens": 291310417.0, "reward": 1.65625, "reward_std": 0.27203021198511124, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.4266805946826935, "rewards/counterfactual_reasoning_reward/mean": 0.578125, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 183.859375, "completions/mean_terminated_length": 183.859375, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "epoch": 0.6987475280158207, "grad_norm": 1.213947057723999, "kl": 0.21728515625, "learning_rate": 6.50395778364116e-07, "loss": -0.0018, "num_tokens": 291861933.0, "reward": 1.40625, "reward_std": 0.25099294632673264, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.44837237894535065, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.37647102028131485, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.5, "completions/max_terminated_length": 262.5, "completions/mean_length": 176.296875, "completions/mean_terminated_length": 176.296875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7000659195781147, "grad_norm": 2.085965871810913, "kl": 0.27392578125, "learning_rate": 6.49736147757256e-07, "loss": 0.0375, "num_tokens": 292423683.0, "reward": 1.5546875, "reward_std": 0.1920287311077118, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4659053534269333, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4395582377910614, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.5, "completions/max_terminated_length": 264.5, "completions/mean_length": 168.8125, "completions/mean_terminated_length": 168.8125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7013843111404087, "grad_norm": 1.4200191497802734, "kl": 0.23193359375, "learning_rate": 6.490765171503958e-07, "loss": 0.0012, "num_tokens": 293020336.0, "reward": 1.609375, "reward_std": 0.3285793662071228, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.4339464604854584, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.4979427307844162, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 171.375, "completions/mean_terminated_length": 171.375, "completions/min_length": 120.5, "completions/min_terminated_length": 120.5, "epoch": 0.7027027027027027, "grad_norm": 2.29695725440979, "kl": 0.21044921875, "learning_rate": 6.484168865435355e-07, "loss": -0.0019, "num_tokens": 293562822.0, "reward": 1.46875, "reward_std": 0.3451269268989563, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 171.640625, "completions/mean_terminated_length": 171.640625, "completions/min_length": 108.5, "completions/min_terminated_length": 108.5, "epoch": 0.7040210942649967, "grad_norm": 6.265994548797607, "kl": 0.3203125, "learning_rate": 6.477572559366754e-07, "loss": 0.0016, "num_tokens": 294147442.0, "reward": 1.4140625, "reward_std": 0.5032328963279724, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4932873994112015, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.48946478962898254, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.24593468010425568, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.5, "completions/max_terminated_length": 222.5, "completions/mean_length": 166.546875, "completions/mean_terminated_length": 166.546875, "completions/min_length": 126.5, "completions/min_terminated_length": 126.5, "epoch": 0.7053394858272907, "grad_norm": 2.4909656047821045, "kl": 0.3515625, "learning_rate": 6.470976253298152e-07, "loss": -0.0031, "num_tokens": 294685563.0, "reward": 1.3984375, "reward_std": 0.36846210062503815, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4559413939714432, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.43795469403266907, "rewards/multiturn_format_reward/mean": 0.953125, "rewards/multiturn_format_reward/std": 0.21135568618774414, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 168.71875, "completions/mean_terminated_length": 168.71875, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "epoch": 0.7066578773895847, "grad_norm": 5.961313247680664, "kl": 1.3310546875, "learning_rate": 6.464379947229552e-07, "loss": -0.0197, "num_tokens": 295220081.0, "reward": 1.4375, "reward_std": 0.3974643647670746, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48896822333335876, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 175.34375, "completions/mean_terminated_length": 175.34375, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.7079762689518787, "grad_norm": 2.0524208545684814, "kl": 0.3349609375, "learning_rate": 6.45778364116095e-07, "loss": -0.0091, "num_tokens": 295756029.0, "reward": 1.453125, "reward_std": 0.3455982208251953, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 0.9375, "rewards/multiturn_format_reward/std": 0.2364606335759163, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.5, "completions/max_terminated_length": 302.5, "completions/mean_length": 173.140625, "completions/mean_terminated_length": 173.140625, "completions/min_length": 76.5, "completions/min_terminated_length": 76.5, "epoch": 0.7092946605141727, "grad_norm": 1.412907600402832, "kl": 0.244140625, "learning_rate": 6.451187335092349e-07, "loss": 0.0012, "num_tokens": 296331548.0, "reward": 1.578125, "reward_std": 0.31396010518074036, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.45028693974018097, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.507007360458374, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 171.21875, "completions/mean_terminated_length": 171.21875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.7106130520764667, "grad_norm": 2.1411445140838623, "kl": 0.33203125, "learning_rate": 6.444591029023746e-07, "loss": 0.0017, "num_tokens": 296901877.0, "reward": 1.4609375, "reward_std": 0.4301883578300476, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.43795469403266907, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4395582377910614, "rewards/multiturn_format_reward/mean": 0.921875, "rewards/multiturn_format_reward/std": 0.2710396274924278, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 170.546875, "completions/mean_terminated_length": 170.546875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7119314436387607, "grad_norm": 2.7832796573638916, "kl": 0.365234375, "learning_rate": 6.437994722955144e-07, "loss": -0.0031, "num_tokens": 297418579.0, "reward": 1.46875, "reward_std": 0.28766903281211853, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 186.34375, "completions/mean_terminated_length": 186.34375, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "epoch": 0.7132498352010547, "grad_norm": 4.544503688812256, "kl": 0.3603515625, "learning_rate": 6.431398416886543e-07, "loss": -0.0148, "num_tokens": 297976805.0, "reward": 1.484375, "reward_std": 0.35286714136600494, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.45178256928920746, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.45178256928920746, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.5, "completions/max_terminated_length": 349.5, "completions/mean_length": 188.609375, "completions/mean_terminated_length": 188.609375, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "epoch": 0.7145682267633487, "grad_norm": 0.9434488415718079, "kl": 0.19140625, "learning_rate": 6.424802110817942e-07, "loss": 0.001, "num_tokens": 298485875.0, "reward": 1.4296875, "reward_std": 0.3506094664335251, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.5034956932067871, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.4709290862083435, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.5, "completions/max_terminated_length": 240.5, "completions/mean_length": 175.578125, "completions/mean_terminated_length": 175.578125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7158866183256427, "grad_norm": 2.3821330070495605, "kl": 0.419921875, "learning_rate": 6.418205804749341e-07, "loss": 0.005, "num_tokens": 299036914.0, "reward": 1.484375, "reward_std": 0.3516102284193039, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.47197872400283813, "rewards/counterfactual_reasoning_reward/mean": 0.4375, "rewards/counterfactual_reasoning_reward/std": 0.49478302896022797, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 146.5, "completions/min_terminated_length": 146.5, "epoch": 0.7172050098879367, "grad_norm": 2.1543655395507812, "kl": 0.25146484375, "learning_rate": 6.411609498680739e-07, "loss": 0.0179, "num_tokens": 299582973.0, "reward": 1.6796875, "reward_std": 0.2862573638558388, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41824956238269806, "rewards/counterfactual_reasoning_reward/mean": 0.609375, "rewards/counterfactual_reasoning_reward/std": 0.495430126786232, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 207.609375, "completions/mean_terminated_length": 207.609375, "completions/min_length": 147.5, "completions/min_terminated_length": 147.5, "epoch": 0.7185234014502307, "grad_norm": 2.1049039363861084, "kl": 0.240234375, "learning_rate": 6.405013192612136e-07, "loss": 0.0012, "num_tokens": 300125927.0, "reward": 1.40625, "reward_std": 0.3531196266412735, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4215090572834015, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.5, "completions/max_terminated_length": 315.5, "completions/mean_length": 200.15625, "completions/mean_terminated_length": 200.15625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7198417930125247, "grad_norm": 8.785233497619629, "kl": 1.3271484375, "learning_rate": 6.398416886543535e-07, "loss": -0.0051, "num_tokens": 300665812.0, "reward": 1.515625, "reward_std": 0.24610909074544907, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.4825586974620819, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4874725937843323, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 211.046875, "completions/mean_terminated_length": 211.046875, "completions/min_length": 151.5, "completions/min_terminated_length": 151.5, "epoch": 0.7211601845748187, "grad_norm": 1.5978049039840698, "kl": 0.24365234375, "learning_rate": 6.391820580474933e-07, "loss": -0.0105, "num_tokens": 301239259.0, "reward": 1.375, "reward_std": 0.25512686371803284, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.45028693974018097, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.5, "completions/max_terminated_length": 342.5, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 103.5, "completions/min_terminated_length": 103.5, "epoch": 0.7224785761371127, "grad_norm": 2.5635671615600586, "kl": 0.19677734375, "learning_rate": 6.385224274406333e-07, "loss": -0.0029, "num_tokens": 301786548.0, "reward": 1.625, "reward_std": 0.15517596900463104, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.546875, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 202.8125, "completions/mean_terminated_length": 202.8125, "completions/min_length": 147.5, "completions/min_terminated_length": 147.5, "epoch": 0.7237969676994067, "grad_norm": 1.4623454809188843, "kl": 0.22119140625, "learning_rate": 6.378627968337731e-07, "loss": 0.0128, "num_tokens": 302323760.0, "reward": 1.40625, "reward_std": 0.3143366128206253, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.45178256928920746, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.5, "completions/max_terminated_length": 345.5, "completions/mean_length": 213.96875, "completions/mean_terminated_length": 213.96875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7251153592617007, "grad_norm": 11.371781349182129, "kl": 0.18701171875, "learning_rate": 6.37203166226913e-07, "loss": 0.0009, "num_tokens": 302853381.0, "reward": 1.3203125, "reward_std": 0.31018537282943726, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.171875, "rewards/counterfactual_reasoning_reward/std": 0.3827299028635025, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.5, "completions/max_terminated_length": 325.5, "completions/mean_length": 195.90625, "completions/mean_terminated_length": 195.90625, "completions/min_length": 115.5, "completions/min_terminated_length": 115.5, "epoch": 0.7264337508239948, "grad_norm": 1.0941047668457031, "kl": 0.20654296875, "learning_rate": 6.365435356200527e-07, "loss": 0.0206, "num_tokens": 303403979.0, "reward": 1.5390625, "reward_std": 0.29489797353744507, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.48190538585186005, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.48721402883529663, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 199.34375, "completions/mean_terminated_length": 199.34375, "completions/min_length": 145.5, "completions/min_terminated_length": 145.5, "epoch": 0.7277521423862887, "grad_norm": 1.0739789009094238, "kl": 0.18603515625, "learning_rate": 6.358839050131925e-07, "loss": -0.002, "num_tokens": 303922420.0, "reward": 1.3671875, "reward_std": 0.24952403455972672, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.4979427307844162, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40828560292720795, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.5, "completions/max_terminated_length": 353.5, "completions/mean_length": 208.9375, "completions/mean_terminated_length": 208.9375, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "epoch": 0.7290705339485827, "grad_norm": 2.184251070022583, "kl": 0.18212890625, "learning_rate": 6.352242744063324e-07, "loss": 0.0302, "num_tokens": 304503270.0, "reward": 1.6796875, "reward_std": 0.23082757741212845, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.44837237894535065, "rewards/counterfactual_reasoning_reward/mean": 0.65625, "rewards/counterfactual_reasoning_reward/std": 0.4825586974620819, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 212.296875, "completions/mean_terminated_length": 212.296875, "completions/min_length": 149.5, "completions/min_terminated_length": 149.5, "epoch": 0.7303889255108768, "grad_norm": 2.1874752044677734, "kl": 0.1982421875, "learning_rate": 6.345646437994723e-07, "loss": 0.0195, "num_tokens": 305021435.0, "reward": 1.53125, "reward_std": 0.2563588172197342, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48495998978614807, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.4994383603334427, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 207.828125, "completions/mean_terminated_length": 207.828125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7317073170731707, "grad_norm": 1.2017539739608765, "kl": 0.1884765625, "learning_rate": 6.339050131926122e-07, "loss": -0.0108, "num_tokens": 305585701.0, "reward": 1.4296875, "reward_std": 0.25875162333250046, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 205.328125, "completions/mean_terminated_length": 205.328125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.7330257086354647, "grad_norm": 1.0368732213974, "kl": 0.16455078125, "learning_rate": 6.33245382585752e-07, "loss": 0.0008, "num_tokens": 306147235.0, "reward": 1.625, "reward_std": 0.2768217474222183, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.37497539073228836, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.4709290862083435, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 213.46875, "completions/mean_terminated_length": 213.46875, "completions/min_length": 151.5, "completions/min_terminated_length": 151.5, "epoch": 0.7343441001977588, "grad_norm": 1.9793081283569336, "kl": 0.2138671875, "learning_rate": 6.325857519788917e-07, "loss": 0.002, "num_tokens": 306689822.0, "reward": 1.5234375, "reward_std": 0.317100465297699, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 194.046875, "completions/mean_terminated_length": 194.046875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7356624917600527, "grad_norm": 1.0733784437179565, "kl": 0.18359375, "learning_rate": 6.319261213720316e-07, "loss": 0.0009, "num_tokens": 307233889.0, "reward": 1.3984375, "reward_std": 0.2613219991326332, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5075039267539978, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.469681054353714, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 197.640625, "completions/mean_terminated_length": 197.640625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.7369808833223468, "grad_norm": 1.1569814682006836, "kl": 0.19482421875, "learning_rate": 6.312664907651714e-07, "loss": 0.0029, "num_tokens": 307801079.0, "reward": 1.4375, "reward_std": 0.23769184201955795, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.48040975630283356, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.5, "completions/max_terminated_length": 276.5, "completions/mean_length": 208.390625, "completions/mean_terminated_length": 208.390625, "completions/min_length": 146.5, "completions/min_terminated_length": 146.5, "epoch": 0.7382992748846408, "grad_norm": 4.060904026031494, "kl": 0.490234375, "learning_rate": 6.306068601583114e-07, "loss": 0.0034, "num_tokens": 308352174.0, "reward": 1.296875, "reward_std": 0.3807689696550369, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.203125, "rewards/counterfactual_reasoning_reward/std": 0.40828560292720795, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.12296734005212784, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.5, "completions/max_terminated_length": 263.5, "completions/mean_length": 185.84375, "completions/mean_terminated_length": 185.84375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.7396176664469347, "grad_norm": 1.4472342729568481, "kl": 0.181640625, "learning_rate": 6.299472295514512e-07, "loss": -0.0098, "num_tokens": 308902852.0, "reward": 1.609375, "reward_std": 0.28664615005254745, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.44777433574199677, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.5055117309093475, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.5, "completions/max_terminated_length": 297.5, "completions/mean_length": 184.234375, "completions/mean_terminated_length": 184.234375, "completions/min_length": 98.5, "completions/min_terminated_length": 98.5, "epoch": 0.7409360580092288, "grad_norm": 2.095546245574951, "kl": 0.19775390625, "learning_rate": 6.292875989445911e-07, "loss": -0.0303, "num_tokens": 309443631.0, "reward": 1.4140625, "reward_std": 0.29189829528331757, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.4709290862083435, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4824019521474838, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.5, "completions/max_terminated_length": 291.5, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.7422544495715228, "grad_norm": 2.869626522064209, "kl": 0.28515625, "learning_rate": 6.286279683377308e-07, "loss": 0.0014, "num_tokens": 310004029.0, "reward": 1.40625, "reward_std": 0.4220409691333771, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.4767438918352127, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7435728411338167, "grad_norm": 4.997684001922607, "kl": 0.203125, "learning_rate": 6.279683377308706e-07, "loss": -0.0029, "num_tokens": 310558961.0, "reward": 1.484375, "reward_std": 0.30312399566173553, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.46946612000465393, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 136.5, "completions/min_terminated_length": 136.5, "epoch": 0.7448912326961108, "grad_norm": 1.739221453666687, "kl": 0.2109375, "learning_rate": 6.273087071240105e-07, "loss": 0.0011, "num_tokens": 311100520.0, "reward": 1.6171875, "reward_std": 0.3898201584815979, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4215090572834015, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.4659053534269333, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.5, "completions/max_terminated_length": 306.5, "completions/mean_length": 198.09375, "completions/mean_terminated_length": 198.09375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.7462096242584048, "grad_norm": 2.672062873840332, "kl": 0.54296875, "learning_rate": 6.266490765171504e-07, "loss": 0.0193, "num_tokens": 311646222.0, "reward": 1.4921875, "reward_std": 0.20812273770570755, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.45227913558483124, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.45178256928920746, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7475280158206987, "grad_norm": 2.2101187705993652, "kl": 0.234375, "learning_rate": 6.259894459102903e-07, "loss": 0.0012, "num_tokens": 312174871.0, "reward": 1.2109375, "reward_std": 0.25621990859508514, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.4767438918352127, "rewards/counterfactual_reasoning_reward/mean": 0.125, "rewards/counterfactual_reasoning_reward/std": 0.33252330124378204, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.5, "completions/max_terminated_length": 353.5, "completions/mean_length": 205.5625, "completions/mean_terminated_length": 205.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7488464073829928, "grad_norm": 1.6525242328643799, "kl": 0.22265625, "learning_rate": 6.253298153034301e-07, "loss": -0.0253, "num_tokens": 312720507.0, "reward": 1.5, "reward_std": 0.26733341813087463, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.49899089336395264, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 196.234375, "completions/mean_terminated_length": 196.234375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.7501647989452868, "grad_norm": 2.1004812717437744, "kl": 0.20654296875, "learning_rate": 6.246701846965698e-07, "loss": -0.0009, "num_tokens": 313268436.0, "reward": 1.4453125, "reward_std": 0.37209658324718475, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.4442135691642761, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.37246278673410416, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 188.859375, "completions/mean_terminated_length": 188.859375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7514831905075807, "grad_norm": 1.5103802680969238, "kl": 0.18408203125, "learning_rate": 6.240105540897097e-07, "loss": -0.0108, "num_tokens": 313828343.0, "reward": 1.5625, "reward_std": 0.3192354589700699, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.504016101360321, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.5, "completions/max_terminated_length": 367.5, "completions/mean_length": 208.140625, "completions/mean_terminated_length": 208.140625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7528015820698748, "grad_norm": 2.0145602226257324, "kl": 0.20947265625, "learning_rate": 6.233509234828495e-07, "loss": -0.0741, "num_tokens": 314366102.0, "reward": 1.140625, "reward_std": 0.14902584999799728, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41824956238269806, "rewards/counterfactual_reasoning_reward/mean": 0.0625, "rewards/counterfactual_reasoning_reward/std": 0.16800537705421448, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.5, "completions/max_terminated_length": 362.5, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 205.0625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7541199736321688, "grad_norm": 2.0406877994537354, "kl": 0.17724609375, "learning_rate": 6.226912928759895e-07, "loss": -0.0089, "num_tokens": 314914580.0, "reward": 1.6328125, "reward_std": 0.31708528846502304, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4399413466453552, "rewards/counterfactual_reasoning_reward/mean": 0.515625, "rewards/counterfactual_reasoning_reward/std": 0.5075039267539978, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 209.421875, "completions/mean_terminated_length": 209.421875, "completions/min_length": 148.5, "completions/min_terminated_length": 148.5, "epoch": 0.7554383651944627, "grad_norm": 1.395274043083191, "kl": 0.1923828125, "learning_rate": 6.220316622691293e-07, "loss": 0.0049, "num_tokens": 315442620.0, "reward": 1.546875, "reward_std": 0.2597545459866524, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4824019521474838, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 227.15625, "completions/mean_terminated_length": 227.15625, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "epoch": 0.7567567567567568, "grad_norm": 1.448084831237793, "kl": 0.16845703125, "learning_rate": 6.213720316622692e-07, "loss": 0.0008, "num_tokens": 315981733.0, "reward": 1.4140625, "reward_std": 0.31098589301109314, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.46125002205371857, "rewards/counterfactual_reasoning_reward/mean": 0.28125, "rewards/counterfactual_reasoning_reward/std": 0.420013427734375, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 188.40625, "completions/mean_terminated_length": 188.40625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.7580751483190508, "grad_norm": 1.552443027496338, "kl": 0.1943359375, "learning_rate": 6.207124010554089e-07, "loss": -0.0029, "num_tokens": 316530128.0, "reward": 1.6953125, "reward_std": 0.30783499777317047, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.3827299028635025, "rewards/counterfactual_reasoning_reward/mean": 0.5625, "rewards/counterfactual_reasoning_reward/std": 0.5029991269111633, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.5, "completions/max_terminated_length": 361.5, "completions/mean_length": 201.8125, "completions/mean_terminated_length": 201.8125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.7593935398813447, "grad_norm": 2.030022382736206, "kl": 0.20361328125, "learning_rate": 6.200527704485487e-07, "loss": 0.001, "num_tokens": 317085278.0, "reward": 1.4375, "reward_std": 0.3396371901035309, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4743363857269287, "rewards/counterfactual_reasoning_reward/mean": 0.328125, "rewards/counterfactual_reasoning_reward/std": 0.43845126032829285, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.5, "completions/max_terminated_length": 360.5, "completions/mean_length": 204.890625, "completions/mean_terminated_length": 204.890625, "completions/min_length": 137.5, "completions/min_terminated_length": 137.5, "epoch": 0.7607119314436388, "grad_norm": 1.3773537874221802, "kl": 0.22021484375, "learning_rate": 6.193931398416886e-07, "loss": 0.0089, "num_tokens": 317615852.0, "reward": 1.375, "reward_std": 0.19157499819993973, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.4874725937843323, "rewards/counterfactual_reasoning_reward/mean": 0.34375, "rewards/counterfactual_reasoning_reward/std": 0.47197872400283813, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.5, "completions/max_terminated_length": 333.5, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 146.5, "completions/min_terminated_length": 146.5, "epoch": 0.7620303230059328, "grad_norm": 1.4876176118850708, "kl": 0.2568359375, "learning_rate": 6.187335092348285e-07, "loss": -0.0065, "num_tokens": 318153509.0, "reward": 1.3984375, "reward_std": 0.36497049033641815, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.3125, "rewards/counterfactual_reasoning_reward/std": 0.45028693974018097, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 203.09375, "completions/mean_terminated_length": 203.09375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7633487145682267, "grad_norm": 1.489472508430481, "kl": 0.1806640625, "learning_rate": 6.180738786279684e-07, "loss": 0.0429, "num_tokens": 318684804.0, "reward": 1.7265625, "reward_std": 0.2619616612792015, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3584318831562996, "rewards/counterfactual_reasoning_reward/mean": 0.640625, "rewards/counterfactual_reasoning_reward/std": 0.4640069603919983, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.5, "completions/max_terminated_length": 420.5, "completions/mean_length": 217.359375, "completions/mean_terminated_length": 217.359375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7646671061305208, "grad_norm": 1.910254955291748, "kl": 0.17431640625, "learning_rate": 6.174142480211082e-07, "loss": 0.0009, "num_tokens": 319238286.0, "reward": 1.3984375, "reward_std": 0.34225544333457947, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4994383603334427, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.3975677341222763, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.5, "completions/max_terminated_length": 382.5, "completions/mean_length": 217.53125, "completions/mean_terminated_length": 217.53125, "completions/min_length": 153.5, "completions/min_terminated_length": 153.5, "epoch": 0.7659854976928148, "grad_norm": 1.284716248512268, "kl": 0.21044921875, "learning_rate": 6.167546174142479e-07, "loss": 0.0196, "num_tokens": 319762414.0, "reward": 1.359375, "reward_std": 0.2921764403581619, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.495430126786232, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.44837237894535065, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.5, "completions/max_terminated_length": 367.5, "completions/mean_length": 211.359375, "completions/mean_terminated_length": 211.359375, "completions/min_length": 141.5, "completions/min_terminated_length": 141.5, "epoch": 0.7673038892551087, "grad_norm": 1.4316078424453735, "kl": 0.173828125, "learning_rate": 6.160949868073878e-07, "loss": 0.0009, "num_tokens": 320297924.0, "reward": 1.5, "reward_std": 0.3242250233888626, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.48139922320842743, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 210.015625, "completions/mean_terminated_length": 210.015625, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "epoch": 0.7686222808174028, "grad_norm": 1.6393672227859497, "kl": 0.20166015625, "learning_rate": 6.154353562005276e-07, "loss": -0.0058, "num_tokens": 320851311.0, "reward": 1.3359375, "reward_std": 0.228536456823349, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.265625, "rewards/counterfactual_reasoning_reward/std": 0.4175008237361908, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 213.1875, "completions/mean_terminated_length": 213.1875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7699406723796968, "grad_norm": 3.455173969268799, "kl": 0.2060546875, "learning_rate": 6.147757255936676e-07, "loss": -0.0058, "num_tokens": 321441085.0, "reward": 1.46875, "reward_std": 0.3873114585876465, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.504016101360321, "rewards/counterfactual_reasoning_reward/mean": 0.375, "rewards/counterfactual_reasoning_reward/std": 0.4907747954130173, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 219.203125, "completions/mean_terminated_length": 219.203125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.7712590639419907, "grad_norm": 1.319526195526123, "kl": 0.18017578125, "learning_rate": 6.141160949868074e-07, "loss": 0.0048, "num_tokens": 321979152.0, "reward": 1.515625, "reward_std": 0.34109005331993103, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 212.21875, "completions/mean_terminated_length": 212.21875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7725774555042848, "grad_norm": 1.4772905111312866, "kl": 0.19921875, "learning_rate": 6.134564643799473e-07, "loss": 0.002, "num_tokens": 322570134.0, "reward": 1.3828125, "reward_std": 0.3481176197528839, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5060082972049713, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4559413939714432, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7738958470665788, "grad_norm": 1.5859249830245972, "kl": 0.1630859375, "learning_rate": 6.12796833773087e-07, "loss": 0.0116, "num_tokens": 323113956.0, "reward": 1.3359375, "reward_std": 0.25151700526475906, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5015034973621368, "rewards/counterfactual_reasoning_reward/mean": 0.15625, "rewards/counterfactual_reasoning_reward/std": 0.34293801337480545, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 219.828125, "completions/mean_terminated_length": 219.828125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7752142386288727, "grad_norm": 2.2318859100341797, "kl": 0.16943359375, "learning_rate": 6.121372031662268e-07, "loss": 0.0008, "num_tokens": 323675443.0, "reward": 1.4921875, "reward_std": 0.36218710243701935, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4907747954130173, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.48896822333335876, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 209.234375, "completions/mean_terminated_length": 209.234375, "completions/min_length": 144.5, "completions/min_terminated_length": 144.5, "epoch": 0.7765326301911668, "grad_norm": 1.932236671447754, "kl": 0.34765625, "learning_rate": 6.114775725593667e-07, "loss": 0.0135, "num_tokens": 324242414.0, "reward": 1.5703125, "reward_std": 0.29260797798633575, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4734743535518646, "rewards/counterfactual_reasoning_reward/mean": 0.5, "rewards/counterfactual_reasoning_reward/std": 0.49899089336395264, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.5, "completions/max_terminated_length": 397.5, "completions/mean_length": 220.15625, "completions/mean_terminated_length": 220.15625, "completions/min_length": 147.5, "completions/min_terminated_length": 147.5, "epoch": 0.7778510217534608, "grad_norm": 3.0366547107696533, "kl": 0.23828125, "learning_rate": 6.108179419525066e-07, "loss": -0.0135, "num_tokens": 324817397.0, "reward": 1.5625, "reward_std": 0.3776901960372925, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.5015034973621368, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 216.09375, "completions/mean_terminated_length": 216.09375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7791694133157547, "grad_norm": 1.256229281425476, "kl": 0.22216796875, "learning_rate": 6.101583113456465e-07, "loss": -0.0008, "num_tokens": 325367094.0, "reward": 1.4296875, "reward_std": 0.25073397159576416, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.504016101360321, "rewards/counterfactual_reasoning_reward/mean": 0.296875, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 205.28125, "completions/mean_terminated_length": 205.28125, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "epoch": 0.7804878048780488, "grad_norm": 1.7207001447677612, "kl": 0.2333984375, "learning_rate": 6.094986807387863e-07, "loss": 0.007, "num_tokens": 325928943.0, "reward": 1.8359375, "reward_std": 0.07996084541082382, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.2199706733226776, "rewards/counterfactual_reasoning_reward/mean": 0.796875, "rewards/counterfactual_reasoning_reward/std": 0.38353683054447174, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 207.6875, "completions/mean_terminated_length": 207.6875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7818061964403428, "grad_norm": 1.821115255355835, "kl": 0.24072265625, "learning_rate": 6.08839050131926e-07, "loss": 0.0227, "num_tokens": 326479246.0, "reward": 1.5625, "reward_std": 0.23111103475093842, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.48040975630283356, "rewards/counterfactual_reasoning_reward/mean": 0.484375, "rewards/counterfactual_reasoning_reward/std": 0.4638662487268448, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 238.90625, "completions/mean_terminated_length": 238.90625, "completions/min_length": 131.5, "completions/min_terminated_length": 131.5, "epoch": 0.7831245880026367, "grad_norm": 1.9429503679275513, "kl": 0.216796875, "learning_rate": 6.081794195250659e-07, "loss": -0.0058, "num_tokens": 327021908.0, "reward": 1.4765625, "reward_std": 0.2974023073911667, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49478302896022797, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.4932873994112015, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.5, "completions/max_terminated_length": 341.5, "completions/mean_length": 203.34375, "completions/mean_terminated_length": 203.34375, "completions/min_length": 128.5, "completions/min_terminated_length": 128.5, "epoch": 0.7844429795649308, "grad_norm": 1.4079331159591675, "kl": 0.2451171875, "learning_rate": 6.075197889182057e-07, "loss": -0.0056, "num_tokens": 327556739.0, "reward": 1.671875, "reward_std": 0.32466430962085724, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.376473993062973, "rewards/counterfactual_reasoning_reward/mean": 0.53125, "rewards/counterfactual_reasoning_reward/std": 0.43840841948986053, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.5, "completions/max_terminated_length": 413.5, "completions/mean_length": 231.421875, "completions/mean_terminated_length": 231.421875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7857613711272248, "grad_norm": 1.4544498920440674, "kl": 0.21728515625, "learning_rate": 6.068601583113457e-07, "loss": -0.0155, "num_tokens": 328155491.0, "reward": 1.4375, "reward_std": 0.2808258533477783, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.4638662487268448, "rewards/counterfactual_reasoning_reward/mean": 0.390625, "rewards/counterfactual_reasoning_reward/std": 0.40928472578525543, "rewards/multiturn_format_reward/mean": 0.984375, "rewards/multiturn_format_reward/std": 0.0883883461356163, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.5, "completions/max_terminated_length": 366.5, "completions/mean_length": 222.734375, "completions/mean_terminated_length": 222.734375, "completions/min_length": 139.5, "completions/min_terminated_length": 139.5, "epoch": 0.7870797626895187, "grad_norm": 1.4901474714279175, "kl": 0.2197265625, "learning_rate": 6.062005277044855e-07, "loss": -0.0087, "num_tokens": 328701420.0, "reward": 1.375, "reward_std": 0.3246610760688782, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.49186936020851135, "rewards/counterfactual_reasoning_reward/mean": 0.25, "rewards/counterfactual_reasoning_reward/std": 0.4399413466453552, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 229.59375, "completions/mean_terminated_length": 229.59375, "completions/min_length": 141.5, "completions/min_terminated_length": 141.5, "epoch": 0.7883981542518128, "grad_norm": 2.6204075813293457, "kl": 0.18798828125, "learning_rate": 6.055408970976254e-07, "loss": 0.0107, "num_tokens": 329242080.0, "reward": 1.5, "reward_std": 0.2878372445702553, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.32385288923978806, "rewards/counterfactual_reasoning_reward/mean": 0.421875, "rewards/counterfactual_reasoning_reward/std": 0.3680429607629776, "rewards/multiturn_format_reward/mean": 0.96875, "rewards/multiturn_format_reward/std": 0.1767766922712326, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 218.859375, "completions/mean_terminated_length": 218.859375, "completions/min_length": 140.5, "completions/min_terminated_length": 140.5, "epoch": 0.7897165458141068, "grad_norm": 1.2710840702056885, "kl": 0.1806640625, "learning_rate": 6.048812664907651e-07, "loss": 0.0009, "num_tokens": 329803357.0, "reward": 1.5390625, "reward_std": 0.3035851716995239, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.37246278673410416, "rewards/counterfactual_reasoning_reward/mean": 0.40625, "rewards/counterfactual_reasoning_reward/std": 0.47197872400283813, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 219.640625, "completions/mean_terminated_length": 219.640625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7910349373764007, "grad_norm": 1.7080672979354858, "kl": 0.2099609375, "learning_rate": 6.042216358839049e-07, "loss": 0.0089, "num_tokens": 330358137.0, "reward": 1.4921875, "reward_std": 0.35067644715309143, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4635103940963745, "rewards/counterfactual_reasoning_reward/mean": 0.359375, "rewards/counterfactual_reasoning_reward/std": 0.48495998978614807, "rewards/multiturn_format_reward/mean": 1.0, "rewards/multiturn_format_reward/std": 0.0, "step": 600 } ], "logging_steps": 1.0, "max_steps": 1516, "num_input_tokens_seen": 330358137, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }