| { | |
| "best_metric": 5.531345844268799, | |
| "best_model_checkpoint": "./results/models/checkpoint-58916", | |
| "epoch": 13.0, | |
| "eval_steps": 500, | |
| "global_step": 58916, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.11032656663724624, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.00199558693733451, | |
| "loss": 6.7817, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.22065313327449249, | |
| "grad_norm": 0.10693359375, | |
| "learning_rate": 0.0019911738746690205, | |
| "loss": 6.1767, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.33097969991173876, | |
| "grad_norm": 0.10595703125, | |
| "learning_rate": 0.0019867608120035306, | |
| "loss": 6.0418, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.44130626654898497, | |
| "grad_norm": 0.08642578125, | |
| "learning_rate": 0.0019823477493380406, | |
| "loss": 5.9811, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5516328331862312, | |
| "grad_norm": 0.11572265625, | |
| "learning_rate": 0.0019779346866725506, | |
| "loss": 5.9448, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6619593998234775, | |
| "grad_norm": 0.07958984375, | |
| "learning_rate": 0.001973521624007061, | |
| "loss": 5.9146, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7722859664607238, | |
| "grad_norm": 0.10205078125, | |
| "learning_rate": 0.001969108561341571, | |
| "loss": 5.8869, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.8826125330979699, | |
| "grad_norm": 0.09814453125, | |
| "learning_rate": 0.001964695498676081, | |
| "loss": 5.8551, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.9929390997352162, | |
| "grad_norm": 0.1025390625, | |
| "learning_rate": 0.001960282436010591, | |
| "loss": 5.8427, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 5.741388320922852, | |
| "eval_runtime": 7.2342, | |
| "eval_samples_per_second": 69.116, | |
| "eval_steps_per_second": 1.106, | |
| "step": 4532 | |
| }, | |
| { | |
| "epoch": 1.1032656663724625, | |
| "grad_norm": 0.09912109375, | |
| "learning_rate": 0.0019558693733451016, | |
| "loss": 5.8385, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.2135922330097086, | |
| "grad_norm": 0.10302734375, | |
| "learning_rate": 0.0019514563106796117, | |
| "loss": 5.8289, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.323918799646955, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.001947043248014122, | |
| "loss": 5.8157, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.4342453662842012, | |
| "grad_norm": 0.11376953125, | |
| "learning_rate": 0.001942630185348632, | |
| "loss": 5.7893, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.5445719329214476, | |
| "grad_norm": 0.1171875, | |
| "learning_rate": 0.0019382171226831422, | |
| "loss": 5.7912, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.6548984995586937, | |
| "grad_norm": 0.119140625, | |
| "learning_rate": 0.0019338040600176522, | |
| "loss": 5.7802, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.7652250661959399, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0019293909973521625, | |
| "loss": 5.7805, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.8755516328331863, | |
| "grad_norm": 0.10693359375, | |
| "learning_rate": 0.0019249779346866725, | |
| "loss": 5.765, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.9858781994704324, | |
| "grad_norm": 0.1064453125, | |
| "learning_rate": 0.0019205648720211827, | |
| "loss": 5.7674, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 5.6664910316467285, | |
| "eval_runtime": 6.8932, | |
| "eval_samples_per_second": 72.535, | |
| "eval_steps_per_second": 1.161, | |
| "step": 9064 | |
| }, | |
| { | |
| "epoch": 2.096204766107679, | |
| "grad_norm": 0.11572265625, | |
| "learning_rate": 0.001916151809355693, | |
| "loss": 5.7568, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.206531332744925, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.001911738746690203, | |
| "loss": 5.7619, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.316857899382171, | |
| "grad_norm": 0.12060546875, | |
| "learning_rate": 0.0019073256840247133, | |
| "loss": 5.7423, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 2.4271844660194173, | |
| "grad_norm": 0.1181640625, | |
| "learning_rate": 0.0019029126213592233, | |
| "loss": 5.7502, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 2.537511032656664, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0018984995586937335, | |
| "loss": 5.7371, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.64783759929391, | |
| "grad_norm": 0.1142578125, | |
| "learning_rate": 0.0018940864960282436, | |
| "loss": 5.7316, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.758164165931156, | |
| "grad_norm": 0.10595703125, | |
| "learning_rate": 0.0018896734333627538, | |
| "loss": 5.7315, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.8684907325684024, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 0.001885260370697264, | |
| "loss": 5.7301, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.978817299205649, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 0.001880847308031774, | |
| "loss": 5.728, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 5.630899429321289, | |
| "eval_runtime": 7.3076, | |
| "eval_samples_per_second": 68.422, | |
| "eval_steps_per_second": 1.095, | |
| "step": 13596 | |
| }, | |
| { | |
| "epoch": 3.089143865842895, | |
| "grad_norm": 0.11572265625, | |
| "learning_rate": 0.001876434245366284, | |
| "loss": 5.724, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 3.1994704324801413, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0018720211827007946, | |
| "loss": 5.7181, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 3.3097969991173875, | |
| "grad_norm": 0.11865234375, | |
| "learning_rate": 0.0018676081200353046, | |
| "loss": 5.7201, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 3.4201235657546336, | |
| "grad_norm": 0.1240234375, | |
| "learning_rate": 0.0018631950573698146, | |
| "loss": 5.718, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 3.5304501323918798, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0018587819947043247, | |
| "loss": 5.6959, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 3.6407766990291264, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.0018543689320388351, | |
| "loss": 5.7112, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 3.7511032656663725, | |
| "grad_norm": 0.1220703125, | |
| "learning_rate": 0.0018499558693733451, | |
| "loss": 5.6986, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 3.8614298323036187, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 0.0018455428067078552, | |
| "loss": 5.7103, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 3.971756398940865, | |
| "grad_norm": 0.12158203125, | |
| "learning_rate": 0.0018411297440423656, | |
| "loss": 5.7054, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 5.601424694061279, | |
| "eval_runtime": 6.7523, | |
| "eval_samples_per_second": 74.049, | |
| "eval_steps_per_second": 1.185, | |
| "step": 18128 | |
| }, | |
| { | |
| "epoch": 4.0820829655781115, | |
| "grad_norm": 0.126953125, | |
| "learning_rate": 0.0018367166813768757, | |
| "loss": 5.6929, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 4.192409532215358, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0018323036187113857, | |
| "loss": 5.6868, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 4.302736098852604, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 0.0018278905560458957, | |
| "loss": 5.6955, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 4.41306266548985, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 0.0018234774933804062, | |
| "loss": 5.6879, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 4.523389232127096, | |
| "grad_norm": 0.16796875, | |
| "learning_rate": 0.0018190644307149162, | |
| "loss": 5.6919, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 4.633715798764342, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 0.0018146513680494262, | |
| "loss": 5.6906, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 4.744042365401588, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 0.0018102383053839365, | |
| "loss": 5.6761, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 4.854368932038835, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0018058252427184467, | |
| "loss": 5.6859, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 4.964695498676081, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 0.0018014121800529568, | |
| "loss": 5.6948, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 5.584114074707031, | |
| "eval_runtime": 6.7803, | |
| "eval_samples_per_second": 73.743, | |
| "eval_steps_per_second": 1.18, | |
| "step": 22660 | |
| }, | |
| { | |
| "epoch": 5.075022065313328, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 0.001796999117387467, | |
| "loss": 5.6773, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 5.185348631950574, | |
| "grad_norm": 0.1494140625, | |
| "learning_rate": 0.001792586054721977, | |
| "loss": 5.6741, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 5.29567519858782, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 0.0017881729920564873, | |
| "loss": 5.6694, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 5.406001765225066, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 0.0017837599293909973, | |
| "loss": 5.6803, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 5.516328331862312, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 0.0017793468667255076, | |
| "loss": 5.6683, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 5.626654898499559, | |
| "grad_norm": 0.1318359375, | |
| "learning_rate": 0.0017749338040600176, | |
| "loss": 5.6717, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 5.736981465136805, | |
| "grad_norm": 0.126953125, | |
| "learning_rate": 0.0017705207413945278, | |
| "loss": 5.671, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 5.847308031774051, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 0.001766107678729038, | |
| "loss": 5.6843, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 5.957634598411297, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 0.0017616946160635481, | |
| "loss": 5.6726, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 5.570712566375732, | |
| "eval_runtime": 6.7662, | |
| "eval_samples_per_second": 73.896, | |
| "eval_steps_per_second": 1.182, | |
| "step": 27192 | |
| }, | |
| { | |
| "epoch": 6.067961165048544, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 0.0017572815533980584, | |
| "loss": 5.6475, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 6.17828773168579, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0017528684907325684, | |
| "loss": 5.6556, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 6.288614298323036, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 0.0017484554280670786, | |
| "loss": 5.6642, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 6.398940864960283, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 0.0017440423654015887, | |
| "loss": 5.6667, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 6.509267431597529, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 0.001739629302736099, | |
| "loss": 5.6611, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 6.619593998234775, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 0.0017352162400706092, | |
| "loss": 5.6541, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 6.729920564872021, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 0.0017308031774051192, | |
| "loss": 5.6624, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 6.840247131509267, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 0.0017263901147396292, | |
| "loss": 5.6582, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 6.950573698146513, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 0.0017219770520741397, | |
| "loss": 5.6548, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 5.559887886047363, | |
| "eval_runtime": 6.8808, | |
| "eval_samples_per_second": 72.666, | |
| "eval_steps_per_second": 1.163, | |
| "step": 31724 | |
| }, | |
| { | |
| "epoch": 7.0609002647837595, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 0.0017175639894086497, | |
| "loss": 5.6465, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 7.171226831421007, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0017131509267431597, | |
| "loss": 5.6571, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 7.281553398058253, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 0.0017087378640776698, | |
| "loss": 5.6475, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 7.391879964695499, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 0.0017043248014121802, | |
| "loss": 5.6503, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 7.502206531332745, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 0.0016999117387466903, | |
| "loss": 5.6645, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 7.612533097969991, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 0.0016954986760812003, | |
| "loss": 5.6403, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 7.722859664607237, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 0.0016910856134157105, | |
| "loss": 5.6528, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 7.8331862312444835, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 0.0016866725507502208, | |
| "loss": 5.6419, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 7.94351279788173, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 0.0016822594880847308, | |
| "loss": 5.6511, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 5.553098678588867, | |
| "eval_runtime": 8.0507, | |
| "eval_samples_per_second": 62.106, | |
| "eval_steps_per_second": 0.994, | |
| "step": 36256 | |
| }, | |
| { | |
| "epoch": 8.053839364518977, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 0.0016778464254192408, | |
| "loss": 5.6478, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 8.164165931156223, | |
| "grad_norm": 0.16796875, | |
| "learning_rate": 0.0016734333627537513, | |
| "loss": 5.6245, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 8.274492497793469, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 0.0016690203000882613, | |
| "loss": 5.6376, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 8.384819064430715, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 0.0016646072374227714, | |
| "loss": 5.6465, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 8.495145631067961, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 0.0016601941747572816, | |
| "loss": 5.6546, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 8.605472197705208, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 0.0016557811120917918, | |
| "loss": 5.6396, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 8.715798764342454, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 0.0016513680494263019, | |
| "loss": 5.6524, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 8.8261253309797, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 0.0016469549867608121, | |
| "loss": 5.6368, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 8.936451897616946, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 0.0016425419240953221, | |
| "loss": 5.6372, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 5.547245979309082, | |
| "eval_runtime": 6.7444, | |
| "eval_samples_per_second": 74.136, | |
| "eval_steps_per_second": 1.186, | |
| "step": 40788 | |
| }, | |
| { | |
| "epoch": 9.046778464254192, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 0.0016381288614298324, | |
| "loss": 5.6392, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 9.157105030891438, | |
| "grad_norm": 0.1845703125, | |
| "learning_rate": 0.0016337157987643424, | |
| "loss": 5.6347, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 9.267431597528685, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 0.0016293027360988527, | |
| "loss": 5.6439, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 9.37775816416593, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 0.0016248896734333627, | |
| "loss": 5.6422, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 9.488084730803177, | |
| "grad_norm": 0.17578125, | |
| "learning_rate": 0.001620476610767873, | |
| "loss": 5.6266, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 9.598411297440423, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 0.0016160635481023832, | |
| "loss": 5.6408, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 9.70873786407767, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0016116504854368932, | |
| "loss": 5.6357, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 9.819064430714917, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 0.0016072374227714032, | |
| "loss": 5.6387, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 9.929390997352161, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 0.0016028243601059135, | |
| "loss": 5.6347, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 5.540464401245117, | |
| "eval_runtime": 7.1327, | |
| "eval_samples_per_second": 70.1, | |
| "eval_steps_per_second": 1.122, | |
| "step": 45320 | |
| }, | |
| { | |
| "epoch": 10.03971756398941, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 0.0015984112974404237, | |
| "loss": 5.6329, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 10.150044130626656, | |
| "grad_norm": 0.1728515625, | |
| "learning_rate": 0.0015939982347749338, | |
| "loss": 5.6368, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 10.260370697263902, | |
| "grad_norm": 0.13671875, | |
| "learning_rate": 0.001589585172109444, | |
| "loss": 5.6292, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 10.370697263901148, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 0.0015851721094439543, | |
| "loss": 5.64, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 10.481023830538394, | |
| "grad_norm": 0.1845703125, | |
| "learning_rate": 0.0015807590467784643, | |
| "loss": 5.63, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 10.59135039717564, | |
| "grad_norm": 0.1826171875, | |
| "learning_rate": 0.0015763459841129743, | |
| "loss": 5.6312, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 10.701676963812886, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 0.0015719329214474848, | |
| "loss": 5.6319, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 10.812003530450133, | |
| "grad_norm": 0.1533203125, | |
| "learning_rate": 0.0015675198587819948, | |
| "loss": 5.6298, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 10.922330097087379, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 0.0015631067961165048, | |
| "loss": 5.6281, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 5.537391662597656, | |
| "eval_runtime": 6.6373, | |
| "eval_samples_per_second": 75.332, | |
| "eval_steps_per_second": 1.205, | |
| "step": 49852 | |
| }, | |
| { | |
| "epoch": 11.032656663724625, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 0.0015586937334510149, | |
| "loss": 5.6283, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 11.142983230361871, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 0.0015542806707855253, | |
| "loss": 5.6227, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 11.253309796999117, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 0.0015498676081200354, | |
| "loss": 5.6277, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 11.363636363636363, | |
| "grad_norm": 0.134765625, | |
| "learning_rate": 0.0015454545454545454, | |
| "loss": 5.6274, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 11.47396293027361, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 0.0015410414827890556, | |
| "loss": 5.6306, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 11.584289496910856, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 0.0015366284201235659, | |
| "loss": 5.63, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 11.694616063548102, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 0.001532215357458076, | |
| "loss": 5.6296, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 11.804942630185348, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 0.001527802294792586, | |
| "loss": 5.6257, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 11.915269196822594, | |
| "grad_norm": 0.1767578125, | |
| "learning_rate": 0.0015233892321270964, | |
| "loss": 5.6299, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 5.5359296798706055, | |
| "eval_runtime": 7.1034, | |
| "eval_samples_per_second": 70.388, | |
| "eval_steps_per_second": 1.126, | |
| "step": 54384 | |
| }, | |
| { | |
| "epoch": 12.02559576345984, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 0.0015189761694616064, | |
| "loss": 5.6219, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 12.135922330097088, | |
| "grad_norm": 0.162109375, | |
| "learning_rate": 0.0015145631067961165, | |
| "loss": 5.6342, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 12.246248896734334, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 0.0015101500441306267, | |
| "loss": 5.629, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 12.35657546337158, | |
| "grad_norm": 0.1865234375, | |
| "learning_rate": 0.001505736981465137, | |
| "loss": 5.6198, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 12.466902030008827, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 0.001501323918799647, | |
| "loss": 5.6308, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 12.577228596646073, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 0.0014969108561341572, | |
| "loss": 5.6205, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 12.687555163283319, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 0.0014924977934686673, | |
| "loss": 5.6075, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 12.797881729920565, | |
| "grad_norm": 0.1708984375, | |
| "learning_rate": 0.0014880847308031775, | |
| "loss": 5.6294, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 12.908208296557811, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 0.0014836716681376875, | |
| "loss": 5.6225, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 5.531345844268799, | |
| "eval_runtime": 6.7386, | |
| "eval_samples_per_second": 74.2, | |
| "eval_steps_per_second": 1.187, | |
| "step": 58916 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 226600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.800284808610376e+17, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |