{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9991836734693877,
  "eval_steps": 500,
  "global_step": 153,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006530612244897959,
      "grad_norm": 0.6802010536193848,
      "learning_rate": 4.999472998758978e-05,
      "loss": 0.7016,
      "num_input_tokens_seen": 2097152,
      "step": 1
    },
    {
      "epoch": 0.013061224489795919,
      "grad_norm": 0.6286860704421997,
      "learning_rate": 4.99789221722016e-05,
      "loss": 0.6549,
      "num_input_tokens_seen": 4194304,
      "step": 2
    },
    {
      "epoch": 0.019591836734693877,
      "grad_norm": 0.5965273976325989,
      "learning_rate": 4.995258321842611e-05,
      "loss": 0.6414,
      "num_input_tokens_seen": 6291456,
      "step": 3
    },
    {
      "epoch": 0.026122448979591838,
      "grad_norm": 0.513691782951355,
      "learning_rate": 4.991572423079236e-05,
      "loss": 0.6061,
      "num_input_tokens_seen": 8388608,
      "step": 4
    },
    {
      "epoch": 0.0326530612244898,
      "grad_norm": 0.5384091138839722,
      "learning_rate": 4.986836074908616e-05,
      "loss": 0.6026,
      "num_input_tokens_seen": 10485760,
      "step": 5
    },
    {
      "epoch": 0.03918367346938775,
      "grad_norm": 0.48317599296569824,
      "learning_rate": 4.98105127417984e-05,
      "loss": 0.5596,
      "num_input_tokens_seen": 12582912,
      "step": 6
    },
    {
      "epoch": 0.045714285714285714,
      "grad_norm": 0.35217732191085815,
      "learning_rate": 4.974220459770639e-05,
      "loss": 0.557,
      "num_input_tokens_seen": 14680064,
      "step": 7
    },
    {
      "epoch": 0.052244897959183675,
      "grad_norm": 0.15410029888153076,
      "learning_rate": 4.966346511559149e-05,
      "loss": 0.5416,
      "num_input_tokens_seen": 16777216,
      "step": 8
    },
    {
      "epoch": 0.05877551020408163,
      "grad_norm": 0.10962219536304474,
      "learning_rate": 4.957432749209755e-05,
      "loss": 0.5424,
      "num_input_tokens_seen": 18874368,
      "step": 9
    },
    {
      "epoch": 0.0653061224489796,
      "grad_norm": 0.0843987911939621,
      "learning_rate": 4.9474829307735115e-05,
      "loss": 0.5242,
      "num_input_tokens_seen": 20971520,
      "step": 10
    },
    {
      "epoch": 0.07183673469387755,
      "grad_norm": 0.06282058358192444,
      "learning_rate": 4.9365012511037514e-05,
      "loss": 0.5195,
      "num_input_tokens_seen": 23068672,
      "step": 11
    },
    {
      "epoch": 0.0783673469387755,
      "grad_norm": 0.05317516624927521,
      "learning_rate": 4.9244923400875245e-05,
      "loss": 0.5259,
      "num_input_tokens_seen": 25165824,
      "step": 12
    },
    {
      "epoch": 0.08489795918367347,
      "grad_norm": 0.04503787308931351,
      "learning_rate": 4.911461260693638e-05,
      "loss": 0.521,
      "num_input_tokens_seen": 27262976,
      "step": 13
    },
    {
      "epoch": 0.09142857142857143,
      "grad_norm": 0.040327247232198715,
      "learning_rate": 4.8974135068381036e-05,
      "loss": 0.5269,
      "num_input_tokens_seen": 29360128,
      "step": 14
    },
    {
      "epoch": 0.09795918367346938,
      "grad_norm": 0.03843770548701286,
      "learning_rate": 4.882355001067892e-05,
      "loss": 0.5208,
      "num_input_tokens_seen": 31457280,
      "step": 15
    },
    {
      "epoch": 0.10448979591836735,
      "grad_norm": 0.03380432352423668,
      "learning_rate": 4.8662920920639866e-05,
      "loss": 0.5193,
      "num_input_tokens_seen": 33554432,
      "step": 16
    },
    {
      "epoch": 0.1110204081632653,
      "grad_norm": 0.031558409333229065,
      "learning_rate": 4.849231551964771e-05,
      "loss": 0.4995,
      "num_input_tokens_seen": 35651584,
      "step": 17
    },
    {
      "epoch": 0.11755102040816326,
      "grad_norm": 0.031164532527327538,
      "learning_rate": 4.8311805735108894e-05,
      "loss": 0.5277,
      "num_input_tokens_seen": 37748736,
      "step": 18
    },
    {
      "epoch": 0.12408163265306123,
      "grad_norm": 0.030629999935626984,
      "learning_rate": 4.81214676701278e-05,
      "loss": 0.5187,
      "num_input_tokens_seen": 39845888,
      "step": 19
    },
    {
      "epoch": 0.1306122448979592,
      "grad_norm": 0.028932394459843636,
      "learning_rate": 4.792138157142158e-05,
      "loss": 0.5322,
      "num_input_tokens_seen": 41943040,
      "step": 20
    },
    {
      "epoch": 0.13714285714285715,
      "grad_norm": 0.027270587161183357,
      "learning_rate": 4.7711631795488096e-05,
      "loss": 0.4978,
      "num_input_tokens_seen": 44040192,
      "step": 21
    },
    {
      "epoch": 0.1436734693877551,
      "grad_norm": 0.027742592617869377,
      "learning_rate": 4.749230677304114e-05,
      "loss": 0.5253,
      "num_input_tokens_seen": 46137344,
      "step": 22
    },
    {
      "epoch": 0.15020408163265306,
      "grad_norm": 0.027034137398004532,
      "learning_rate": 4.726349897172791e-05,
      "loss": 0.5309,
      "num_input_tokens_seen": 48234496,
      "step": 23
    },
    {
      "epoch": 0.156734693877551,
      "grad_norm": 0.026147669181227684,
      "learning_rate": 4.702530485714461e-05,
      "loss": 0.5197,
      "num_input_tokens_seen": 50331648,
      "step": 24
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 0.025035962462425232,
      "learning_rate": 4.677782485216644e-05,
      "loss": 0.5115,
      "num_input_tokens_seen": 52428800,
      "step": 25
    },
    {
      "epoch": 0.16979591836734695,
      "grad_norm": 0.024556027725338936,
      "learning_rate": 4.6521163294609196e-05,
      "loss": 0.4952,
      "num_input_tokens_seen": 54525952,
      "step": 26
    },
    {
      "epoch": 0.1763265306122449,
      "grad_norm": 0.02424173429608345,
      "learning_rate": 4.625542839324036e-05,
      "loss": 0.4781,
      "num_input_tokens_seen": 56623104,
      "step": 27
    },
    {
      "epoch": 0.18285714285714286,
      "grad_norm": 0.02580653876066208,
      "learning_rate": 4.598073218215817e-05,
      "loss": 0.5067,
      "num_input_tokens_seen": 58720256,
      "step": 28
    },
    {
      "epoch": 0.1893877551020408,
      "grad_norm": 0.024303199723362923,
      "learning_rate": 4.5697190473557946e-05,
      "loss": 0.514,
      "num_input_tokens_seen": 60817408,
      "step": 29
    },
    {
      "epoch": 0.19591836734693877,
      "grad_norm": 0.02427365817129612,
      "learning_rate": 4.540492280890555e-05,
      "loss": 0.5062,
      "num_input_tokens_seen": 62914560,
      "step": 30
    },
    {
      "epoch": 0.20244897959183675,
      "grad_norm": 0.023720987141132355,
      "learning_rate": 4.510405240853854e-05,
      "loss": 0.494,
      "num_input_tokens_seen": 65011712,
      "step": 31
    },
    {
      "epoch": 0.2089795918367347,
      "grad_norm": 0.0251515731215477,
      "learning_rate": 4.4794706119716455e-05,
      "loss": 0.5056,
      "num_input_tokens_seen": 67108864,
      "step": 32
    },
    {
      "epoch": 0.21551020408163266,
      "grad_norm": 0.022940408438444138,
      "learning_rate": 4.447701436314176e-05,
      "loss": 0.5128,
      "num_input_tokens_seen": 69206016,
      "step": 33
    },
    {
      "epoch": 0.2220408163265306,
      "grad_norm": 0.023887069895863533,
      "learning_rate": 4.415111107797445e-05,
      "loss": 0.5024,
      "num_input_tokens_seen": 71303168,
      "step": 34
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 0.022622637450695038,
      "learning_rate": 4.381713366536311e-05,
      "loss": 0.4998,
      "num_input_tokens_seen": 73400320,
      "step": 35
    },
    {
      "epoch": 0.23510204081632652,
      "grad_norm": 0.02263909950852394,
      "learning_rate": 4.347522293051648e-05,
      "loss": 0.5222,
      "num_input_tokens_seen": 75497472,
      "step": 36
    },
    {
      "epoch": 0.2416326530612245,
      "grad_norm": 0.023353304713964462,
      "learning_rate": 4.312552302333982e-05,
      "loss": 0.501,
      "num_input_tokens_seen": 77594624,
      "step": 37
    },
    {
      "epoch": 0.24816326530612245,
      "grad_norm": 0.02225712686777115,
      "learning_rate": 4.276818137766118e-05,
      "loss": 0.5035,
      "num_input_tokens_seen": 79691776,
      "step": 38
    },
    {
      "epoch": 0.2546938775510204,
      "grad_norm": 0.022769000381231308,
      "learning_rate": 4.2403348649073174e-05,
      "loss": 0.5115,
      "num_input_tokens_seen": 81788928,
      "step": 39
    },
    {
      "epoch": 0.2612244897959184,
      "grad_norm": 0.0222884863615036,
      "learning_rate": 4.203117865141635e-05,
      "loss": 0.5087,
      "num_input_tokens_seen": 83886080,
      "step": 40
    },
    {
      "epoch": 0.2677551020408163,
      "grad_norm": 0.02289729192852974,
      "learning_rate": 4.1651828291931264e-05,
      "loss": 0.5095,
      "num_input_tokens_seen": 85983232,
      "step": 41
    },
    {
      "epoch": 0.2742857142857143,
      "grad_norm": 0.023385386914014816,
      "learning_rate": 4.126545750510605e-05,
      "loss": 0.5033,
      "num_input_tokens_seen": 88080384,
      "step": 42
    },
    {
      "epoch": 0.2808163265306122,
      "grad_norm": 0.02286476455628872,
      "learning_rate": 4.0872229185248075e-05,
      "loss": 0.5245,
      "num_input_tokens_seen": 90177536,
      "step": 43
    },
    {
      "epoch": 0.2873469387755102,
      "grad_norm": 0.02207457832992077,
      "learning_rate": 4.047230911780737e-05,
      "loss": 0.5041,
      "num_input_tokens_seen": 92274688,
      "step": 44
    },
    {
      "epoch": 0.2938775510204082,
      "grad_norm": 0.021405808627605438,
      "learning_rate": 4.0065865909481417e-05,
      "loss": 0.4929,
      "num_input_tokens_seen": 94371840,
      "step": 45
    },
    {
      "epoch": 0.3004081632653061,
      "grad_norm": 0.022242728620767593,
      "learning_rate": 3.965307091713037e-05,
      "loss": 0.4939,
      "num_input_tokens_seen": 96468992,
      "step": 46
    },
    {
      "epoch": 0.3069387755102041,
      "grad_norm": 0.02234838157892227,
      "learning_rate": 3.923409817553284e-05,
      "loss": 0.5008,
      "num_input_tokens_seen": 98566144,
      "step": 47
    },
    {
      "epoch": 0.313469387755102,
      "grad_norm": 0.020911742001771927,
      "learning_rate": 3.880912432401265e-05,
      "loss": 0.5023,
      "num_input_tokens_seen": 100663296,
      "step": 48
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.02207380160689354,
      "learning_rate": 3.837832853196751e-05,
      "loss": 0.5124,
      "num_input_tokens_seen": 102760448,
      "step": 49
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 0.021706702187657356,
      "learning_rate": 3.794189242333106e-05,
      "loss": 0.5033,
      "num_input_tokens_seen": 104857600,
      "step": 50
    },
    {
      "epoch": 0.3330612244897959,
      "grad_norm": 0.02188958041369915,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.4958,
      "num_input_tokens_seen": 106954752,
      "step": 51
    },
    {
      "epoch": 0.3395918367346939,
      "grad_norm": 0.02165570855140686,
      "learning_rate": 3.705283756425872e-05,
      "loss": 0.5006,
      "num_input_tokens_seen": 109051904,
      "step": 52
    },
    {
      "epoch": 0.3461224489795918,
      "grad_norm": 0.02152332104742527,
      "learning_rate": 3.6600593640234086e-05,
      "loss": 0.5033,
      "num_input_tokens_seen": 111149056,
      "step": 53
    },
    {
      "epoch": 0.3526530612244898,
      "grad_norm": 0.02067963406443596,
      "learning_rate": 3.6143458894413465e-05,
      "loss": 0.5038,
      "num_input_tokens_seen": 113246208,
      "step": 54
    },
    {
      "epoch": 0.35918367346938773,
      "grad_norm": 0.02125193364918232,
      "learning_rate": 3.568162605525953e-05,
      "loss": 0.4973,
      "num_input_tokens_seen": 115343360,
      "step": 55
    },
    {
      "epoch": 0.3657142857142857,
      "grad_norm": 0.020537488162517548,
      "learning_rate": 3.5215289831955786e-05,
      "loss": 0.495,
      "num_input_tokens_seen": 117440512,
      "step": 56
    },
    {
      "epoch": 0.3722448979591837,
      "grad_norm": 0.021631518378853798,
      "learning_rate": 3.474464683231698e-05,
      "loss": 0.4971,
      "num_input_tokens_seen": 119537664,
      "step": 57
    },
    {
      "epoch": 0.3787755102040816,
      "grad_norm": 0.023077189922332764,
      "learning_rate": 3.426989547989902e-05,
      "loss": 0.5412,
      "num_input_tokens_seen": 121634816,
      "step": 58
    },
    {
      "epoch": 0.3853061224489796,
      "grad_norm": 0.02074201963841915,
      "learning_rate": 3.379123593034342e-05,
      "loss": 0.5219,
      "num_input_tokens_seen": 123731968,
      "step": 59
    },
    {
      "epoch": 0.39183673469387753,
      "grad_norm": 0.021061761304736137,
      "learning_rate": 3.330886998699149e-05,
      "loss": 0.4989,
      "num_input_tokens_seen": 125829120,
      "step": 60
    },
    {
      "epoch": 0.3983673469387755,
      "grad_norm": 0.021866271272301674,
      "learning_rate": 3.282300101580386e-05,
      "loss": 0.5122,
      "num_input_tokens_seen": 127926272,
      "step": 61
    },
    {
      "epoch": 0.4048979591836735,
      "grad_norm": 0.020894547924399376,
      "learning_rate": 3.2333833859621153e-05,
      "loss": 0.4987,
      "num_input_tokens_seen": 130023424,
      "step": 62
    },
    {
      "epoch": 0.4114285714285714,
      "grad_norm": 0.020912354812026024,
      "learning_rate": 3.1841574751802076e-05,
      "loss": 0.4968,
      "num_input_tokens_seen": 132120576,
      "step": 63
    },
    {
      "epoch": 0.4179591836734694,
      "grad_norm": 0.02170964516699314,
      "learning_rate": 3.13464312292752e-05,
      "loss": 0.499,
      "num_input_tokens_seen": 134217728,
      "step": 64
    },
    {
      "epoch": 0.42448979591836733,
      "grad_norm": 0.020900549367070198,
      "learning_rate": 3.084861204504122e-05,
      "loss": 0.5055,
      "num_input_tokens_seen": 136314880,
      "step": 65
    },
    {
      "epoch": 0.4310204081632653,
      "grad_norm": 0.020687857642769814,
      "learning_rate": 3.0348327080162435e-05,
      "loss": 0.5014,
      "num_input_tokens_seen": 138412032,
      "step": 66
    },
    {
      "epoch": 0.43755102040816324,
      "grad_norm": 0.02071143500506878,
      "learning_rate": 2.9845787255276753e-05,
      "loss": 0.4892,
      "num_input_tokens_seen": 140509184,
      "step": 67
    },
    {
      "epoch": 0.4440816326530612,
      "grad_norm": 0.021139299497008324,
      "learning_rate": 2.9341204441673266e-05,
      "loss": 0.5069,
      "num_input_tokens_seen": 142606336,
      "step": 68
    },
    {
      "epoch": 0.4506122448979592,
      "grad_norm": 0.020430322736501694,
      "learning_rate": 2.8834791371967142e-05,
      "loss": 0.4988,
      "num_input_tokens_seen": 144703488,
      "step": 69
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 0.020096473395824432,
      "learning_rate": 2.8326761550411345e-05,
      "loss": 0.4771,
      "num_input_tokens_seen": 146800640,
      "step": 70
    },
    {
      "epoch": 0.4636734693877551,
      "grad_norm": 0.020946258679032326,
      "learning_rate": 2.781732916288303e-05,
      "loss": 0.5036,
      "num_input_tokens_seen": 148897792,
      "step": 71
    },
    {
      "epoch": 0.47020408163265304,
      "grad_norm": 0.019651729613542557,
      "learning_rate": 2.7306708986582553e-05,
      "loss": 0.499,
      "num_input_tokens_seen": 150994944,
      "step": 72
    },
    {
      "epoch": 0.476734693877551,
      "grad_norm": 0.020175347104668617,
      "learning_rate": 2.679511629948319e-05,
      "loss": 0.5067,
      "num_input_tokens_seen": 153092096,
      "step": 73
    },
    {
      "epoch": 0.483265306122449,
      "grad_norm": 0.02037941850721836,
      "learning_rate": 2.628276678956974e-05,
      "loss": 0.5015,
      "num_input_tokens_seen": 155189248,
      "step": 74
    },
    {
      "epoch": 0.4897959183673469,
      "grad_norm": 0.020525282248854637,
      "learning_rate": 2.5769876463904265e-05,
      "loss": 0.504,
      "num_input_tokens_seen": 157286400,
      "step": 75
    },
    {
      "epoch": 0.4963265306122449,
      "grad_norm": 0.02100261114537716,
      "learning_rate": 2.5256661557557247e-05,
      "loss": 0.513,
      "num_input_tokens_seen": 159383552,
      "step": 76
    },
    {
      "epoch": 0.5028571428571429,
      "grad_norm": 0.02081277035176754,
      "learning_rate": 2.4743338442442755e-05,
      "loss": 0.5093,
      "num_input_tokens_seen": 161480704,
      "step": 77
    },
    {
      "epoch": 0.5093877551020408,
      "grad_norm": 0.01962732896208763,
      "learning_rate": 2.4230123536095748e-05,
      "loss": 0.475,
      "num_input_tokens_seen": 163577856,
      "step": 78
    },
    {
      "epoch": 0.5159183673469387,
      "grad_norm": 0.020476974546909332,
      "learning_rate": 2.3717233210430256e-05,
      "loss": 0.4827,
      "num_input_tokens_seen": 165675008,
      "step": 79
    },
    {
      "epoch": 0.5224489795918368,
      "grad_norm": 0.02092832699418068,
      "learning_rate": 2.3204883700516812e-05,
      "loss": 0.5234,
      "num_input_tokens_seen": 167772160,
      "step": 80
    },
    {
      "epoch": 0.5289795918367347,
      "grad_norm": 0.020366327837109566,
      "learning_rate": 2.2693291013417453e-05,
      "loss": 0.5204,
      "num_input_tokens_seen": 169869312,
      "step": 81
    },
    {
      "epoch": 0.5355102040816326,
      "grad_norm": 0.020487571135163307,
      "learning_rate": 2.2182670837116975e-05,
      "loss": 0.4886,
      "num_input_tokens_seen": 171966464,
      "step": 82
    },
    {
      "epoch": 0.5420408163265306,
      "grad_norm": 0.01977311633527279,
      "learning_rate": 2.1673238449588668e-05,
      "loss": 0.5005,
      "num_input_tokens_seen": 174063616,
      "step": 83
    },
    {
      "epoch": 0.5485714285714286,
      "grad_norm": 0.01979038491845131,
      "learning_rate": 2.116520862803286e-05,
      "loss": 0.4832,
      "num_input_tokens_seen": 176160768,
      "step": 84
    },
    {
      "epoch": 0.5551020408163265,
      "grad_norm": 0.020210135728120804,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 0.5021,
      "num_input_tokens_seen": 178257920,
      "step": 85
    },
    {
      "epoch": 0.5616326530612245,
      "grad_norm": 0.020240608602762222,
      "learning_rate": 2.015421274472325e-05,
      "loss": 0.4839,
      "num_input_tokens_seen": 180355072,
      "step": 86
    },
    {
      "epoch": 0.5681632653061225,
      "grad_norm": 0.020403753966093063,
      "learning_rate": 1.965167291983757e-05,
      "loss": 0.5078,
      "num_input_tokens_seen": 182452224,
      "step": 87
    },
    {
      "epoch": 0.5746938775510204,
      "grad_norm": 0.020527003332972527,
      "learning_rate": 1.9151387954958794e-05,
      "loss": 0.4981,
      "num_input_tokens_seen": 184549376,
      "step": 88
    },
    {
      "epoch": 0.5812244897959183,
      "grad_norm": 0.02056964300572872,
      "learning_rate": 1.8653568770724806e-05,
      "loss": 0.4913,
      "num_input_tokens_seen": 186646528,
      "step": 89
    },
    {
      "epoch": 0.5877551020408164,
      "grad_norm": 0.019625499844551086,
      "learning_rate": 1.815842524819793e-05,
      "loss": 0.4862,
      "num_input_tokens_seen": 188743680,
      "step": 90
    },
    {
      "epoch": 0.5942857142857143,
      "grad_norm": 0.02040235698223114,
      "learning_rate": 1.7666166140378852e-05,
      "loss": 0.5146,
      "num_input_tokens_seen": 190840832,
      "step": 91
    },
    {
      "epoch": 0.6008163265306122,
      "grad_norm": 0.02010432258248329,
      "learning_rate": 1.7176998984196146e-05,
      "loss": 0.4952,
      "num_input_tokens_seen": 192937984,
      "step": 92
    },
    {
      "epoch": 0.6073469387755102,
      "grad_norm": 0.01999637298285961,
      "learning_rate": 1.6691130013008514e-05,
      "loss": 0.4845,
      "num_input_tokens_seen": 195035136,
      "step": 93
    },
    {
      "epoch": 0.6138775510204082,
      "grad_norm": 0.019539078697562218,
      "learning_rate": 1.620876406965658e-05,
      "loss": 0.4881,
      "num_input_tokens_seen": 197132288,
      "step": 94
    },
    {
      "epoch": 0.6204081632653061,
      "grad_norm": 0.020548058673739433,
      "learning_rate": 1.5730104520100982e-05,
      "loss": 0.498,
      "num_input_tokens_seen": 199229440,
      "step": 95
    },
    {
      "epoch": 0.626938775510204,
      "grad_norm": 0.020467426627874374,
      "learning_rate": 1.5255353167683017e-05,
      "loss": 0.4989,
      "num_input_tokens_seen": 201326592,
      "step": 96
    },
    {
      "epoch": 0.6334693877551021,
      "grad_norm": 0.019751951098442078,
      "learning_rate": 1.4784710168044213e-05,
      "loss": 0.4941,
      "num_input_tokens_seen": 203423744,
      "step": 97
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.01974237710237503,
      "learning_rate": 1.4318373944740484e-05,
      "loss": 0.4919,
      "num_input_tokens_seen": 205520896,
      "step": 98
    },
    {
      "epoch": 0.6465306122448979,
      "grad_norm": 0.019516095519065857,
      "learning_rate": 1.3856541105586545e-05,
      "loss": 0.5047,
      "num_input_tokens_seen": 207618048,
      "step": 99
    },
    {
      "epoch": 0.6530612244897959,
      "grad_norm": 0.01965564861893654,
      "learning_rate": 1.339940635976592e-05,
      "loss": 0.5104,
      "num_input_tokens_seen": 209715200,
      "step": 100
    },
    {
      "epoch": 0.6595918367346939,
      "grad_norm": 0.019674964249134064,
      "learning_rate": 1.2947162435741278e-05,
      "loss": 0.5008,
      "num_input_tokens_seen": 211812352,
      "step": 101
    },
    {
      "epoch": 0.6661224489795918,
      "grad_norm": 0.019747601822018623,
      "learning_rate": 1.2500000000000006e-05,
      "loss": 0.5022,
      "num_input_tokens_seen": 213909504,
      "step": 102
    },
    {
      "epoch": 0.6726530612244898,
      "grad_norm": 0.0198560431599617,
      "learning_rate": 1.205810757666894e-05,
      "loss": 0.4979,
      "num_input_tokens_seen": 216006656,
      "step": 103
    },
    {
      "epoch": 0.6791836734693878,
      "grad_norm": 0.020007461309432983,
      "learning_rate": 1.1621671468032493e-05,
      "loss": 0.4972,
      "num_input_tokens_seen": 218103808,
      "step": 104
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 0.019895223900675774,
      "learning_rate": 1.1190875675987356e-05,
      "loss": 0.5058,
      "num_input_tokens_seen": 220200960,
      "step": 105
    },
    {
      "epoch": 0.6922448979591836,
      "grad_norm": 0.019599556922912598,
      "learning_rate": 1.0765901824467167e-05,
      "loss": 0.4935,
      "num_input_tokens_seen": 222298112,
      "step": 106
    },
    {
      "epoch": 0.6987755102040817,
      "grad_norm": 0.019807366654276848,
      "learning_rate": 1.0346929082869641e-05,
      "loss": 0.5041,
      "num_input_tokens_seen": 224395264,
      "step": 107
    },
    {
      "epoch": 0.7053061224489796,
      "grad_norm": 0.019651690497994423,
      "learning_rate": 9.934134090518593e-06,
      "loss": 0.5066,
      "num_input_tokens_seen": 226492416,
      "step": 108
    },
    {
      "epoch": 0.7118367346938775,
      "grad_norm": 0.020175212994217873,
      "learning_rate": 9.527690882192636e-06,
      "loss": 0.5181,
      "num_input_tokens_seen": 228589568,
      "step": 109
    },
    {
      "epoch": 0.7183673469387755,
      "grad_norm": 0.021520880982279778,
      "learning_rate": 9.127770814751933e-06,
      "loss": 0.4877,
      "num_input_tokens_seen": 230686720,
      "step": 110
    },
    {
      "epoch": 0.7248979591836735,
      "grad_norm": 0.02041519619524479,
      "learning_rate": 8.734542494893955e-06,
      "loss": 0.4886,
      "num_input_tokens_seen": 232783872,
      "step": 111
    },
    {
      "epoch": 0.7314285714285714,
      "grad_norm": 0.020032085478305817,
      "learning_rate": 8.348171708068747e-06,
      "loss": 0.4986,
      "num_input_tokens_seen": 234881024,
      "step": 112
    },
    {
      "epoch": 0.7379591836734694,
      "grad_norm": 0.01945379003882408,
      "learning_rate": 7.968821348583644e-06,
      "loss": 0.4892,
      "num_input_tokens_seen": 236978176,
      "step": 113
    },
    {
      "epoch": 0.7444897959183674,
      "grad_norm": 0.019653445109725,
      "learning_rate": 7.5966513509268365e-06,
      "loss": 0.5048,
      "num_input_tokens_seen": 239075328,
      "step": 114
    },
    {
      "epoch": 0.7510204081632653,
      "grad_norm": 0.020067734643816948,
      "learning_rate": 7.231818622338823e-06,
      "loss": 0.4907,
      "num_input_tokens_seen": 241172480,
      "step": 115
    },
    {
      "epoch": 0.7575510204081632,
      "grad_norm": 0.0194945577532053,
      "learning_rate": 6.8744769766601854e-06,
      "loss": 0.5074,
      "num_input_tokens_seen": 243269632,
      "step": 116
    },
    {
      "epoch": 0.7640816326530612,
      "grad_norm": 0.019672313705086708,
      "learning_rate": 6.524777069483526e-06,
      "loss": 0.4954,
      "num_input_tokens_seen": 245366784,
      "step": 117
    },
    {
      "epoch": 0.7706122448979592,
      "grad_norm": 0.019437000155448914,
      "learning_rate": 6.182866334636889e-06,
      "loss": 0.5036,
      "num_input_tokens_seen": 247463936,
      "step": 118
    },
    {
      "epoch": 0.7771428571428571,
      "grad_norm": 0.020035067573189735,
      "learning_rate": 5.848888922025553e-06,
      "loss": 0.5133,
      "num_input_tokens_seen": 249561088,
      "step": 119
    },
    {
      "epoch": 0.7836734693877551,
      "grad_norm": 0.019359605386853218,
      "learning_rate": 5.522985636858239e-06,
      "loss": 0.4965,
      "num_input_tokens_seen": 251658240,
      "step": 120
    },
    {
      "epoch": 0.7902040816326531,
      "grad_norm": 0.01978667639195919,
      "learning_rate": 5.205293880283552e-06,
      "loss": 0.5068,
      "num_input_tokens_seen": 253755392,
      "step": 121
    },
    {
      "epoch": 0.796734693877551,
      "grad_norm": 0.019405698403716087,
      "learning_rate": 4.8959475914614554e-06,
      "loss": 0.4993,
      "num_input_tokens_seen": 255852544,
      "step": 122
    },
    {
      "epoch": 0.803265306122449,
      "grad_norm": 0.01977471075952053,
      "learning_rate": 4.5950771910944605e-06,
      "loss": 0.5173,
      "num_input_tokens_seen": 257949696,
      "step": 123
    },
    {
      "epoch": 0.809795918367347,
      "grad_norm": 0.01949562318623066,
      "learning_rate": 4.3028095264420535e-06,
      "loss": 0.5032,
      "num_input_tokens_seen": 260046848,
      "step": 124
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.019873222336173058,
      "learning_rate": 4.019267817841835e-06,
      "loss": 0.505,
      "num_input_tokens_seen": 262144000,
      "step": 125
    },
    {
      "epoch": 0.8228571428571428,
      "grad_norm": 0.019852887839078903,
      "learning_rate": 3.7445716067596503e-06,
      "loss": 0.5145,
      "num_input_tokens_seen": 264241152,
      "step": 126
    },
    {
      "epoch": 0.8293877551020408,
      "grad_norm": 0.019747605547308922,
      "learning_rate": 3.478836705390809e-06,
      "loss": 0.4967,
      "num_input_tokens_seen": 266338304,
      "step": 127
    },
    {
      "epoch": 0.8359183673469388,
      "grad_norm": 0.019296282902359962,
      "learning_rate": 3.222175147833556e-06,
      "loss": 0.5001,
      "num_input_tokens_seen": 268435456,
      "step": 128
    },
    {
      "epoch": 0.8424489795918367,
      "grad_norm": 0.020198358222842216,
      "learning_rate": 2.974695142855388e-06,
      "loss": 0.5037,
      "num_input_tokens_seen": 270532608,
      "step": 129
    },
    {
      "epoch": 0.8489795918367347,
      "grad_norm": 0.020021170377731323,
      "learning_rate": 2.7365010282720952e-06,
      "loss": 0.5238,
      "num_input_tokens_seen": 272629760,
      "step": 130
    },
    {
      "epoch": 0.8555102040816327,
      "grad_norm": 0.019700711593031883,
      "learning_rate": 2.507693226958871e-06,
      "loss": 0.5007,
      "num_input_tokens_seen": 274726912,
      "step": 131
    },
    {
      "epoch": 0.8620408163265306,
      "grad_norm": 0.019315095618367195,
      "learning_rate": 2.2883682045119063e-06,
      "loss": 0.5119,
      "num_input_tokens_seen": 276824064,
      "step": 132
    },
    {
      "epoch": 0.8685714285714285,
      "grad_norm": 0.02047501504421234,
      "learning_rate": 2.0786184285784297e-06,
      "loss": 0.4874,
      "num_input_tokens_seen": 278921216,
      "step": 133
    },
    {
      "epoch": 0.8751020408163265,
      "grad_norm": 0.019602535292506218,
      "learning_rate": 1.8785323298722097e-06,
      "loss": 0.4969,
      "num_input_tokens_seen": 281018368,
      "step": 134
    },
    {
      "epoch": 0.8816326530612245,
      "grad_norm": 0.019447464495897293,
      "learning_rate": 1.6881942648911076e-06,
      "loss": 0.4835,
      "num_input_tokens_seen": 283115520,
      "step": 135
    },
    {
      "epoch": 0.8881632653061224,
      "grad_norm": 0.0193500854074955,
      "learning_rate": 1.5076844803522922e-06,
      "loss": 0.4863,
      "num_input_tokens_seen": 285212672,
      "step": 136
    },
    {
      "epoch": 0.8946938775510204,
      "grad_norm": 0.019578518345952034,
      "learning_rate": 1.3370790793601373e-06,
      "loss": 0.5046,
      "num_input_tokens_seen": 287309824,
      "step": 137
    },
    {
      "epoch": 0.9012244897959184,
      "grad_norm": 0.020458122715353966,
      "learning_rate": 1.1764499893210878e-06,
      "loss": 0.5018,
      "num_input_tokens_seen": 289406976,
      "step": 138
    },
    {
      "epoch": 0.9077551020408163,
      "grad_norm": 0.019856926053762436,
      "learning_rate": 1.0258649316189722e-06,
      "loss": 0.5025,
      "num_input_tokens_seen": 291504128,
      "step": 139
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 0.019992457702755928,
      "learning_rate": 8.85387393063622e-07,
      "loss": 0.5139,
      "num_input_tokens_seen": 293601280,
      "step": 140
    },
    {
      "epoch": 0.9208163265306123,
      "grad_norm": 0.019166303798556328,
      "learning_rate": 7.550765991247654e-07,
      "loss": 0.484,
      "num_input_tokens_seen": 295698432,
      "step": 141
    },
    {
      "epoch": 0.9273469387755102,
      "grad_norm": 0.019759617745876312,
      "learning_rate": 6.349874889624962e-07,
      "loss": 0.5127,
      "num_input_tokens_seen": 297795584,
      "step": 142
    },
    {
      "epoch": 0.9338775510204081,
      "grad_norm": 0.019604109227657318,
      "learning_rate": 5.25170692264887e-07,
      "loss": 0.4876,
      "num_input_tokens_seen": 299892736,
      "step": 143
    },
    {
      "epoch": 0.9404081632653061,
      "grad_norm": 0.019114624708890915,
      "learning_rate": 4.256725079024554e-07,
      "loss": 0.4925,
      "num_input_tokens_seen": 301989888,
      "step": 144
    },
    {
      "epoch": 0.9469387755102041,
      "grad_norm": 0.019426511600613594,
      "learning_rate": 3.3653488440851255e-07,
      "loss": 0.4861,
      "num_input_tokens_seen": 304087040,
      "step": 145
    },
    {
      "epoch": 0.953469387755102,
      "grad_norm": 0.019297398626804352,
      "learning_rate": 2.5779540229361745e-07,
      "loss": 0.493,
      "num_input_tokens_seen": 306184192,
      "step": 146
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.01964535564184189,
      "learning_rate": 1.8948725820160662e-07,
      "loss": 0.5109,
      "num_input_tokens_seen": 308281344,
      "step": 147
    },
    {
      "epoch": 0.966530612244898,
      "grad_norm": 0.019757628440856934,
      "learning_rate": 1.3163925091384533e-07,
      "loss": 0.5182,
      "num_input_tokens_seen": 310378496,
      "step": 148
    },
    {
      "epoch": 0.9730612244897959,
      "grad_norm": 0.019987070932984352,
      "learning_rate": 8.427576920763958e-08,
      "loss": 0.4902,
      "num_input_tokens_seen": 312475648,
      "step": 149
    },
    {
      "epoch": 0.9795918367346939,
      "grad_norm": 0.020403273403644562,
      "learning_rate": 4.741678157389739e-08,
      "loss": 0.5019,
      "num_input_tokens_seen": 314572800,
      "step": 150
    },
    {
      "epoch": 0.9861224489795918,
      "grad_norm": 0.0199234988540411,
      "learning_rate": 2.1077827798404726e-08,
      "loss": 0.5081,
      "num_input_tokens_seen": 316669952,
      "step": 151
    },
    {
      "epoch": 0.9926530612244898,
      "grad_norm": 0.019740723073482513,
      "learning_rate": 5.270012410216185e-09,
      "loss": 0.4872,
      "num_input_tokens_seen": 318767104,
      "step": 152
    },
    {
      "epoch": 0.9991836734693877,
      "grad_norm": 0.01960798352956772,
      "learning_rate": 0.0,
      "loss": 0.4965,
      "num_input_tokens_seen": 320864256,
      "step": 153
    },
    {
      "epoch": 0.9991836734693877,
      "num_input_tokens_seen": 320864256,
      "step": 153,
      "total_flos": 1.2496254746971079e+19,
      "train_loss": 0.5089508540490094,
      "train_runtime": 7110.2315,
      "train_samples_per_second": 11.018,
      "train_steps_per_second": 0.022
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 153,
  "num_input_tokens_seen": 320864256,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2496254746971079e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}