shorecode committed on
Commit 5ece4f1 · verified · 1 Parent(s): 5fe7fe3

Upload folder using huggingface_hub

checkpoint-3000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ccb64efc319292d7f82dc423867462a94434241d26e813849458ef1210fcb156
+oid sha256:0fad6569af2a2233c0f59d09763e1a2336b9da42ae7e6cb1c6635bec33ae2db5
 size 124642443
checkpoint-3000/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:50e8a01a31f2514a1d0c186f083d72ffd11c4fd26ab26b74d8853781437876c9
+oid sha256:c020ec47e8581acc86642c7a901886880ed0e21a67b6866aa6e46c3ba3177530
 size 62314258
checkpoint-3000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:47f551f352c3582aab145c6f54f30ffbeee0b864e47c66aae4b8bf65aa86b3ab
+oid sha256:a6d46a044feeb5332c0f8084263bfb9e9fd6fe7cb244071d0de9b4cbe3ae3208
 size 14645
checkpoint-3000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:99c0caf2011a7cb0034062f49b1c20f2067d88b910ef9cf32d24c7e9ddd08314
+oid sha256:68a6419333f014840fbe18337f422d0d59ab125d87109c828c6b1ef65f210f17
 size 1465
checkpoint-3000/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.32485110990795885,
+  "epoch": 0.6091370558375635,
   "eval_steps": 500,
   "global_step": 3000,
   "is_hyper_param_search": false,
@@ -10,140 +10,140 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.02707092582566324,
-      "grad_norm": 0.9713733196258545,
-      "learning_rate": 9.910124526258799e-05,
-      "loss": 3.4945,
+      "epoch": 0.050761421319796954,
+      "grad_norm": 0.7555294632911682,
+      "learning_rate": 9.831472081218275e-05,
+      "loss": 3.185,
       "step": 250
     },
     {
-      "epoch": 0.05414185165132648,
-      "grad_norm": 0.9521434307098389,
-      "learning_rate": 9.819888106839921e-05,
-      "loss": 3.5097,
+      "epoch": 0.10152284263959391,
+      "grad_norm": 0.7812637090682983,
+      "learning_rate": 9.662267343485618e-05,
+      "loss": 3.1833,
       "step": 500
     },
     {
-      "epoch": 0.05414185165132648,
-      "eval_loss": 2.95473575592041,
-      "eval_runtime": 87.829,
-      "eval_samples_per_second": 112.15,
-      "eval_steps_per_second": 7.014,
+      "epoch": 0.10152284263959391,
+      "eval_loss": 2.7107906341552734,
+      "eval_runtime": 88.982,
+      "eval_samples_per_second": 110.697,
+      "eval_steps_per_second": 3.697,
       "step": 500
     },
     {
-      "epoch": 0.08121277747698971,
-      "grad_norm": 0.9334385395050049,
-      "learning_rate": 9.729651687421044e-05,
-      "loss": 3.4922,
+      "epoch": 0.15228426395939088,
+      "grad_norm": 0.7523223161697388,
+      "learning_rate": 9.493062605752962e-05,
+      "loss": 3.1846,
       "step": 750
     },
     {
-      "epoch": 0.10828370330265295,
-      "grad_norm": 1.2239603996276855,
-      "learning_rate": 9.639415268002166e-05,
-      "loss": 3.477,
+      "epoch": 0.20304568527918782,
+      "grad_norm": 0.820755124092102,
+      "learning_rate": 9.323857868020304e-05,
+      "loss": 3.1779,
       "step": 1000
     },
     {
-      "epoch": 0.10828370330265295,
-      "eval_loss": 2.9319565296173096,
-      "eval_runtime": 87.8964,
-      "eval_samples_per_second": 112.064,
-      "eval_steps_per_second": 7.008,
+      "epoch": 0.20304568527918782,
+      "eval_loss": 2.701174020767212,
+      "eval_runtime": 88.9362,
+      "eval_samples_per_second": 110.754,
+      "eval_steps_per_second": 3.699,
       "step": 1000
     },
     {
-      "epoch": 0.1353546291283162,
-      "grad_norm": 1.1727226972579956,
-      "learning_rate": 9.549178848583288e-05,
-      "loss": 3.493,
+      "epoch": 0.25380710659898476,
+      "grad_norm": 0.7443365454673767,
+      "learning_rate": 9.154653130287648e-05,
+      "loss": 3.1669,
       "step": 1250
     },
     {
-      "epoch": 0.16242555495397942,
-      "grad_norm": 0.966973602771759,
-      "learning_rate": 9.458942429164411e-05,
-      "loss": 3.5013,
+      "epoch": 0.30456852791878175,
+      "grad_norm": 0.7510855197906494,
+      "learning_rate": 8.985448392554991e-05,
+      "loss": 3.163,
       "step": 1500
     },
     {
-      "epoch": 0.16242555495397942,
-      "eval_loss": 2.9275825023651123,
-      "eval_runtime": 87.899,
-      "eval_samples_per_second": 112.06,
-      "eval_steps_per_second": 7.008,
+      "epoch": 0.30456852791878175,
+      "eval_loss": 2.6923437118530273,
+      "eval_runtime": 88.8708,
+      "eval_samples_per_second": 110.835,
+      "eval_steps_per_second": 3.702,
       "step": 1500
     },
     {
-      "epoch": 0.18949648077964265,
-      "grad_norm": 0.9813922047615051,
-      "learning_rate": 9.368706009745533e-05,
-      "loss": 3.4907,
+      "epoch": 0.3553299492385787,
+      "grad_norm": 0.7907871007919312,
+      "learning_rate": 8.816243654822337e-05,
+      "loss": 3.1668,
       "step": 1750
     },
     {
-      "epoch": 0.2165674066053059,
-      "grad_norm": 1.027085542678833,
-      "learning_rate": 9.278469590326656e-05,
-      "loss": 3.4588,
+      "epoch": 0.40609137055837563,
+      "grad_norm": 0.8167365193367004,
+      "learning_rate": 8.647038917089679e-05,
+      "loss": 3.1533,
       "step": 2000
     },
     {
-      "epoch": 0.2165674066053059,
-      "eval_loss": 2.9044992923736572,
-      "eval_runtime": 87.8776,
-      "eval_samples_per_second": 112.088,
-      "eval_steps_per_second": 7.01,
+      "epoch": 0.40609137055837563,
+      "eval_loss": 2.68522310256958,
+      "eval_runtime": 88.8442,
+      "eval_samples_per_second": 110.868,
+      "eval_steps_per_second": 3.703,
       "step": 2000
     },
     {
-      "epoch": 0.24363833243096913,
-      "grad_norm": 1.0764214992523193,
-      "learning_rate": 9.188233170907778e-05,
-      "loss": 3.4531,
+      "epoch": 0.45685279187817257,
+      "grad_norm": 0.7124233841896057,
+      "learning_rate": 8.477834179357022e-05,
+      "loss": 3.157,
       "step": 2250
     },
     {
-      "epoch": 0.2707092582566324,
-      "grad_norm": 1.0297119617462158,
-      "learning_rate": 9.0979967514889e-05,
-      "loss": 3.4445,
+      "epoch": 0.5076142131979695,
+      "grad_norm": 0.7829596400260925,
+      "learning_rate": 8.308629441624366e-05,
+      "loss": 3.1582,
       "step": 2500
     },
     {
-      "epoch": 0.2707092582566324,
-      "eval_loss": 2.8855738639831543,
-      "eval_runtime": 87.8728,
-      "eval_samples_per_second": 112.094,
-      "eval_steps_per_second": 7.01,
+      "epoch": 0.5076142131979695,
+      "eval_loss": 2.681734561920166,
+      "eval_runtime": 88.8737,
+      "eval_samples_per_second": 110.831,
+      "eval_steps_per_second": 3.702,
       "step": 2500
     },
     {
-      "epoch": 0.2977801840822956,
-      "grad_norm": 0.9697523713111877,
-      "learning_rate": 9.007760332070024e-05,
-      "loss": 3.4349,
+      "epoch": 0.5583756345177665,
+      "grad_norm": 0.7784757018089294,
+      "learning_rate": 8.139424703891709e-05,
+      "loss": 3.1581,
       "step": 2750
     },
     {
-      "epoch": 0.32485110990795885,
-      "grad_norm": 0.9611329436302185,
-      "learning_rate": 8.917523912651147e-05,
-      "loss": 3.4213,
+      "epoch": 0.6091370558375635,
+      "grad_norm": 0.7557055354118347,
+      "learning_rate": 7.970219966159053e-05,
+      "loss": 3.157,
       "step": 3000
     },
     {
-      "epoch": 0.32485110990795885,
-      "eval_loss": 2.8725759983062744,
-      "eval_runtime": 87.9054,
-      "eval_samples_per_second": 112.052,
-      "eval_steps_per_second": 7.008,
+      "epoch": 0.6091370558375635,
+      "eval_loss": 2.6753175258636475,
+      "eval_runtime": 89.0244,
+      "eval_samples_per_second": 110.644,
+      "eval_steps_per_second": 3.696,
       "step": 3000
     }
   ],
   "logging_steps": 250,
-  "max_steps": 27705,
+  "max_steps": 14775,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 3,
   "save_steps": 1000,
@@ -159,8 +159,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 1083162230784000.0,
-  "train_batch_size": 16,
+  "total_flos": 2030929182720000.0,
+  "train_batch_size": 30,
   "trial_name": null,
   "trial_params": null
 }
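Note: the changed "epoch" and "max_steps" values are consistent with the larger "train_batch_size" (16 → 30) over roughly the same training set. A minimal sketch of that arithmetic, assuming a fixed-size dataset and no gradient accumulation (assumptions on my part, not stated in this commit):

num_train_epochs = 3  # from trainer_state.json

def implied_examples(max_steps, train_batch_size):
    # steps per epoch * batch size ~= number of training examples
    return (max_steps / num_train_epochs) * train_batch_size

print(implied_examples(27705, 16))  # old run: ~147,760 examples
print(implied_examples(14775, 30))  # new run: ~147,750 examples

Under the same assumptions, step 3000 lands at epoch 3000 / (27705/3) ≈ 0.3249 in the old run and 3000 / (14775/3) ≈ 0.6091 in the new one, matching the logged values above.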
checkpoint-3000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e66348775f5c0c4e602e276cfabd86d6b08e208f328b80af2d3902749f42fb99
+oid sha256:c1e3d67aa3e387cf228af7dde653bfe60b869b8342460aae15f94e2d51478f92
 size 6033