bimabk commited on
Commit
e4841eb
·
verified ·
1 Parent(s): 9de0b3d

Upload task output 1

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "v_proj",
 
33
  "q_proj",
34
  "gate_proj",
35
  "k_proj",
36
- "up_proj",
37
- "o_proj",
38
- "down_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "down_proj",
33
  "v_proj",
34
+ "up_proj",
35
  "q_proj",
36
  "gate_proj",
37
  "k_proj",
38
+ "o_proj"
 
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:403177e212d7f18549db2283c6d6f0b585a5c79befe6edc03ade708c3e0cc82b
3
  size 323014168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2c2ddd97d678c463706fa230da0e93fa52db448900cc8f2fe481cbee6fc08c8
3
  size 323014168
loss.txt CHANGED
@@ -1 +1 @@
1
- 1,no_eval
 
1
+ 31,no_eval
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4e-05,
6
  "eval_steps": 500,
7
- "global_step": 1,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -16,36 +16,1026 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 488.0,
20
- "completions/max_terminated_length": 488.0,
21
- "completions/mean_length": 94.015625,
22
- "completions/mean_terminated_length": 94.015625,
23
- "completions/min_length": 3.0,
24
- "completions/min_terminated_length": 3.0,
25
- "entropy": 0.7336700148880482,
26
  "epoch": 4e-05,
27
- "frac_reward_zero_std": 0.0,
28
- "grad_norm": 1.1460554599761963,
29
  "kl": 0.0,
30
  "learning_rate": 0.0,
31
- "loss": -0.0334,
32
- "num_tokens": 91761.0,
33
- "reward": -0.9690441489219666,
34
- "reward_std": 0.3482987880706787,
35
- "rewards/rollout_reward_func/mean": -0.9690441489219666,
36
- "rewards/rollout_reward_func/std": 0.42848339676856995,
37
- "sampling/importance_sampling_ratio/max": 1.3604308366775513,
38
- "sampling/importance_sampling_ratio/mean": 0.9847421646118164,
39
- "sampling/importance_sampling_ratio/min": 0.5550056099891663,
40
- "sampling/sampling_logp_difference/max": 0.5354118347167969,
41
- "sampling/sampling_logp_difference/mean": 0.05978238210082054,
42
  "step": 1,
43
- "step_time": 15.418727782999895
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  }
45
  ],
46
  "logging_steps": 1.0,
47
- "max_steps": 1800,
48
- "num_input_tokens_seen": 91761,
49
  "num_train_epochs": 1,
50
  "save_steps": 500,
51
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.00124,
6
  "eval_steps": 500,
7
+ "global_step": 31,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 114.0,
20
+ "completions/max_terminated_length": 114.0,
21
+ "completions/mean_length": 98.921875,
22
+ "completions/mean_terminated_length": 98.921875,
23
+ "completions/min_length": 58.0,
24
+ "completions/min_terminated_length": 58.0,
25
+ "entropy": 0.19646543450653553,
26
  "epoch": 4e-05,
27
+ "frac_reward_zero_std": 0.125,
28
+ "grad_norm": 0.9278627038002014,
29
  "kl": 0.0,
30
  "learning_rate": 0.0,
31
+ "loss": -0.0037,
32
+ "num_tokens": 34199.0,
33
+ "reward": 10.018266677856445,
34
+ "reward_std": 4.776409149169922,
35
+ "rewards/rollout_reward_func/mean": 10.018266677856445,
36
+ "rewards/rollout_reward_func/std": 7.215184688568115,
37
+ "sampling/importance_sampling_ratio/max": 1.2804653644561768,
38
+ "sampling/importance_sampling_ratio/mean": 0.9914791584014893,
39
+ "sampling/importance_sampling_ratio/min": 0.6299315094947815,
40
+ "sampling/sampling_logp_difference/max": 0.30363547801971436,
41
+ "sampling/sampling_logp_difference/mean": 0.01853932812809944,
42
  "step": 1,
43
+ "step_time": 8.847447882999631
44
+ },
45
+ {
46
+ "clip_ratio/high_max": 0.0,
47
+ "clip_ratio/high_mean": 0.0,
48
+ "clip_ratio/low_mean": 0.0,
49
+ "clip_ratio/low_min": 0.0,
50
+ "clip_ratio/region_mean": 0.0,
51
+ "completions/clipped_ratio": 0.0,
52
+ "completions/max_length": 117.0,
53
+ "completions/max_terminated_length": 117.0,
54
+ "completions/mean_length": 82.109375,
55
+ "completions/mean_terminated_length": 82.109375,
56
+ "completions/min_length": 2.0,
57
+ "completions/min_terminated_length": 2.0,
58
+ "entropy": 0.13141211355105042,
59
+ "epoch": 8e-05,
60
+ "frac_reward_zero_std": 0.125,
61
+ "grad_norm": 0.6150323748588562,
62
+ "kl": 0.0,
63
+ "learning_rate": 2.8571428571428575e-07,
64
+ "loss": 0.0052,
65
+ "num_tokens": 67966.0,
66
+ "reward": 13.53847885131836,
67
+ "reward_std": 4.694334030151367,
68
+ "rewards/rollout_reward_func/mean": 13.538479804992676,
69
+ "rewards/rollout_reward_func/std": 7.1933274269104,
70
+ "sampling/importance_sampling_ratio/max": 1.1734346151351929,
71
+ "sampling/importance_sampling_ratio/mean": 0.9927313923835754,
72
+ "sampling/importance_sampling_ratio/min": 0.4750845432281494,
73
+ "sampling/sampling_logp_difference/max": 0.5278338193893433,
74
+ "sampling/sampling_logp_difference/mean": 0.0108323460444808,
75
+ "step": 2,
76
+ "step_time": 7.127139926000382
77
+ },
78
+ {
79
+ "clip_ratio/high_max": 0.0,
80
+ "clip_ratio/high_mean": 0.0,
81
+ "clip_ratio/low_mean": 0.0,
82
+ "clip_ratio/low_min": 0.0,
83
+ "clip_ratio/region_mean": 0.0,
84
+ "completions/clipped_ratio": 0.0,
85
+ "completions/max_length": 116.0,
86
+ "completions/max_terminated_length": 116.0,
87
+ "completions/mean_length": 86.8125,
88
+ "completions/mean_terminated_length": 86.8125,
89
+ "completions/min_length": 2.0,
90
+ "completions/min_terminated_length": 2.0,
91
+ "entropy": 0.12198319449089468,
92
+ "epoch": 0.00012,
93
+ "frac_reward_zero_std": 0.375,
94
+ "grad_norm": 1.4374709129333496,
95
+ "kl": 0.0009402413852512836,
96
+ "learning_rate": 5.714285714285715e-07,
97
+ "loss": 0.0066,
98
+ "num_tokens": 100354.0,
99
+ "reward": 10.038440704345703,
100
+ "reward_std": 2.5273919105529785,
101
+ "rewards/rollout_reward_func/mean": 10.038440704345703,
102
+ "rewards/rollout_reward_func/std": 3.4852232933044434,
103
+ "sampling/importance_sampling_ratio/max": 1.9403773546218872,
104
+ "sampling/importance_sampling_ratio/mean": 1.0181429386138916,
105
+ "sampling/importance_sampling_ratio/min": 0.7186898589134216,
106
+ "sampling/sampling_logp_difference/max": 0.6623420715332031,
107
+ "sampling/sampling_logp_difference/mean": 0.012397471815347672,
108
+ "step": 3,
109
+ "step_time": 9.22867403300097
110
+ },
111
+ {
112
+ "clip_ratio/high_max": 0.0,
113
+ "clip_ratio/high_mean": 0.0,
114
+ "clip_ratio/low_mean": 0.0,
115
+ "clip_ratio/low_min": 0.0,
116
+ "clip_ratio/region_mean": 0.0,
117
+ "completions/clipped_ratio": 0.0,
118
+ "completions/max_length": 114.0,
119
+ "completions/max_terminated_length": 114.0,
120
+ "completions/mean_length": 98.03125,
121
+ "completions/mean_terminated_length": 98.03125,
122
+ "completions/min_length": 2.0,
123
+ "completions/min_terminated_length": 2.0,
124
+ "entropy": 0.19211739487946033,
125
+ "epoch": 0.00016,
126
+ "frac_reward_zero_std": 0.25,
127
+ "grad_norm": 0.9476256370544434,
128
+ "kl": 0.0006379662081599236,
129
+ "learning_rate": 8.571428571428572e-07,
130
+ "loss": -0.0098,
131
+ "num_tokens": 129880.0,
132
+ "reward": 10.634136199951172,
133
+ "reward_std": 2.9668378829956055,
134
+ "rewards/rollout_reward_func/mean": 10.634136199951172,
135
+ "rewards/rollout_reward_func/std": 6.936125755310059,
136
+ "sampling/importance_sampling_ratio/max": 1.2051615715026855,
137
+ "sampling/importance_sampling_ratio/mean": 0.9807850122451782,
138
+ "sampling/importance_sampling_ratio/min": 0.2036220282316208,
139
+ "sampling/sampling_logp_difference/max": 1.0556471347808838,
140
+ "sampling/sampling_logp_difference/mean": 0.01921888440847397,
141
+ "step": 4,
142
+ "step_time": 7.034282748000123
143
+ },
144
+ {
145
+ "clip_ratio/high_max": 0.03125,
146
+ "clip_ratio/high_mean": 0.0078125,
147
+ "clip_ratio/low_mean": 0.0,
148
+ "clip_ratio/low_min": 0.0,
149
+ "clip_ratio/region_mean": 0.0078125,
150
+ "completions/clipped_ratio": 0.0,
151
+ "completions/max_length": 117.0,
152
+ "completions/max_terminated_length": 117.0,
153
+ "completions/mean_length": 100.96875,
154
+ "completions/mean_terminated_length": 100.96875,
155
+ "completions/min_length": 58.0,
156
+ "completions/min_terminated_length": 58.0,
157
+ "entropy": 0.17289281217381358,
158
+ "epoch": 0.0002,
159
+ "frac_reward_zero_std": 0.25,
160
+ "grad_norm": 0.8713013529777527,
161
+ "kl": 0.0023218162823468447,
162
+ "learning_rate": 1.142857142857143e-06,
163
+ "loss": 0.003,
164
+ "num_tokens": 163618.0,
165
+ "reward": 10.66733169555664,
166
+ "reward_std": 2.2034752368927,
167
+ "rewards/rollout_reward_func/mean": 10.66733169555664,
168
+ "rewards/rollout_reward_func/std": 7.9976725578308105,
169
+ "sampling/importance_sampling_ratio/max": 1.3269492387771606,
170
+ "sampling/importance_sampling_ratio/mean": 1.0104732513427734,
171
+ "sampling/importance_sampling_ratio/min": 0.8934441208839417,
172
+ "sampling/sampling_logp_difference/max": 0.23215103149414062,
173
+ "sampling/sampling_logp_difference/mean": 0.011650541797280312,
174
+ "step": 5,
175
+ "step_time": 7.114355061999959
176
+ },
177
+ {
178
+ "clip_ratio/high_max": 0.0,
179
+ "clip_ratio/high_mean": 0.0,
180
+ "clip_ratio/low_mean": 0.00390625,
181
+ "clip_ratio/low_min": 0.0,
182
+ "clip_ratio/region_mean": 0.00390625,
183
+ "completions/clipped_ratio": 0.0,
184
+ "completions/max_length": 110.0,
185
+ "completions/max_terminated_length": 110.0,
186
+ "completions/mean_length": 97.421875,
187
+ "completions/mean_terminated_length": 97.421875,
188
+ "completions/min_length": 2.0,
189
+ "completions/min_terminated_length": 2.0,
190
+ "entropy": 0.1717861178331077,
191
+ "epoch": 0.00024,
192
+ "frac_reward_zero_std": 0.125,
193
+ "grad_norm": 0.8476853966712952,
194
+ "kl": 0.0003589589614421129,
195
+ "learning_rate": 1.4285714285714286e-06,
196
+ "loss": 0.0079,
197
+ "num_tokens": 194293.0,
198
+ "reward": 11.867555618286133,
199
+ "reward_std": 3.1243598461151123,
200
+ "rewards/rollout_reward_func/mean": 11.867554664611816,
201
+ "rewards/rollout_reward_func/std": 6.9736504554748535,
202
+ "sampling/importance_sampling_ratio/max": 1.1688475608825684,
203
+ "sampling/importance_sampling_ratio/mean": 1.007383108139038,
204
+ "sampling/importance_sampling_ratio/min": 0.8644587397575378,
205
+ "sampling/sampling_logp_difference/max": 0.15558338165283203,
206
+ "sampling/sampling_logp_difference/mean": 0.009581982158124447,
207
+ "step": 6,
208
+ "step_time": 8.641884654999558
209
+ },
210
+ {
211
+ "clip_ratio/high_max": 0.0,
212
+ "clip_ratio/high_mean": 0.0,
213
+ "clip_ratio/low_mean": 0.0,
214
+ "clip_ratio/low_min": 0.0,
215
+ "clip_ratio/region_mean": 0.0,
216
+ "completions/clipped_ratio": 0.0,
217
+ "completions/max_length": 116.0,
218
+ "completions/max_terminated_length": 116.0,
219
+ "completions/mean_length": 102.59375,
220
+ "completions/mean_terminated_length": 102.59375,
221
+ "completions/min_length": 58.0,
222
+ "completions/min_terminated_length": 58.0,
223
+ "entropy": 0.1727504450827837,
224
+ "epoch": 0.00028,
225
+ "frac_reward_zero_std": 0.25,
226
+ "grad_norm": 0.8516405820846558,
227
+ "kl": 0.001040005125105381,
228
+ "learning_rate": 1.7142857142857145e-06,
229
+ "loss": 0.0032,
230
+ "num_tokens": 228451.0,
231
+ "reward": 10.84666919708252,
232
+ "reward_std": 2.81697416305542,
233
+ "rewards/rollout_reward_func/mean": 10.84666919708252,
234
+ "rewards/rollout_reward_func/std": 3.9264276027679443,
235
+ "sampling/importance_sampling_ratio/max": 1.192152738571167,
236
+ "sampling/importance_sampling_ratio/mean": 0.9886180758476257,
237
+ "sampling/importance_sampling_ratio/min": 0.7800420522689819,
238
+ "sampling/sampling_logp_difference/max": 0.24933236837387085,
239
+ "sampling/sampling_logp_difference/mean": 0.015093531459569931,
240
+ "step": 7,
241
+ "step_time": 7.0255837680001605
242
+ },
243
+ {
244
+ "clip_ratio/high_max": 0.0,
245
+ "clip_ratio/high_mean": 0.0,
246
+ "clip_ratio/low_mean": 0.0,
247
+ "clip_ratio/low_min": 0.0,
248
+ "clip_ratio/region_mean": 0.0,
249
+ "completions/clipped_ratio": 0.0,
250
+ "completions/max_length": 116.0,
251
+ "completions/max_terminated_length": 116.0,
252
+ "completions/mean_length": 103.84375,
253
+ "completions/mean_terminated_length": 103.84375,
254
+ "completions/min_length": 94.0,
255
+ "completions/min_terminated_length": 94.0,
256
+ "entropy": 0.16474777180701494,
257
+ "epoch": 0.00032,
258
+ "frac_reward_zero_std": 0.5,
259
+ "grad_norm": 0.7940111756324768,
260
+ "kl": 0.0005032288372603944,
261
+ "learning_rate": 2.0000000000000003e-06,
262
+ "loss": 0.0009,
263
+ "num_tokens": 263877.0,
264
+ "reward": 10.964729309082031,
265
+ "reward_std": 1.7662436962127686,
266
+ "rewards/rollout_reward_func/mean": 10.964729309082031,
267
+ "rewards/rollout_reward_func/std": 7.242088317871094,
268
+ "sampling/importance_sampling_ratio/max": 1.2055360078811646,
269
+ "sampling/importance_sampling_ratio/mean": 1.004713773727417,
270
+ "sampling/importance_sampling_ratio/min": 0.7339034676551819,
271
+ "sampling/sampling_logp_difference/max": 0.3093966245651245,
272
+ "sampling/sampling_logp_difference/mean": 0.012640302069485188,
273
+ "step": 8,
274
+ "step_time": 7.154970041000297
275
+ },
276
+ {
277
+ "clip_ratio/high_max": 0.0,
278
+ "clip_ratio/high_mean": 0.0,
279
+ "clip_ratio/low_mean": 0.00390625,
280
+ "clip_ratio/low_min": 0.0,
281
+ "clip_ratio/region_mean": 0.00390625,
282
+ "completions/clipped_ratio": 0.0,
283
+ "completions/max_length": 116.0,
284
+ "completions/max_terminated_length": 116.0,
285
+ "completions/mean_length": 105.921875,
286
+ "completions/mean_terminated_length": 105.921875,
287
+ "completions/min_length": 94.0,
288
+ "completions/min_terminated_length": 94.0,
289
+ "entropy": 0.1817057733424008,
290
+ "epoch": 0.00036,
291
+ "frac_reward_zero_std": 0.125,
292
+ "grad_norm": 0.7268296480178833,
293
+ "kl": 0.0013755811378359795,
294
+ "learning_rate": 2.285714285714286e-06,
295
+ "loss": -0.0026,
296
+ "num_tokens": 302532.0,
297
+ "reward": 12.797033309936523,
298
+ "reward_std": 3.91664457321167,
299
+ "rewards/rollout_reward_func/mean": 12.797033309936523,
300
+ "rewards/rollout_reward_func/std": 6.278445720672607,
301
+ "sampling/importance_sampling_ratio/max": 1.328028917312622,
302
+ "sampling/importance_sampling_ratio/mean": 0.9793672561645508,
303
+ "sampling/importance_sampling_ratio/min": 0.6404329538345337,
304
+ "sampling/sampling_logp_difference/max": 0.5215651988983154,
305
+ "sampling/sampling_logp_difference/mean": 0.020225321874022484,
306
+ "step": 9,
307
+ "step_time": 8.5138989149998
308
+ },
309
+ {
310
+ "clip_ratio/high_max": 0.03125,
311
+ "clip_ratio/high_mean": 0.0078125,
312
+ "clip_ratio/low_mean": 0.00390625,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.01171875,
315
+ "completions/clipped_ratio": 0.0,
316
+ "completions/max_length": 116.0,
317
+ "completions/max_terminated_length": 116.0,
318
+ "completions/mean_length": 87.90625,
319
+ "completions/mean_terminated_length": 87.90625,
320
+ "completions/min_length": 2.0,
321
+ "completions/min_terminated_length": 2.0,
322
+ "entropy": 0.1744129522703588,
323
+ "epoch": 0.0004,
324
+ "frac_reward_zero_std": 0.125,
325
+ "grad_norm": 0.6529544591903687,
326
+ "kl": 0.0011357483454048634,
327
+ "learning_rate": 2.571428571428571e-06,
328
+ "loss": 0.0154,
329
+ "num_tokens": 337382.0,
330
+ "reward": 9.921274185180664,
331
+ "reward_std": 3.3844058513641357,
332
+ "rewards/rollout_reward_func/mean": 9.921274185180664,
333
+ "rewards/rollout_reward_func/std": 6.207524299621582,
334
+ "sampling/importance_sampling_ratio/max": 1.2752257585525513,
335
+ "sampling/importance_sampling_ratio/mean": 1.0078442096710205,
336
+ "sampling/importance_sampling_ratio/min": 0.8506130576133728,
337
+ "sampling/sampling_logp_difference/max": 0.23299765586853027,
338
+ "sampling/sampling_logp_difference/mean": 0.010802164673805237,
339
+ "step": 10,
340
+ "step_time": 7.074561795000363
341
+ },
342
+ {
343
+ "clip_ratio/high_max": 0.0,
344
+ "clip_ratio/high_mean": 0.0,
345
+ "clip_ratio/low_mean": 0.0,
346
+ "clip_ratio/low_min": 0.0,
347
+ "clip_ratio/region_mean": 0.0,
348
+ "completions/clipped_ratio": 0.0,
349
+ "completions/max_length": 113.0,
350
+ "completions/max_terminated_length": 113.0,
351
+ "completions/mean_length": 102.5,
352
+ "completions/mean_terminated_length": 102.5,
353
+ "completions/min_length": 58.0,
354
+ "completions/min_terminated_length": 58.0,
355
+ "entropy": 0.1777965882793069,
356
+ "epoch": 0.00044,
357
+ "frac_reward_zero_std": 0.25,
358
+ "grad_norm": 0.9877777099609375,
359
+ "kl": 0.0009214265737682581,
360
+ "learning_rate": 2.8571428571428573e-06,
361
+ "loss": -0.0011,
362
+ "num_tokens": 373766.0,
363
+ "reward": 13.959955215454102,
364
+ "reward_std": 3.020989418029785,
365
+ "rewards/rollout_reward_func/mean": 13.959955215454102,
366
+ "rewards/rollout_reward_func/std": 8.485596656799316,
367
+ "sampling/importance_sampling_ratio/max": 1.161659836769104,
368
+ "sampling/importance_sampling_ratio/mean": 0.9922082424163818,
369
+ "sampling/importance_sampling_ratio/min": 0.8054305911064148,
370
+ "sampling/sampling_logp_difference/max": 0.21640020608901978,
371
+ "sampling/sampling_logp_difference/mean": 0.01080007292330265,
372
+ "step": 11,
373
+ "step_time": 7.412672027999633
374
+ },
375
+ {
376
+ "clip_ratio/high_max": 0.046875,
377
+ "clip_ratio/high_mean": 0.01171875,
378
+ "clip_ratio/low_mean": 0.0078125,
379
+ "clip_ratio/low_min": 0.0,
380
+ "clip_ratio/region_mean": 0.01953125,
381
+ "completions/clipped_ratio": 0.0,
382
+ "completions/max_length": 116.0,
383
+ "completions/max_terminated_length": 116.0,
384
+ "completions/mean_length": 99.0,
385
+ "completions/mean_terminated_length": 99.0,
386
+ "completions/min_length": 2.0,
387
+ "completions/min_terminated_length": 2.0,
388
+ "entropy": 0.18321187514811754,
389
+ "epoch": 0.00048,
390
+ "frac_reward_zero_std": 0.25,
391
+ "grad_norm": 0.7719992995262146,
392
+ "kl": 0.0012023542076349258,
393
+ "learning_rate": 3.142857142857143e-06,
394
+ "loss": 0.0098,
395
+ "num_tokens": 409802.0,
396
+ "reward": 13.237505912780762,
397
+ "reward_std": 3.283658027648926,
398
+ "rewards/rollout_reward_func/mean": 13.237504959106445,
399
+ "rewards/rollout_reward_func/std": 8.317092895507812,
400
+ "sampling/importance_sampling_ratio/max": 1.3050973415374756,
401
+ "sampling/importance_sampling_ratio/mean": 0.9854879379272461,
402
+ "sampling/importance_sampling_ratio/min": 0.6236510872840881,
403
+ "sampling/sampling_logp_difference/max": 0.47287511825561523,
404
+ "sampling/sampling_logp_difference/mean": 0.018457502126693726,
405
+ "step": 12,
406
+ "step_time": 7.962151656000515
407
+ },
408
+ {
409
+ "clip_ratio/high_max": 0.0,
410
+ "clip_ratio/high_mean": 0.0,
411
+ "clip_ratio/low_mean": 0.0,
412
+ "clip_ratio/low_min": 0.0,
413
+ "clip_ratio/region_mean": 0.0,
414
+ "completions/clipped_ratio": 0.0,
415
+ "completions/max_length": 113.0,
416
+ "completions/max_terminated_length": 113.0,
417
+ "completions/mean_length": 98.671875,
418
+ "completions/mean_terminated_length": 98.671875,
419
+ "completions/min_length": 58.0,
420
+ "completions/min_terminated_length": 58.0,
421
+ "entropy": 0.1837792107835412,
422
+ "epoch": 0.00052,
423
+ "frac_reward_zero_std": 0.0,
424
+ "grad_norm": 1.1921685934066772,
425
+ "kl": 0.0006314956117421389,
426
+ "learning_rate": 3.428571428571429e-06,
427
+ "loss": -0.0016,
428
+ "num_tokens": 443993.0,
429
+ "reward": 8.716323852539062,
430
+ "reward_std": 3.7696497440338135,
431
+ "rewards/rollout_reward_func/mean": 8.716324806213379,
432
+ "rewards/rollout_reward_func/std": 4.9213151931762695,
433
+ "sampling/importance_sampling_ratio/max": 1.2224400043487549,
434
+ "sampling/importance_sampling_ratio/mean": 1.0003582239151,
435
+ "sampling/importance_sampling_ratio/min": 0.703804075717926,
436
+ "sampling/sampling_logp_difference/max": 0.3478405475616455,
437
+ "sampling/sampling_logp_difference/mean": 0.013392799533903599,
438
+ "step": 13,
439
+ "step_time": 6.872026027999937
440
+ },
441
+ {
442
+ "clip_ratio/high_max": 0.0,
443
+ "clip_ratio/high_mean": 0.0,
444
+ "clip_ratio/low_mean": 0.0,
445
+ "clip_ratio/low_min": 0.0,
446
+ "clip_ratio/region_mean": 0.0,
447
+ "completions/clipped_ratio": 0.0,
448
+ "completions/max_length": 117.0,
449
+ "completions/max_terminated_length": 117.0,
450
+ "completions/mean_length": 87.078125,
451
+ "completions/mean_terminated_length": 87.078125,
452
+ "completions/min_length": 2.0,
453
+ "completions/min_terminated_length": 2.0,
454
+ "entropy": 0.1762648681178689,
455
+ "epoch": 0.00056,
456
+ "frac_reward_zero_std": 0.0,
457
+ "grad_norm": 0.8949439525604248,
458
+ "kl": 0.0012336352374404669,
459
+ "learning_rate": 3.7142857142857146e-06,
460
+ "loss": 0.0215,
461
+ "num_tokens": 477034.0,
462
+ "reward": 13.550655364990234,
463
+ "reward_std": 4.669343948364258,
464
+ "rewards/rollout_reward_func/mean": 13.550655364990234,
465
+ "rewards/rollout_reward_func/std": 7.081562042236328,
466
+ "sampling/importance_sampling_ratio/max": 1.2484861612319946,
467
+ "sampling/importance_sampling_ratio/mean": 0.9962727427482605,
468
+ "sampling/importance_sampling_ratio/min": 0.7459995746612549,
469
+ "sampling/sampling_logp_difference/max": 0.29502665996551514,
470
+ "sampling/sampling_logp_difference/mean": 0.011807022616267204,
471
+ "step": 14,
472
+ "step_time": 7.88232419100018
473
+ },
474
+ {
475
+ "clip_ratio/high_max": 0.0,
476
+ "clip_ratio/high_mean": 0.0,
477
+ "clip_ratio/low_mean": 0.00390625,
478
+ "clip_ratio/low_min": 0.0,
479
+ "clip_ratio/region_mean": 0.00390625,
480
+ "completions/clipped_ratio": 0.0,
481
+ "completions/max_length": 116.0,
482
+ "completions/max_terminated_length": 116.0,
483
+ "completions/mean_length": 98.84375,
484
+ "completions/mean_terminated_length": 98.84375,
485
+ "completions/min_length": 2.0,
486
+ "completions/min_terminated_length": 2.0,
487
+ "entropy": 0.14872624445706606,
488
+ "epoch": 0.0006,
489
+ "frac_reward_zero_std": 0.375,
490
+ "grad_norm": 0.5905139446258545,
491
+ "kl": 0.0012812165077775717,
492
+ "learning_rate": 4.000000000000001e-06,
493
+ "loss": 0.0049,
494
+ "num_tokens": 511160.0,
495
+ "reward": 10.958195686340332,
496
+ "reward_std": 3.023810386657715,
497
+ "rewards/rollout_reward_func/mean": 10.958196640014648,
498
+ "rewards/rollout_reward_func/std": 4.840097427368164,
499
+ "sampling/importance_sampling_ratio/max": 1.139298915863037,
500
+ "sampling/importance_sampling_ratio/mean": 0.9891129732131958,
501
+ "sampling/importance_sampling_ratio/min": 0.7860896587371826,
502
+ "sampling/sampling_logp_difference/max": 0.20429694652557373,
503
+ "sampling/sampling_logp_difference/mean": 0.009246795438230038,
504
+ "step": 15,
505
+ "step_time": 8.032791925999845
506
+ },
507
+ {
508
+ "clip_ratio/high_max": 0.0,
509
+ "clip_ratio/high_mean": 0.0,
510
+ "clip_ratio/low_mean": 0.0,
511
+ "clip_ratio/low_min": 0.0,
512
+ "clip_ratio/region_mean": 0.0,
513
+ "completions/clipped_ratio": 0.0,
514
+ "completions/max_length": 116.0,
515
+ "completions/max_terminated_length": 116.0,
516
+ "completions/mean_length": 99.53125,
517
+ "completions/mean_terminated_length": 99.53125,
518
+ "completions/min_length": 70.0,
519
+ "completions/min_terminated_length": 70.0,
520
+ "entropy": 0.1544574573636055,
521
+ "epoch": 0.00064,
522
+ "frac_reward_zero_std": 0.375,
523
+ "grad_norm": 1.1432291269302368,
524
+ "kl": 0.003647498415375594,
525
+ "learning_rate": 4.2857142857142855e-06,
526
+ "loss": 0.0286,
527
+ "num_tokens": 545066.0,
528
+ "reward": 9.331792831420898,
529
+ "reward_std": 1.93760085105896,
530
+ "rewards/rollout_reward_func/mean": 9.331792831420898,
531
+ "rewards/rollout_reward_func/std": 5.057403087615967,
532
+ "sampling/importance_sampling_ratio/max": 1.3562160730361938,
533
+ "sampling/importance_sampling_ratio/mean": 0.9893874526023865,
534
+ "sampling/importance_sampling_ratio/min": 0.5071713328361511,
535
+ "sampling/sampling_logp_difference/max": 0.40043067932128906,
536
+ "sampling/sampling_logp_difference/mean": 0.017478572204709053,
537
+ "step": 16,
538
+ "step_time": 7.066636556999811
539
+ },
540
+ {
541
+ "clip_ratio/high_max": 0.03125,
542
+ "clip_ratio/high_mean": 0.0078125,
543
+ "clip_ratio/low_mean": 0.00390625,
544
+ "clip_ratio/low_min": 0.0,
545
+ "clip_ratio/region_mean": 0.01171875,
546
+ "completions/clipped_ratio": 0.0,
547
+ "completions/max_length": 114.0,
548
+ "completions/max_terminated_length": 114.0,
549
+ "completions/mean_length": 99.1875,
550
+ "completions/mean_terminated_length": 99.1875,
551
+ "completions/min_length": 57.0,
552
+ "completions/min_terminated_length": 57.0,
553
+ "entropy": 0.21858551260083914,
554
+ "epoch": 0.00068,
555
+ "frac_reward_zero_std": 0.125,
556
+ "grad_norm": 1.1050941944122314,
557
+ "kl": 0.0031870862003415823,
558
+ "learning_rate": 4.571428571428572e-06,
559
+ "loss": 0.024,
560
+ "num_tokens": 580310.0,
561
+ "reward": 10.79472541809082,
562
+ "reward_std": 2.3240370750427246,
563
+ "rewards/rollout_reward_func/mean": 10.79472541809082,
564
+ "rewards/rollout_reward_func/std": 7.201944828033447,
565
+ "sampling/importance_sampling_ratio/max": 1.270713210105896,
566
+ "sampling/importance_sampling_ratio/mean": 1.0256067514419556,
567
+ "sampling/importance_sampling_ratio/min": 0.751465916633606,
568
+ "sampling/sampling_logp_difference/max": 0.30277013778686523,
569
+ "sampling/sampling_logp_difference/mean": 0.022086970508098602,
570
+ "step": 17,
571
+ "step_time": 8.46836156100062
572
+ },
573
+ {
574
+ "clip_ratio/high_max": 0.0,
575
+ "clip_ratio/high_mean": 0.0,
576
+ "clip_ratio/low_mean": 0.0,
577
+ "clip_ratio/low_min": 0.0,
578
+ "clip_ratio/region_mean": 0.0,
579
+ "completions/clipped_ratio": 0.0,
580
+ "completions/max_length": 116.0,
581
+ "completions/max_terminated_length": 116.0,
582
+ "completions/mean_length": 101.03125,
583
+ "completions/mean_terminated_length": 101.03125,
584
+ "completions/min_length": 2.0,
585
+ "completions/min_terminated_length": 2.0,
586
+ "entropy": 0.20830629393458366,
587
+ "epoch": 0.00072,
588
+ "frac_reward_zero_std": 0.125,
589
+ "grad_norm": 0.6307684183120728,
590
+ "kl": 0.0033635632134974003,
591
+ "learning_rate": 4.857142857142858e-06,
592
+ "loss": 0.0016,
593
+ "num_tokens": 616096.0,
594
+ "reward": 10.57010269165039,
595
+ "reward_std": 3.070500373840332,
596
+ "rewards/rollout_reward_func/mean": 10.570101737976074,
597
+ "rewards/rollout_reward_func/std": 6.5729169845581055,
598
+ "sampling/importance_sampling_ratio/max": 1.3546802997589111,
599
+ "sampling/importance_sampling_ratio/mean": 1.0241458415985107,
600
+ "sampling/importance_sampling_ratio/min": 0.6485600471496582,
601
+ "sampling/sampling_logp_difference/max": 0.388120174407959,
602
+ "sampling/sampling_logp_difference/mean": 0.028211820870637894,
603
+ "step": 18,
604
+ "step_time": 7.452459238999609
605
+ },
606
+ {
607
+ "clip_ratio/high_max": 0.0,
608
+ "clip_ratio/high_mean": 0.0,
609
+ "clip_ratio/low_mean": 0.0,
610
+ "clip_ratio/low_min": 0.0,
611
+ "clip_ratio/region_mean": 0.0,
612
+ "completions/clipped_ratio": 0.0,
613
+ "completions/max_length": 116.0,
614
+ "completions/max_terminated_length": 116.0,
615
+ "completions/mean_length": 99.75,
616
+ "completions/mean_terminated_length": 99.75,
617
+ "completions/min_length": 2.0,
618
+ "completions/min_terminated_length": 2.0,
619
+ "entropy": 0.175787306856364,
620
+ "epoch": 0.00076,
621
+ "frac_reward_zero_std": 0.0,
622
+ "grad_norm": 0.8829342722892761,
623
+ "kl": 0.01204019202850759,
624
+ "learning_rate": 5.142857142857142e-06,
625
+ "loss": -0.0198,
626
+ "num_tokens": 647564.0,
627
+ "reward": 10.796049118041992,
628
+ "reward_std": 3.542346477508545,
629
+ "rewards/rollout_reward_func/mean": 10.79604721069336,
630
+ "rewards/rollout_reward_func/std": 4.0647735595703125,
631
+ "sampling/importance_sampling_ratio/max": 1.428004503250122,
632
+ "sampling/importance_sampling_ratio/mean": 1.0033756494522095,
633
+ "sampling/importance_sampling_ratio/min": 0.6384609341621399,
634
+ "sampling/sampling_logp_difference/max": 0.39859604835510254,
635
+ "sampling/sampling_logp_difference/mean": 0.021462757140398026,
636
+ "step": 19,
637
+ "step_time": 6.987104802000204
638
+ },
639
+ {
640
+ "clip_ratio/high_max": 0.0,
641
+ "clip_ratio/high_mean": 0.0,
642
+ "clip_ratio/low_mean": 0.0,
643
+ "clip_ratio/low_min": 0.0,
644
+ "clip_ratio/region_mean": 0.0,
645
+ "completions/clipped_ratio": 0.0,
646
+ "completions/max_length": 116.0,
647
+ "completions/max_terminated_length": 116.0,
648
+ "completions/mean_length": 107.59375,
649
+ "completions/mean_terminated_length": 107.59375,
650
+ "completions/min_length": 94.0,
651
+ "completions/min_terminated_length": 94.0,
652
+ "entropy": 0.16672849422320724,
653
+ "epoch": 0.0008,
654
+ "frac_reward_zero_std": 0.25,
655
+ "grad_norm": 0.5178841352462769,
656
+ "kl": 0.008307450218126178,
657
+ "learning_rate": 5.428571428571429e-06,
658
+ "loss": 0.0045,
659
+ "num_tokens": 679866.0,
660
+ "reward": 14.327037811279297,
661
+ "reward_std": 2.3548567295074463,
662
+ "rewards/rollout_reward_func/mean": 14.327038764953613,
663
+ "rewards/rollout_reward_func/std": 6.473320007324219,
664
+ "sampling/importance_sampling_ratio/max": 1.1844276189804077,
665
+ "sampling/importance_sampling_ratio/mean": 1.0173743963241577,
666
+ "sampling/importance_sampling_ratio/min": 0.7388046979904175,
667
+ "sampling/sampling_logp_difference/max": 0.30275261402130127,
668
+ "sampling/sampling_logp_difference/mean": 0.01609945483505726,
669
+ "step": 20,
670
+ "step_time": 8.920992404999652
671
+ },
672
+ {
673
+ "clip_ratio/high_max": 0.015625,
674
+ "clip_ratio/high_mean": 0.00390625,
675
+ "clip_ratio/low_mean": 0.0,
676
+ "clip_ratio/low_min": 0.0,
677
+ "clip_ratio/region_mean": 0.00390625,
678
+ "completions/clipped_ratio": 0.0,
679
+ "completions/max_length": 116.0,
680
+ "completions/max_terminated_length": 116.0,
681
+ "completions/mean_length": 94.15625,
682
+ "completions/mean_terminated_length": 94.15625,
683
+ "completions/min_length": 2.0,
684
+ "completions/min_terminated_length": 2.0,
685
+ "entropy": 0.1697295242920518,
686
+ "epoch": 0.00084,
687
+ "frac_reward_zero_std": 0.25,
688
+ "grad_norm": 0.5032536387443542,
689
+ "kl": 0.005904986290261149,
690
+ "learning_rate": 5.7142857142857145e-06,
691
+ "loss": 0.0063,
692
+ "num_tokens": 712944.0,
693
+ "reward": 9.954044342041016,
694
+ "reward_std": 2.929586887359619,
695
+ "rewards/rollout_reward_func/mean": 9.954044342041016,
696
+ "rewards/rollout_reward_func/std": 3.8391432762145996,
697
+ "sampling/importance_sampling_ratio/max": 1.1809278726577759,
698
+ "sampling/importance_sampling_ratio/mean": 0.9926953911781311,
699
+ "sampling/importance_sampling_ratio/min": 0.671869158744812,
700
+ "sampling/sampling_logp_difference/max": 0.3325324058532715,
701
+ "sampling/sampling_logp_difference/mean": 0.019740980118513107,
702
+ "step": 21,
703
+ "step_time": 7.055920110000216
704
+ },
705
+ {
706
+ "clip_ratio/high_max": 0.015625,
707
+ "clip_ratio/high_mean": 0.00390625,
708
+ "clip_ratio/low_mean": 0.0,
709
+ "clip_ratio/low_min": 0.0,
710
+ "clip_ratio/region_mean": 0.00390625,
711
+ "completions/clipped_ratio": 0.0,
712
+ "completions/max_length": 120.0,
713
+ "completions/max_terminated_length": 120.0,
714
+ "completions/mean_length": 104.90625,
715
+ "completions/mean_terminated_length": 104.90625,
716
+ "completions/min_length": 94.0,
717
+ "completions/min_terminated_length": 94.0,
718
+ "entropy": 0.14952043676748872,
719
+ "epoch": 0.00088,
720
+ "frac_reward_zero_std": 0.5,
721
+ "grad_norm": 0.41113221645355225,
722
+ "kl": 0.020291190361604095,
723
+ "learning_rate": 6e-06,
724
+ "loss": -0.0074,
725
+ "num_tokens": 745842.0,
726
+ "reward": 15.216217041015625,
727
+ "reward_std": 2.0014686584472656,
728
+ "rewards/rollout_reward_func/mean": 15.216217041015625,
729
+ "rewards/rollout_reward_func/std": 7.341615676879883,
730
+ "sampling/importance_sampling_ratio/max": 1.4198538064956665,
731
+ "sampling/importance_sampling_ratio/mean": 1.0097235441207886,
732
+ "sampling/importance_sampling_ratio/min": 0.4881555140018463,
733
+ "sampling/sampling_logp_difference/max": 0.7177610397338867,
734
+ "sampling/sampling_logp_difference/mean": 0.0277442317456007,
735
+ "step": 22,
736
+ "step_time": 7.018612760999531
737
+ },
738
+ {
739
+ "clip_ratio/high_max": 0.0,
740
+ "clip_ratio/high_mean": 0.0,
741
+ "clip_ratio/low_mean": 0.0,
742
+ "clip_ratio/low_min": 0.0,
743
+ "clip_ratio/region_mean": 0.0,
744
+ "completions/clipped_ratio": 0.0,
745
+ "completions/max_length": 109.0,
746
+ "completions/max_terminated_length": 109.0,
747
+ "completions/mean_length": 97.75,
748
+ "completions/mean_terminated_length": 97.75,
749
+ "completions/min_length": 58.0,
750
+ "completions/min_terminated_length": 58.0,
751
+ "entropy": 0.14867904456332326,
752
+ "epoch": 0.00092,
753
+ "frac_reward_zero_std": 0.625,
754
+ "grad_norm": 0.4940281808376312,
755
+ "kl": 0.011669340077787638,
756
+ "learning_rate": 6.285714285714286e-06,
757
+ "loss": -0.0072,
758
+ "num_tokens": 777962.0,
759
+ "reward": 9.629616737365723,
760
+ "reward_std": 1.4242491722106934,
761
+ "rewards/rollout_reward_func/mean": 9.629616737365723,
762
+ "rewards/rollout_reward_func/std": 4.168231964111328,
763
+ "sampling/importance_sampling_ratio/max": 1.235112190246582,
764
+ "sampling/importance_sampling_ratio/mean": 0.9895117282867432,
765
+ "sampling/importance_sampling_ratio/min": 0.5506332516670227,
766
+ "sampling/sampling_logp_difference/max": 0.5967001914978027,
767
+ "sampling/sampling_logp_difference/mean": 0.02138374000787735,
768
+ "step": 23,
769
+ "step_time": 8.369276020999905
770
+ },
771
+ {
772
+ "clip_ratio/high_max": 0.0,
773
+ "clip_ratio/high_mean": 0.0,
774
+ "clip_ratio/low_mean": 0.0,
775
+ "clip_ratio/low_min": 0.0,
776
+ "clip_ratio/region_mean": 0.0,
777
+ "completions/clipped_ratio": 0.0,
778
+ "completions/max_length": 116.0,
779
+ "completions/max_terminated_length": 116.0,
780
+ "completions/mean_length": 105.8125,
781
+ "completions/mean_terminated_length": 105.8125,
782
+ "completions/min_length": 94.0,
783
+ "completions/min_terminated_length": 94.0,
784
+ "entropy": 0.153433071449399,
785
+ "epoch": 0.00096,
786
+ "frac_reward_zero_std": 0.625,
787
+ "grad_norm": 0.6142421960830688,
788
+ "kl": 0.036034643882885575,
789
+ "learning_rate": 6.571428571428572e-06,
790
+ "loss": -0.0105,
791
+ "num_tokens": 814758.0,
792
+ "reward": 14.580177307128906,
793
+ "reward_std": 1.8490748405456543,
794
+ "rewards/rollout_reward_func/mean": 14.580177307128906,
795
+ "rewards/rollout_reward_func/std": 7.987873554229736,
796
+ "sampling/importance_sampling_ratio/max": 1.822190284729004,
797
+ "sampling/importance_sampling_ratio/mean": 0.9637724161148071,
798
+ "sampling/importance_sampling_ratio/min": 0.504238486289978,
799
+ "sampling/sampling_logp_difference/max": 0.6850337982177734,
800
+ "sampling/sampling_logp_difference/mean": 0.02902819588780403,
801
+ "step": 24,
802
+ "step_time": 6.940408582999908
803
+ },
804
+ {
805
+ "clip_ratio/high_max": 0.0,
806
+ "clip_ratio/high_mean": 0.0,
807
+ "clip_ratio/low_mean": 0.0,
808
+ "clip_ratio/low_min": 0.0,
809
+ "clip_ratio/region_mean": 0.0,
810
+ "completions/clipped_ratio": 0.0,
811
+ "completions/max_length": 222.0,
812
+ "completions/max_terminated_length": 222.0,
813
+ "completions/mean_length": 164.84375,
814
+ "completions/mean_terminated_length": 164.84375,
815
+ "completions/min_length": 94.0,
816
+ "completions/min_terminated_length": 94.0,
817
+ "entropy": 0.16991846077144146,
818
+ "epoch": 0.001,
819
+ "frac_reward_zero_std": 0.125,
820
+ "grad_norm": 0.6987430453300476,
821
+ "kl": 0.07557606545742601,
822
+ "learning_rate": 6.857142857142858e-06,
823
+ "loss": -0.0195,
824
+ "num_tokens": 848064.0,
825
+ "reward": 15.993512153625488,
826
+ "reward_std": 3.809764862060547,
827
+ "rewards/rollout_reward_func/mean": 15.993511199951172,
828
+ "rewards/rollout_reward_func/std": 6.686747074127197,
829
+ "sampling/importance_sampling_ratio/max": 2.5669615268707275,
830
+ "sampling/importance_sampling_ratio/mean": 1.038097620010376,
831
+ "sampling/importance_sampling_ratio/min": 0.36189621686935425,
832
+ "sampling/sampling_logp_difference/max": 0.9301660060882568,
833
+ "sampling/sampling_logp_difference/mean": 0.04619593545794487,
834
+ "step": 25,
835
+ "step_time": 7.486579578000146
836
+ },
837
+ {
838
+ "clip_ratio/high_max": 0.057291666977107525,
839
+ "clip_ratio/high_mean": 0.014322916744276881,
840
+ "clip_ratio/low_mean": 0.0026041667442768812,
841
+ "clip_ratio/low_min": 0.0,
842
+ "clip_ratio/region_mean": 0.016927083488553762,
843
+ "completions/clipped_ratio": 0.0,
844
+ "completions/max_length": 222.0,
845
+ "completions/max_terminated_length": 222.0,
846
+ "completions/mean_length": 164.546875,
847
+ "completions/mean_terminated_length": 164.546875,
848
+ "completions/min_length": 94.0,
849
+ "completions/min_terminated_length": 94.0,
850
+ "entropy": 0.1926758922636509,
851
+ "epoch": 0.00104,
852
+ "frac_reward_zero_std": 0.125,
853
+ "grad_norm": 1.1327283382415771,
854
+ "kl": 0.07225027051754296,
855
+ "learning_rate": 7.1428571428571436e-06,
856
+ "loss": -0.002,
857
+ "num_tokens": 886623.0,
858
+ "reward": 16.11737060546875,
859
+ "reward_std": 4.506648540496826,
860
+ "rewards/rollout_reward_func/mean": 16.11737060546875,
861
+ "rewards/rollout_reward_func/std": 10.432522773742676,
862
+ "sampling/importance_sampling_ratio/max": 2.0328967571258545,
863
+ "sampling/importance_sampling_ratio/mean": 0.9802088737487793,
864
+ "sampling/importance_sampling_ratio/min": 0.3310491144657135,
865
+ "sampling/sampling_logp_difference/max": 0.9447128772735596,
866
+ "sampling/sampling_logp_difference/mean": 0.046886004507541656,
867
+ "step": 26,
868
+ "step_time": 8.893368583999745
869
+ },
870
+ {
871
+ "clip_ratio/high_max": 0.03125,
872
+ "clip_ratio/high_mean": 0.0078125,
873
+ "clip_ratio/low_mean": 0.0007812500116415322,
874
+ "clip_ratio/low_min": 0.0,
875
+ "clip_ratio/region_mean": 0.008593750011641532,
876
+ "completions/clipped_ratio": 0.0,
877
+ "completions/max_length": 222.0,
878
+ "completions/max_terminated_length": 222.0,
879
+ "completions/mean_length": 172.734375,
880
+ "completions/mean_terminated_length": 172.734375,
881
+ "completions/min_length": 94.0,
882
+ "completions/min_terminated_length": 94.0,
883
+ "entropy": 0.22064625099301338,
884
+ "epoch": 0.00108,
885
+ "frac_reward_zero_std": 0.0,
886
+ "grad_norm": 1.0944156646728516,
887
+ "kl": 0.10417186049744487,
888
+ "learning_rate": 7.428571428571429e-06,
889
+ "loss": -0.0587,
890
+ "num_tokens": 923914.0,
891
+ "reward": 18.09270477294922,
892
+ "reward_std": 4.988367557525635,
893
+ "rewards/rollout_reward_func/mean": 18.09270477294922,
894
+ "rewards/rollout_reward_func/std": 10.039715766906738,
895
+ "sampling/importance_sampling_ratio/max": 2.6798079013824463,
896
+ "sampling/importance_sampling_ratio/mean": 0.9714202284812927,
897
+ "sampling/importance_sampling_ratio/min": 1.6848749771671695e-13,
898
+ "sampling/sampling_logp_difference/max": 27.938308715820312,
899
+ "sampling/sampling_logp_difference/mean": 0.14040334522724152,
900
+ "step": 27,
901
+ "step_time": 7.770603708000635
902
+ },
903
+ {
904
+ "clip_ratio/high_max": 0.03125,
905
+ "clip_ratio/high_mean": 0.0078125,
906
+ "clip_ratio/low_mean": 0.0,
907
+ "clip_ratio/low_min": 0.0,
908
+ "clip_ratio/region_mean": 0.0078125,
909
+ "completions/clipped_ratio": 0.0,
910
+ "completions/max_length": 222.0,
911
+ "completions/max_terminated_length": 222.0,
912
+ "completions/mean_length": 153.140625,
913
+ "completions/mean_terminated_length": 153.140625,
914
+ "completions/min_length": 94.0,
915
+ "completions/min_terminated_length": 94.0,
916
+ "entropy": 0.1478887596167624,
917
+ "epoch": 0.00112,
918
+ "frac_reward_zero_std": 0.0,
919
+ "grad_norm": 0.6470702886581421,
920
+ "kl": 0.2168107850011438,
921
+ "learning_rate": 7.714285714285716e-06,
922
+ "loss": -0.0223,
923
+ "num_tokens": 957287.0,
924
+ "reward": 18.42925262451172,
925
+ "reward_std": 4.345149040222168,
926
+ "rewards/rollout_reward_func/mean": 18.42925262451172,
927
+ "rewards/rollout_reward_func/std": 9.097280502319336,
928
+ "sampling/importance_sampling_ratio/max": 2.399390697479248,
929
+ "sampling/importance_sampling_ratio/mean": 1.0000016689300537,
930
+ "sampling/importance_sampling_ratio/min": 0.18764659762382507,
931
+ "sampling/sampling_logp_difference/max": 1.446092128753662,
932
+ "sampling/sampling_logp_difference/mean": 0.07286648452281952,
933
+ "step": 28,
934
+ "step_time": 7.452310326000543
935
+ },
936
+ {
937
+ "clip_ratio/high_max": 0.015625,
938
+ "clip_ratio/high_mean": 0.00390625,
939
+ "clip_ratio/low_mean": 0.0026041667442768812,
940
+ "clip_ratio/low_min": 0.0,
941
+ "clip_ratio/region_mean": 0.006510416744276881,
942
+ "completions/clipped_ratio": 0.0,
943
+ "completions/max_length": 216.0,
944
+ "completions/max_terminated_length": 216.0,
945
+ "completions/mean_length": 160.234375,
946
+ "completions/mean_terminated_length": 160.234375,
947
+ "completions/min_length": 2.0,
948
+ "completions/min_terminated_length": 2.0,
949
+ "entropy": 0.18869919329881668,
950
+ "epoch": 0.00116,
951
+ "frac_reward_zero_std": 0.0,
952
+ "grad_norm": 0.7812609672546387,
953
+ "kl": 0.12577429198427126,
954
+ "learning_rate": 8.000000000000001e-06,
955
+ "loss": 0.0295,
956
+ "num_tokens": 992314.0,
957
+ "reward": 17.83092498779297,
958
+ "reward_std": 4.7564473152160645,
959
+ "rewards/rollout_reward_func/mean": 17.83092498779297,
960
+ "rewards/rollout_reward_func/std": 8.652295112609863,
961
+ "sampling/importance_sampling_ratio/max": 2.8800876140594482,
962
+ "sampling/importance_sampling_ratio/mean": 0.9842413067817688,
963
+ "sampling/importance_sampling_ratio/min": 0.3151380717754364,
964
+ "sampling/sampling_logp_difference/max": 1.1866700649261475,
965
+ "sampling/sampling_logp_difference/mean": 0.08438973873853683,
966
+ "step": 29,
967
+ "step_time": 9.375173230999735
968
+ },
969
+ {
970
+ "clip_ratio/high_max": 0.0,
971
+ "clip_ratio/high_mean": 0.0,
972
+ "clip_ratio/low_mean": 0.0,
973
+ "clip_ratio/low_min": 0.0,
974
+ "clip_ratio/region_mean": 0.0,
975
+ "completions/clipped_ratio": 0.0,
976
+ "completions/max_length": 221.0,
977
+ "completions/max_terminated_length": 221.0,
978
+ "completions/mean_length": 131.859375,
979
+ "completions/mean_terminated_length": 131.859375,
980
+ "completions/min_length": 2.0,
981
+ "completions/min_terminated_length": 2.0,
982
+ "entropy": 0.1357386689633131,
983
+ "epoch": 0.0012,
984
+ "frac_reward_zero_std": 0.0,
985
+ "grad_norm": 0.8130350112915039,
986
+ "kl": 0.08997523193829693,
987
+ "learning_rate": 8.285714285714287e-06,
988
+ "loss": -0.0283,
989
+ "num_tokens": 1027697.0,
990
+ "reward": 13.44178295135498,
991
+ "reward_std": 2.9926509857177734,
992
+ "rewards/rollout_reward_func/mean": 13.441783905029297,
993
+ "rewards/rollout_reward_func/std": 4.6520209312438965,
994
+ "sampling/importance_sampling_ratio/max": 2.0554587841033936,
995
+ "sampling/importance_sampling_ratio/mean": 0.9725006818771362,
996
+ "sampling/importance_sampling_ratio/min": 0.0,
997
+ "sampling/sampling_logp_difference/max": 1.2659821510314941,
998
+ "sampling/sampling_logp_difference/mean": 0.05908963084220886,
999
+ "step": 30,
1000
+ "step_time": 7.3954712499999005
1001
+ },
1002
+ {
1003
+ "clip_ratio/high_max": 0.046875000931322575,
1004
+ "clip_ratio/high_mean": 0.011718750232830644,
1005
+ "clip_ratio/low_mean": 0.0,
1006
+ "clip_ratio/low_min": 0.0,
1007
+ "clip_ratio/region_mean": 0.011718750232830644,
1008
+ "completions/clipped_ratio": 0.0,
1009
+ "completions/max_length": 222.0,
1010
+ "completions/max_terminated_length": 222.0,
1011
+ "completions/mean_length": 145.84375,
1012
+ "completions/mean_terminated_length": 145.84375,
1013
+ "completions/min_length": 94.0,
1014
+ "completions/min_terminated_length": 94.0,
1015
+ "entropy": 0.16315596690401435,
1016
+ "epoch": 0.00124,
1017
+ "frac_reward_zero_std": 0.0,
1018
+ "grad_norm": 0.6934608221054077,
1019
+ "kl": 0.1859159953892231,
1020
+ "learning_rate": 8.571428571428571e-06,
1021
+ "loss": 0.0146,
1022
+ "num_tokens": 1063055.0,
1023
+ "reward": 18.718799591064453,
1024
+ "reward_std": 4.135279655456543,
1025
+ "rewards/rollout_reward_func/mean": 18.718799591064453,
1026
+ "rewards/rollout_reward_func/std": 7.419597148895264,
1027
+ "sampling/importance_sampling_ratio/max": 2.9382729530334473,
1028
+ "sampling/importance_sampling_ratio/mean": 1.0185017585754395,
1029
+ "sampling/importance_sampling_ratio/min": 0.15695802867412567,
1030
+ "sampling/sampling_logp_difference/max": 1.3760042190551758,
1031
+ "sampling/sampling_logp_difference/mean": 0.10743667185306549,
1032
+ "step": 31,
1033
+ "step_time": 7.8455955710001035
1034
  }
1035
  ],
1036
  "logging_steps": 1.0,
1037
+ "max_steps": 600,
1038
+ "num_input_tokens_seen": 1063055,
1039
  "num_train_epochs": 1,
1040
  "save_steps": 500,
1041
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:902e70f01b1cfe2aba74665cf0e6be5e37892fa9325a84c788eeda9494a62727
3
  size 8145
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a775c2b74232ad23885d216dd244a4a85eb0fae28310f94063736202117b7c8
3
  size 8145