j05hr3d committed on
Commit
04f3cc5
·
verified ·
1 Parent(s): 3f3569a

Model save

Browse files
Files changed (3) hide show
  1. README.md +13 -12
  2. adapter_model.safetensors +1 -1
  3. trainer_state.json +134 -119
README.md CHANGED
@@ -19,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct) on the None dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.9492
23
 
24
  ## Model description
25
 
@@ -53,17 +53,18 @@ The following hyperparameters were used during training:
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:------:|:----:|:---------------:|
56
- | 1.0767 | 0.2974 | 20 | 1.0971 |
57
- | 0.8846 | 0.5948 | 40 | 1.0391 |
58
- | 0.8305 | 0.8922 | 60 | 1.0038 |
59
- | 0.7694 | 1.1784 | 80 | 0.9842 |
60
- | 0.8401 | 1.4758 | 100 | 0.9652 |
61
- | 0.7302 | 1.7732 | 120 | 0.9530 |
62
- | 0.7097 | 2.0595 | 140 | 0.9501 |
63
- | 0.7044 | 2.3569 | 160 | 0.9492 |
64
- | 0.5562 | 2.6543 | 180 | 0.9499 |
65
- | 0.6572 | 2.9517 | 200 | 0.9509 |
66
- | 0.5972 | 3.2379 | 220 | 0.9651 |
 
67
 
68
 
69
  ### Framework versions
 
19
 
20
  This model is a fine-tuned version of [Qwen/Qwen2.5-Coder-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct) on the None dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.9728
23
 
24
  ## Model description
25
 
 
53
 
54
  | Training Loss | Epoch | Step | Validation Loss |
55
  |:-------------:|:------:|:----:|:---------------:|
56
+ | 1.0721 | 0.2985 | 20 | 1.1757 |
57
+ | 0.8989 | 0.5970 | 40 | 1.1059 |
58
+ | 0.8293 | 0.8955 | 60 | 1.0656 |
59
+ | 0.787 | 1.1940 | 80 | 1.0364 |
60
+ | 0.7025 | 1.4925 | 100 | 1.0206 |
61
+ | 0.7386 | 1.7910 | 120 | 0.9961 |
62
+ | 0.7471 | 2.0896 | 140 | 0.9916 |
63
+ | 0.624 | 2.3881 | 160 | 0.9843 |
64
+ | 0.6839 | 2.6866 | 180 | 0.9728 |
65
+ | 0.6561 | 2.9851 | 200 | 0.9737 |
66
+ | 0.6027 | 3.2836 | 220 | 0.9785 |
67
+ | 0.5221 | 3.5821 | 240 | 0.9843 |
68
 
69
 
70
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b68179dd4961db59364133b7edb05644a28ecf1e34552b96ba1192bc7d522dae
3
  size 239536272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80eb04aaf7473993b1e707e8b1c6ef2520334bba07acf6b2237ee6fdd682bfe0
3
  size 239536272
trainer_state.json CHANGED
@@ -1,199 +1,214 @@
1
  {
2
- "best_global_step": 160,
3
- "best_metric": 0.9492282867431641,
4
- "best_model_checkpoint": "j05hr3d/SFT-Qwen2.5-Coder-3B/checkpoint-160",
5
- "epoch": 3.2379182156133828,
6
  "eval_steps": 20,
7
- "global_step": 220,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.29739776951672864,
14
- "grad_norm": 0.38066861033439636,
15
- "learning_rate": 9.756838905775076e-05,
16
- "loss": 1.0767,
17
  "step": 20
18
  },
19
  {
20
- "epoch": 0.29739776951672864,
21
- "eval_loss": 1.0970656871795654,
22
- "eval_runtime": 12.8563,
23
- "eval_samples_per_second": 4.511,
24
- "eval_steps_per_second": 0.622,
25
  "step": 20
26
  },
27
  {
28
- "epoch": 0.5947955390334573,
29
- "grad_norm": 0.49776822328567505,
30
- "learning_rate": 9.148936170212766e-05,
31
- "loss": 0.8846,
32
  "step": 40
33
  },
34
  {
35
- "epoch": 0.5947955390334573,
36
- "eval_loss": 1.0391403436660767,
37
- "eval_runtime": 10.5598,
38
- "eval_samples_per_second": 5.493,
39
- "eval_steps_per_second": 0.758,
40
  "step": 40
41
  },
42
  {
43
- "epoch": 0.8921933085501859,
44
- "grad_norm": 0.47521692514419556,
45
- "learning_rate": 8.541033434650457e-05,
46
- "loss": 0.8305,
47
  "step": 60
48
  },
49
  {
50
- "epoch": 0.8921933085501859,
51
- "eval_loss": 1.0037544965744019,
52
- "eval_runtime": 10.5615,
53
- "eval_samples_per_second": 5.492,
54
- "eval_steps_per_second": 0.757,
55
  "step": 60
56
  },
57
  {
58
- "epoch": 1.178438661710037,
59
- "grad_norm": 0.5635558366775513,
60
- "learning_rate": 7.933130699088146e-05,
61
- "loss": 0.7694,
62
  "step": 80
63
  },
64
  {
65
- "epoch": 1.178438661710037,
66
- "eval_loss": 0.9841997623443604,
67
- "eval_runtime": 10.561,
68
- "eval_samples_per_second": 5.492,
69
- "eval_steps_per_second": 0.758,
70
  "step": 80
71
  },
72
  {
73
- "epoch": 1.4758364312267658,
74
- "grad_norm": 0.47897258400917053,
75
- "learning_rate": 7.325227963525836e-05,
76
- "loss": 0.8401,
77
  "step": 100
78
  },
79
  {
80
- "epoch": 1.4758364312267658,
81
- "eval_loss": 0.9652481079101562,
82
- "eval_runtime": 10.5639,
83
- "eval_samples_per_second": 5.49,
84
- "eval_steps_per_second": 0.757,
85
  "step": 100
86
  },
87
  {
88
- "epoch": 1.7732342007434945,
89
- "grad_norm": 0.34135618805885315,
90
- "learning_rate": 6.717325227963525e-05,
91
- "loss": 0.7302,
92
  "step": 120
93
  },
94
  {
95
- "epoch": 1.7732342007434945,
96
- "eval_loss": 0.9529991149902344,
97
- "eval_runtime": 10.5682,
98
- "eval_samples_per_second": 5.488,
99
- "eval_steps_per_second": 0.757,
100
  "step": 120
101
  },
102
  {
103
- "epoch": 2.059479553903346,
104
- "grad_norm": 0.47506430745124817,
105
- "learning_rate": 6.109422492401215e-05,
106
- "loss": 0.7097,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 2.059479553903346,
111
- "eval_loss": 0.9500848650932312,
112
- "eval_runtime": 10.5677,
113
- "eval_samples_per_second": 5.488,
114
- "eval_steps_per_second": 0.757,
115
  "step": 140
116
  },
117
  {
118
- "epoch": 2.356877323420074,
119
- "grad_norm": 0.826922595500946,
120
- "learning_rate": 5.5015197568389065e-05,
121
- "loss": 0.7044,
122
  "step": 160
123
  },
124
  {
125
- "epoch": 2.356877323420074,
126
- "eval_loss": 0.9492282867431641,
127
- "eval_runtime": 10.615,
128
- "eval_samples_per_second": 5.464,
129
- "eval_steps_per_second": 0.754,
130
  "step": 160
131
  },
132
  {
133
- "epoch": 2.654275092936803,
134
- "grad_norm": 1.0807068347930908,
135
- "learning_rate": 4.893617021276596e-05,
136
- "loss": 0.5562,
137
  "step": 180
138
  },
139
  {
140
- "epoch": 2.654275092936803,
141
- "eval_loss": 0.9498729109764099,
142
- "eval_runtime": 10.6388,
143
- "eval_samples_per_second": 5.452,
144
- "eval_steps_per_second": 0.752,
145
  "step": 180
146
  },
147
  {
148
- "epoch": 2.9516728624535316,
149
- "grad_norm": 0.8790673613548279,
150
- "learning_rate": 4.2857142857142856e-05,
151
- "loss": 0.6572,
152
  "step": 200
153
  },
154
  {
155
- "epoch": 2.9516728624535316,
156
- "eval_loss": 0.9508957862854004,
157
- "eval_runtime": 10.6216,
158
- "eval_samples_per_second": 5.461,
159
- "eval_steps_per_second": 0.753,
160
  "step": 200
161
  },
162
  {
163
- "epoch": 3.2379182156133828,
164
- "grad_norm": 0.7400524020195007,
165
- "learning_rate": 3.677811550151976e-05,
166
- "loss": 0.5972,
167
  "step": 220
168
  },
169
  {
170
- "epoch": 3.2379182156133828,
171
- "eval_loss": 0.9650547504425049,
172
- "eval_runtime": 10.6208,
173
- "eval_samples_per_second": 5.461,
174
- "eval_steps_per_second": 0.753,
175
  "step": 220
176
  },
177
  {
178
- "epoch": 3.2379182156133828,
179
- "step": 220,
180
- "total_flos": 3.426567206333645e+16,
181
- "train_loss": 0.7596470009196888,
182
- "train_runtime": 958.9571,
183
- "train_samples_per_second": 2.8,
184
- "train_steps_per_second": 0.355
185
  },
186
  {
187
- "epoch": 3.2379182156133828,
188
- "eval_loss": 0.9492282867431641,
189
- "eval_runtime": 10.6264,
190
- "eval_samples_per_second": 5.458,
191
- "eval_steps_per_second": 0.753,
192
- "step": 220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  }
194
  ],
195
  "logging_steps": 20,
196
- "max_steps": 340,
197
  "num_input_tokens_seen": 0,
198
  "num_train_epochs": 5,
199
  "save_steps": 20,
@@ -218,7 +233,7 @@
218
  "attributes": {}
219
  }
220
  },
221
- "total_flos": 3.426567206333645e+16,
222
  "train_batch_size": 2,
223
  "trial_name": null,
224
  "trial_params": null
 
1
  {
2
+ "best_global_step": 180,
3
+ "best_metric": 0.9728425145149231,
4
+ "best_model_checkpoint": "j05hr3d/SFT-Qwen2.5-Coder-3B/checkpoint-180",
5
+ "epoch": 3.582089552238806,
6
  "eval_steps": 20,
7
+ "global_step": 240,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.29850746268656714,
14
+ "grad_norm": 0.48786428570747375,
15
+ "learning_rate": 9.753086419753087e-05,
16
+ "loss": 1.0721,
17
  "step": 20
18
  },
19
  {
20
+ "epoch": 0.29850746268656714,
21
+ "eval_loss": 1.1757386922836304,
22
+ "eval_runtime": 10.4355,
23
+ "eval_samples_per_second": 5.75,
24
+ "eval_steps_per_second": 0.767,
25
  "step": 20
26
  },
27
  {
28
+ "epoch": 0.5970149253731343,
29
+ "grad_norm": 0.688591718673706,
30
+ "learning_rate": 9.135802469135802e-05,
31
+ "loss": 0.8989,
32
  "step": 40
33
  },
34
  {
35
+ "epoch": 0.5970149253731343,
36
+ "eval_loss": 1.1058913469314575,
37
+ "eval_runtime": 8.5976,
38
+ "eval_samples_per_second": 6.979,
39
+ "eval_steps_per_second": 0.93,
40
  "step": 40
41
  },
42
  {
43
+ "epoch": 0.8955223880597015,
44
+ "grad_norm": 0.4469507336616516,
45
+ "learning_rate": 8.518518518518518e-05,
46
+ "loss": 0.8293,
47
  "step": 60
48
  },
49
  {
50
+ "epoch": 0.8955223880597015,
51
+ "eval_loss": 1.0656379461288452,
52
+ "eval_runtime": 8.5963,
53
+ "eval_samples_per_second": 6.98,
54
+ "eval_steps_per_second": 0.931,
55
  "step": 60
56
  },
57
  {
58
+ "epoch": 1.1940298507462686,
59
+ "grad_norm": 0.3167116641998291,
60
+ "learning_rate": 7.901234567901235e-05,
61
+ "loss": 0.787,
62
  "step": 80
63
  },
64
  {
65
+ "epoch": 1.1940298507462686,
66
+ "eval_loss": 1.0364410877227783,
67
+ "eval_runtime": 8.6008,
68
+ "eval_samples_per_second": 6.976,
69
+ "eval_steps_per_second": 0.93,
70
  "step": 80
71
  },
72
  {
73
+ "epoch": 1.4925373134328357,
74
+ "grad_norm": 0.588107705116272,
75
+ "learning_rate": 7.283950617283951e-05,
76
+ "loss": 0.7025,
77
  "step": 100
78
  },
79
  {
80
+ "epoch": 1.4925373134328357,
81
+ "eval_loss": 1.0205715894699097,
82
+ "eval_runtime": 8.6057,
83
+ "eval_samples_per_second": 6.972,
84
+ "eval_steps_per_second": 0.93,
85
  "step": 100
86
  },
87
  {
88
+ "epoch": 1.7910447761194028,
89
+ "grad_norm": 0.6192528009414673,
90
+ "learning_rate": 6.666666666666667e-05,
91
+ "loss": 0.7386,
92
  "step": 120
93
  },
94
  {
95
+ "epoch": 1.7910447761194028,
96
+ "eval_loss": 0.9961486458778381,
97
+ "eval_runtime": 8.5981,
98
+ "eval_samples_per_second": 6.978,
99
+ "eval_steps_per_second": 0.93,
100
  "step": 120
101
  },
102
  {
103
+ "epoch": 2.08955223880597,
104
+ "grad_norm": 0.49059680104255676,
105
+ "learning_rate": 6.049382716049383e-05,
106
+ "loss": 0.7471,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 2.08955223880597,
111
+ "eval_loss": 0.9915516972541809,
112
+ "eval_runtime": 8.6047,
113
+ "eval_samples_per_second": 6.973,
114
+ "eval_steps_per_second": 0.93,
115
  "step": 140
116
  },
117
  {
118
+ "epoch": 2.388059701492537,
119
+ "grad_norm": 0.41391539573669434,
120
+ "learning_rate": 5.4320987654320986e-05,
121
+ "loss": 0.624,
122
  "step": 160
123
  },
124
  {
125
+ "epoch": 2.388059701492537,
126
+ "eval_loss": 0.9842909574508667,
127
+ "eval_runtime": 8.6,
128
+ "eval_samples_per_second": 6.977,
129
+ "eval_steps_per_second": 0.93,
130
  "step": 160
131
  },
132
  {
133
+ "epoch": 2.6865671641791042,
134
+ "grad_norm": 0.5299602746963501,
135
+ "learning_rate": 4.814814814814815e-05,
136
+ "loss": 0.6839,
137
  "step": 180
138
  },
139
  {
140
+ "epoch": 2.6865671641791042,
141
+ "eval_loss": 0.9728425145149231,
142
+ "eval_runtime": 8.6028,
143
+ "eval_samples_per_second": 6.974,
144
+ "eval_steps_per_second": 0.93,
145
  "step": 180
146
  },
147
  {
148
+ "epoch": 2.9850746268656714,
149
+ "grad_norm": 0.6335314512252808,
150
+ "learning_rate": 4.197530864197531e-05,
151
+ "loss": 0.6561,
152
  "step": 200
153
  },
154
  {
155
+ "epoch": 2.9850746268656714,
156
+ "eval_loss": 0.973730206489563,
157
+ "eval_runtime": 8.5961,
158
+ "eval_samples_per_second": 6.98,
159
+ "eval_steps_per_second": 0.931,
160
  "step": 200
161
  },
162
  {
163
+ "epoch": 3.283582089552239,
164
+ "grad_norm": 0.6432749032974243,
165
+ "learning_rate": 3.580246913580247e-05,
166
+ "loss": 0.6027,
167
  "step": 220
168
  },
169
  {
170
+ "epoch": 3.283582089552239,
171
+ "eval_loss": 0.9784586429595947,
172
+ "eval_runtime": 8.5988,
173
+ "eval_samples_per_second": 6.978,
174
+ "eval_steps_per_second": 0.93,
175
  "step": 220
176
  },
177
  {
178
+ "epoch": 3.582089552238806,
179
+ "grad_norm": 0.638783872127533,
180
+ "learning_rate": 2.962962962962963e-05,
181
+ "loss": 0.5221,
182
+ "step": 240
 
 
183
  },
184
  {
185
+ "epoch": 3.582089552238806,
186
+ "eval_loss": 0.9842756986618042,
187
+ "eval_runtime": 8.5989,
188
+ "eval_samples_per_second": 6.978,
189
+ "eval_steps_per_second": 0.93,
190
+ "step": 240
191
+ },
192
+ {
193
+ "epoch": 3.582089552238806,
194
+ "step": 240,
195
+ "total_flos": 3.78148862442455e+16,
196
+ "train_loss": 0.7386853178342183,
197
+ "train_runtime": 1029.4211,
198
+ "train_samples_per_second": 2.599,
199
+ "train_steps_per_second": 0.325
200
+ },
201
+ {
202
+ "epoch": 3.582089552238806,
203
+ "eval_loss": 0.9728425145149231,
204
+ "eval_runtime": 8.6231,
205
+ "eval_samples_per_second": 6.958,
206
+ "eval_steps_per_second": 0.928,
207
+ "step": 240
208
  }
209
  ],
210
  "logging_steps": 20,
211
+ "max_steps": 335,
212
  "num_input_tokens_seen": 0,
213
  "num_train_epochs": 5,
214
  "save_steps": 20,
 
233
  "attributes": {}
234
  }
235
  },
236
+ "total_flos": 3.78148862442455e+16,
237
  "train_batch_size": 2,
238
  "trial_name": null,
239
  "trial_params": null