GitHub Repository: ai-forever/sber-swap
Path: blob/main/apex/tests/L0/run_optimizers/test_fused_optimizer.py
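# Tests in this file compare each apex fused optimizer (FusedAdam,
# FusedAdagrad, FusedSGD) against its torch.optim reference implementation:
# both optimizers are stepped for several iterations on identical parameters
# and identical random gradients, and the maximum absolute and relative
# parameter differences are asserted to stay within the tolerances set in
# TestFusedOptimizer.setUp().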
import unittest
import os
import random

import math
import torch
import apex
from itertools import product
from torch.optim import Optimizer


class TestFusedOptimizer(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.cuda.manual_seed(9876)

    def tearDown(self):
        pass

    def gen_param_optim(self, tensors, options, tst_options=None):

        # Keep backward compatibility with existing tests: if "tst_options"
        # is not provided, the test optimizer falls back to "options", the
        # parameters of the reference optimizer.
        if tst_options is None:
            tst_options = options

        ref_param = []
        tst_param = []
        for tensor in tensors:
            ref_param.append(torch.nn.Parameter(tensor.clone()))
            tst_param.append(torch.nn.Parameter(tensor.clone()))

        ref_optim = self.ref_optim(ref_param, **options)
        tst_optim = self.fused_optim(tst_param, **tst_options)

        return (ref_param, tst_param, ref_optim, tst_optim)

    def gen_grad(self, ref_param, tst_param):
        for p_ref, p_tst in zip(ref_param, tst_param):
            p_ref.grad = torch.rand_like(p_ref)
            p_tst.grad = p_ref.grad

    def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
        half_grads = []
        for p_ref, p_tst in zip(ref_param, tst_param):
            half_grads.append(torch.rand_like(p_ref).half())
            p_ref.grad = half_grads[-1].float() / scale
        return half_grads

    def get_max_diff(self, ref_param, tst_param):
        max_abs_diff = max_rel_diff = 0
        for p_ref, p_tst in zip(ref_param, tst_param):
            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()

            if max_abs_diff_p > max_abs_diff:
                max_abs_diff = max_abs_diff_p
            if max_rel_diff_p > max_rel_diff:
                max_rel_diff = max_rel_diff_p

        return max_abs_diff, max_rel_diff

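    # Worked example of the comparison above: with the default tolerances from
    # setUp(), a reference value of 1.0000 and a fused value of 1.0005 give an
    # absolute difference of 5e-4 and a relative difference of 5e-4, both
    # within max_abs_diff=1e-3 and max_rel_diff=1.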
    def gen_single_type_test(self, param_type=torch.float, device='cuda'):
        nelem = 278011

        # Some reference and test optimizers may require different sets of
        # options. This is a quick workaround to add that functionality while
        # making minimal changes to existing code. If no "tst_options"
        # attribute is provided, it is safe to initialize the test optimizer
        # with the parameters of the reference optimizer.
        if not hasattr(self, 'tst_options'):
            self.tst_options = self.options

        tensor = torch.rand(nelem, dtype=param_type, device=device)

        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], self.options, self.tst_options)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


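# Each concrete subclass below supplies three attributes consumed by the
# helpers in TestFusedOptimizer: self.options (kwargs for the reference
# optimizer), self.ref_optim (the torch.optim reference class), and
# self.fused_optim (the apex fused optimizer under test).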
class TestFusedAdam(TestFusedOptimizer):

    def __init__(self, *args, **kwargs):
        super(TestFusedAdam, self).__init__(*args, **kwargs)
        self.options = {'lr': 5e-4, 'betas': (0.9, 0.999), 'eps': 1e-08,
                        'weight_decay': 0, 'amsgrad': False}
        self.ref_optim = torch.optim.Adam
        self.fused_optim = apex.optimizers.FusedAdam

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    @unittest.skip('Disable until 8/1/2019 adam/adamw upstream picked')
    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim(tensors, self.options)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skip('No longer support fuse scaling')
    def test_scale(self):
        nelem = 278011
        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], self.options)

        for i in range(self.iters):
            scale = random.random() * 1000
            half_grads = self.gen_mixed_grad(ref_param, tst_param, scale)
            ref_optim.step()
            tst_optim.step(grads=half_grads, scale=scale)
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skip('No longer support output fp16 param')
    def test_fp16_output(self):
        nelem = 278011

        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], self.options)

        fp16_param = torch.nn.Parameter(tensor.clone().half())

        for i in range(self.iters):
            half_grads = self.gen_mixed_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step(grads=half_grads, output_params=[fp16_param])

            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

            max_abs_diff, max_rel_diff = self.get_max_diff(
                tst_param, [fp16_param.float()])
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_adam_option(self):
        nelem = 1
        adam_option = {'lr': 0.01, 'betas': (0.6, 0.9), 'eps': 3e-06,
                       'weight_decay': 0, 'amsgrad': False}

        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], adam_option)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


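# For context, apex.optimizers.FusedAdam (exercised above against
# torch.optim.Adam) follows the standard torch.optim.Optimizer interface.
# A minimal usage sketch, not part of the test suite; the model and shapes
# are hypothetical and a CUDA-enabled apex build is assumed:
#
#     model = torch.nn.Linear(1024, 1024).cuda()
#     optimizer = apex.optimizers.FusedAdam(
#         model.parameters(), lr=5e-4, betas=(0.9, 0.999), eps=1e-08,
#         weight_decay=0)
#     loss = model(torch.randn(16, 1024, device='cuda')).sum()
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()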
class TestFusedAdagrad(TestFusedOptimizer):
    def __init__(self, *args, **kwargs):
        super(TestFusedAdagrad, self).__init__(*args, **kwargs)
        self.options = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
        self.ref_optim = torch.optim.Adagrad
        self.fused_optim = apex.optimizers.FusedAdagrad

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            tensors, adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_params_different_devices_throws(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for i, size in enumerate(sizes):
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda:" + str(i % 2)))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            tensors, adagrad_option
        )
        self.gen_grad(ref_param, tst_param)
        with self.assertRaisesRegex(RuntimeError, "not on the same device"):
            tst_optim.step()

    def test_adagrad_option(self):
        nelem = 1
        adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


class TestFusedSGD(TestFusedOptimizer):
    def __init__(self, *args, **kwargs):
        super(TestFusedSGD, self).__init__(*args, **kwargs)
        self.options = {"lr": .25, "momentum": .125}
        self.ref_optim = torch.optim.SGD
        self.fused_optim = apex.optimizers.FusedSGD

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

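# These tests need a CUDA-capable GPU and an apex installation built with its
# fused optimizer extensions. With the standard unittest entry point below, a
# single suite or test can be selected from the command line, e.g.
# "python test_fused_optimizer.py TestFusedAdam.test_float".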
if __name__ == '__main__':
    unittest.main()