# test_fused_optimizer.py

from itertools import product
import random
import unittest

import torch

import apex


class TestFusedOptimizer(unittest.TestCase):
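    """Common harness: run a reference torch.optim optimizer and its apex fused
    counterpart on identical parameters and gradients for a few steps, then
    check that the resulting parameters agree within the configured tolerances."""
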
    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.manual_seed(9876)

    def tearDown(self):
        pass

    def gen_param_optim(self, tensors, options, tst_options=None):
        # Keep the helper backward compatible with existing tests: if
        # "tst_options" is not provided, the test optimizer is initialized
        # with the same options as the reference optimizer.
        if tst_options is None:
            tst_options = options

        ref_param = []
        tst_param = []
        for tensor in tensors:
            ref_param.append(torch.nn.Parameter(tensor.clone()))
            tst_param.append(torch.nn.Parameter(tensor.clone()))

        ref_optim = self.ref_optim(ref_param, **options)
        tst_optim = self.fused_optim(tst_param, **tst_options)

        return ref_param, tst_param, ref_optim, tst_optim

    def gen_grad(self, ref_param, tst_param):
        for p_ref, p_tst in zip(ref_param, tst_param):
            p_ref.grad = torch.rand_like(p_ref)
            p_tst.grad = p_ref.grad

    def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
        half_grads = []
        for p_ref, p_tst in zip(ref_param, tst_param):
            half_grads.append(torch.rand_like(p_ref).half())
            p_ref.grad = half_grads[-1].float() / scale
        return half_grads

    def get_max_diff(self, ref_param, tst_param):
        max_abs_diff = max_rel_diff = 0
        for p_ref, p_tst in zip(ref_param, tst_param):
            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()

            if max_abs_diff_p > max_abs_diff:
                max_abs_diff = max_abs_diff_p
            if max_rel_diff_p > max_rel_diff:
                max_rel_diff = max_rel_diff_p

        return max_abs_diff, max_rel_diff

    def gen_single_type_test(self, param_type=torch.float, device='cuda', *, skip_assert: bool = False):
        nelem = 278011

        # Some reference and test optimizers may require different sets of
        # options. This is a quick workaround to support that while making
        # minimal changes to existing code: if no "tst_options" attribute was
        # set by the subclass, it is safe to initialize the test optimizer
        # with the same options as the reference optimizer.
        if not hasattr(self, 'tst_options'):
            self.tst_options = self.options

        tensor = torch.rand(nelem, dtype=param_type, device=device)
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], self.options, self.tst_options)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            if skip_assert:
                return
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
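
    # Subclasses plug a concrete optimizer pair into this harness by setting
    # `self.options`, `self.ref_optim`, and `self.fused_optim` (the classes
    # below are the real cases); a minimal, purely illustrative subclass would
    # look like:
    #
    #     class TestMyFusedOptim(TestFusedOptimizer):
    #         def setUp(self):
    #             super().setUp()
    #             self.options = {"lr": 1e-3}
    #             self.ref_optim = torch.optim.SGD
    #             self.fused_optim = apex.optimizers.FusedSGD
    #
    #         def test_float(self):
    #             self.gen_single_type_test(param_type=torch.float)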


class TestFusedAdam(TestFusedOptimizer):
    def setUp(self):
        super().setUp()
        self.options = {'lr': 5e-4, 'betas': (0.9, 0.999), 'eps': 1e-08,
                        'weight_decay': 0, 'amsgrad': False}
        self.ref_optim = torch.optim.Adam
        self.fused_optim = apex.optimizers.FusedAdam

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    # NOTE(mkozuki): Current threshold values look too small for BFloat16.
    # TODO(mkozuki): Refactor `TestFusedOptimizer`
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16, skip_assert=True)

    def test_bfloat16(self):
        self.gen_single_type_test(param_type=torch.bfloat16, skip_assert=True)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    @unittest.skip('Disable until 8/1/2019 adam/adamw upstream picked')
    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim(tensors, self.options)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skip('No longer support fuse scaling')
    def test_scale(self):
        nelem = 278011
        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], self.options)

        for i in range(self.iters):
            scale = random.random() * 1000
            half_grads = self.gen_mixed_grad(ref_param, tst_param, scale)
            ref_optim.step()
            tst_optim.step(grads=half_grads, scale=scale)
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skip('No longer support output fp16 param')
    def test_fp16_output(self):
        nelem = 278011
        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], self.options)

        fp16_param = torch.nn.Parameter(tensor.clone().half())

        for i in range(self.iters):
            half_grads = self.gen_mixed_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step(grads=half_grads, output_params=[fp16_param])

            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

            max_abs_diff, max_rel_diff = self.get_max_diff(
                tst_param, [fp16_param.float()])
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_adam_option(self):
        nelem = 1
        adam_option = {'lr': 0.01, 'betas': (0.6, 0.9), 'eps': 3e-06,
                       'weight_decay': 0, 'amsgrad': False}

        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], adam_option)

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_frozen_model(self):
        nelem = 1
        adam_option = {'lr': 0.01, 'betas': (0.6, 0.9), 'eps': 3e-06,
                       'weight_decay': 0, 'amsgrad': False}

        tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
        ref_param, tst_param, ref_optim, tst_optim = \
            self.gen_param_optim([tensor], adam_option)

        # Add an empty param group, which may occur for pipeline parallel p-tuning.
        tst_optim.add_param_group({"params": []})

        for i in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


class TestFusedAdagrad(TestFusedOptimizer):
    def __init__(self, *args, **kwargs):
        super(TestFusedAdagrad, self).__init__(*args, **kwargs)
        self.options = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
        self.ref_optim = torch.optim.Adagrad
        self.fused_optim = apex.optimizers.FusedAdagrad

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)

    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            tensors, adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_params_different_devices_throws(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for i, size in enumerate(sizes):
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda:" + str(i % 2)))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            tensors, adagrad_option
        )
        self.gen_grad(ref_param, tst_param)
        with self.assertRaisesRegex(RuntimeError, "not on the same device"):
            tst_optim.step()

    def test_adagrad_option(self):
        nelem = 1
        adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)


class TestFusedSGD(TestFusedOptimizer):
    def __init__(self, *args, **kwargs):
        super(TestFusedSGD, self).__init__(*args, **kwargs)
        self.options = {"lr": .25, "momentum": .125}
        self.ref_optim = torch.optim.SGD
        self.fused_optim = apex.optimizers.FusedSGD

    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    @unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
    def test_multi_device(self):
        devices = ("cuda:0", "cuda:1")
        for current_dev, tensor_dev in product(devices, devices):
            with torch.cuda.device(current_dev):
                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
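

# These tests need a CUDA device and an apex install that includes the fused
# optimizer extensions. A typical invocation (adjust the path to your
# checkout) is:
#     python test_fused_optimizer.py
# or, via pytest:
#     python -m pytest test_fused_optimizer.py -v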


if __name__ == '__main__':
    unittest.main()