# optimizer.py
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------
from functools import partial

from torch import optim as optim

try:
    from apex.optimizers import FusedAdam, FusedLAMB
except ImportError:
    FusedAdam = None
    FusedLAMB = None
    print("To use FusedLAMB or FusedAdam, please install apex.")

def build_optimizer(config, model, simmim=False, is_pretrain=False):
    """
    Build optimizer, setting the weight decay of normalization layers to 0 by default.
    """
    skip = {}
    skip_keywords = {}
    if hasattr(model, 'no_weight_decay'):
        skip = model.no_weight_decay()
    if hasattr(model, 'no_weight_decay_keywords'):
        skip_keywords = model.no_weight_decay_keywords()
    if simmim:
        if is_pretrain:
            parameters = get_pretrain_param_groups(model, skip, skip_keywords)
        else:
            # SimMIM fine-tuning uses layer-wise lr decay: one scale per transformer block,
            # plus one for patch_embed and one for the head / final norm.
            depths = config.MODEL.SWIN.DEPTHS if config.MODEL.TYPE == 'swin' else config.MODEL.SWINV2.DEPTHS
            num_layers = sum(depths)
            get_layer_func = partial(get_swin_layer, num_layers=num_layers + 2, depths=depths)
            scales = list(config.TRAIN.LAYER_DECAY ** i for i in reversed(range(num_layers + 2)))
            parameters = get_finetune_param_groups(model, config.TRAIN.BASE_LR, config.TRAIN.WEIGHT_DECAY,
                                                   get_layer_func, scales, skip, skip_keywords)
    else:
        parameters = set_weight_decay(model, skip, skip_keywords)

    opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
    optimizer = None
    if opt_lower == 'sgd':
        optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
                              lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                                lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    elif opt_lower == 'fused_adam':
        optimizer = FusedAdam(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                              lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    elif opt_lower == 'fused_lamb':
        optimizer = FusedLAMB(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
                              lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
    return optimizer
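
# Illustrative usage sketch (not part of the original file). The config fields below are
# assumptions that mirror only what build_optimizer reads on the non-SimMIM path; actual
# training passes the repository's yacs config and a Swin model instead.
def _demo_build_optimizer():
    from types import SimpleNamespace

    import torch.nn as nn

    config = SimpleNamespace(
        TRAIN=SimpleNamespace(
            BASE_LR=5e-4,
            WEIGHT_DECAY=0.05,
            OPTIMIZER=SimpleNamespace(NAME='adamw', EPS=1e-8, BETAS=(0.9, 0.999), MOMENTUM=0.9),
        ),
    )
    # Tiny stand-in model: the Linear bias and both LayerNorm parameters fall into the
    # zero-weight-decay group, while the Linear weight keeps the configured decay.
    model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
    return build_optimizer(config, model, simmim=False, is_pretrain=False)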

def set_weight_decay(model, skip_list=(), skip_keywords=()):
    has_decay = []
    no_decay = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            no_decay.append(param)
            # print(f"{name} has no weight decay")
        else:
            has_decay.append(param)
    return [{'params': has_decay},
            {'params': no_decay, 'weight_decay': 0.}]
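
# Note: the split above routes every 1-D parameter (norm weights, biases) and any name
# reported by the model's no_weight_decay()/no_weight_decay_keywords() hooks into the
# zero-weight-decay group; all remaining parameters use the weight_decay that the
# optimizer is constructed with in build_optimizer.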

def check_keywords_in_name(name, keywords=()):
    isin = False
    for keyword in keywords:
        if keyword in name:
            isin = True
    return isin

def get_pretrain_param_groups(model, skip_list=(), skip_keywords=()):
    has_decay = []
    no_decay = []
    has_decay_name = []
    no_decay_name = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            no_decay.append(param)
            no_decay_name.append(name)
        else:
            has_decay.append(param)
            has_decay_name.append(name)
    return [{'params': has_decay},
            {'params': no_decay, 'weight_decay': 0.}]
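
# Note: has_decay_name / no_decay_name are collected but not returned; they are only
# useful for inspecting which parameters ended up in each group.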

def get_swin_layer(name, num_layers, depths):
    if name in ("mask_token",):  # one-element tuple; a bare ("mask_token") would be a substring check
        return 0
    elif name.startswith("patch_embed"):
        return 0
    elif name.startswith("layers"):
        layer_id = int(name.split('.')[1])
        block_id = name.split('.')[3]
        if block_id == 'reduction' or block_id == 'norm':
            return sum(depths[:layer_id + 1])
        layer_id = sum(depths[:layer_id]) + int(block_id)
        return layer_id + 1
    else:
        return num_layers - 1
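
# Worked example (assuming Swin-T depths (2, 2, 6, 2), so num_layers = sum(depths) + 2 = 14):
#   "mask_token", "patch_embed.proj.weight"     -> layer 0
#   "layers.0.blocks.1.attn.qkv.weight"         -> sum(depths[:0]) + 1 + 1 = 2
#   "layers.0.downsample.reduction.weight"      -> sum(depths[:1]) = 2
#   "norm.weight", "head.weight" (fallback)     -> num_layers - 1 = 13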

def get_finetune_param_groups(model, lr, weight_decay, get_layer_func, scales, skip_list=(), skip_keywords=()):
    parameter_group_names = {}
    parameter_group_vars = {}

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
                check_keywords_in_name(name, skip_keywords):
            group_name = "no_decay"
            this_weight_decay = 0.
        else:
            group_name = "decay"
            this_weight_decay = weight_decay
        if get_layer_func is not None:
            layer_id = get_layer_func(name)
            group_name = "layer_%d_%s" % (layer_id, group_name)
        else:
            layer_id = None

        if group_name not in parameter_group_names:
            if scales is not None:
                scale = scales[layer_id]
            else:
                scale = 1.

            parameter_group_names[group_name] = {
                "group_name": group_name,
                "weight_decay": this_weight_decay,
                "params": [],
                "lr": lr * scale,
                "lr_scale": scale,
            }
            parameter_group_vars[group_name] = {
                "group_name": group_name,
                "weight_decay": this_weight_decay,
                "params": [],
                "lr": lr * scale,
                "lr_scale": scale,
            }

        parameter_group_vars[group_name]["params"].append(param)
        parameter_group_names[group_name]["params"].append(name)
    return list(parameter_group_vars.values())
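
# Sketch of the resulting layer-wise lr decay (illustrative numbers: Swin-T depths
# (2, 2, 6, 2) and TRAIN.LAYER_DECAY = 0.9 are assumptions): build_optimizer passes
# scales[i] = 0.9 ** (13 - i) for the 14 layer ids, so the patch embedding (layer 0)
# gets lr = BASE_LR * 0.9 ** 13 ≈ 0.25 * BASE_LR, while the head / final norm
# (layer 13) trains at the full BASE_LR; each group also records the factor as "lr_scale".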