optimizer.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. # --------------------------------------------------------
  2. # Modified by $@#Anonymous#@$
  3. # --------------------------------------------------------
  4. # Swin Transformer
  5. # Copyright (c) 2021 Microsoft
  6. # Licensed under The MIT License [see LICENSE for details]
  7. # Written by Ze Liu
  8. # --------------------------------------------------------
  9. from functools import partial
  10. from torch import optim as optim
  11. def build_optimizer(config, model, logger, simmim=False, is_pretrain=False, **kwargs):
  12. """
  13. Build optimizer, set weight decay of normalization to 0 by default.
  14. """
  15. logger.info(f"==============> building optimizer {config.TRAIN.OPTIMIZER.NAME}....................")
  16. skip = {}
  17. skip_keywords = {}
  18. if hasattr(model, 'no_weight_decay'):
  19. skip = model.no_weight_decay()
  20. if hasattr(model, 'no_weight_decay_keywords'):
  21. skip_keywords = model.no_weight_decay_keywords()
  22. if simmim:
  23. if is_pretrain:
  24. parameters = get_pretrain_param_groups(model, skip, skip_keywords)
  25. else:
  26. depths = config.MODEL.SWIN.DEPTHS if config.MODEL.TYPE == 'swin' else config.MODEL.SWINV2.DEPTHS
  27. num_layers = sum(depths)
  28. get_layer_func = partial(get_swin_layer, num_layers=num_layers + 2, depths=depths)
  29. scales = list(config.TRAIN.LAYER_DECAY ** i for i in reversed(range(num_layers + 2)))
  30. parameters = get_finetune_param_groups(model, config.TRAIN.BASE_LR, config.TRAIN.WEIGHT_DECAY, get_layer_func, scales, skip, skip_keywords)
  31. else:
  32. parameters, no_decay_names = set_weight_decay(model, skip, skip_keywords)
  33. logger.info(f"No weight decay list: {no_decay_names}")
  34. opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
  35. optimizer = None
  36. if opt_lower == 'sgd':
  37. optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
  38. lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
  39. elif opt_lower == 'adamw':
  40. optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
  41. lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
  42. else:
  43. raise NotImplementedError
  44. return optimizer
  45. def set_weight_decay(model, skip_list=(), skip_keywords=()):
  46. has_decay = []
  47. no_decay = []
  48. no_decay_names = []
  49. for name, param in model.named_parameters():
  50. if not param.requires_grad:
  51. continue # frozen weights
  52. if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
  53. check_keywords_in_name(name, skip_keywords):
  54. no_decay.append(param)
  55. no_decay_names.append(name)
  56. # print(f"{name} has no weight decay")
  57. else:
  58. has_decay.append(param)
  59. return [{'params': has_decay},
  60. {'params': no_decay, 'weight_decay': 0.}], no_decay_names
  61. def check_keywords_in_name(name, keywords=()):
  62. isin = False
  63. for keyword in keywords:
  64. if keyword in name:
  65. isin = True
  66. return isin
  67. # ==========================
# For MIM (SimMIM) training; currently unused and may contain bugs.
  69. def build_optimizer_swimmim(config, model, logger, simmim=True, is_pretrain=False):
  70. """
  71. Build optimizer, set weight decay of normalization to 0 by default.
  72. """
  73. skip = {}
  74. skip_keywords = {}
  75. if hasattr(model, 'no_weight_decay'):
  76. skip = model.no_weight_decay()
  77. if hasattr(model, 'no_weight_decay_keywords'):
  78. skip_keywords = model.no_weight_decay_keywords()
  79. if is_pretrain:
  80. parameters = get_pretrain_param_groups(model, skip, skip_keywords)
  81. else:
  82. depths = config.MODEL.SWIN.DEPTHS if config.MODEL.TYPE == 'swin' else config.MODEL.SWINV2.DEPTHS
  83. num_layers = sum(depths)
  84. get_layer_func = partial(get_swin_layer, num_layers=num_layers + 2, depths=depths)
  85. scales = list(config.TRAIN.LAYER_DECAY ** i for i in reversed(range(num_layers + 2)))
  86. parameters = get_finetune_param_groups(model, config.TRAIN.BASE_LR, config.TRAIN.WEIGHT_DECAY, get_layer_func, scales, skip, skip_keywords)
  87. opt_lower = config.TRAIN.OPTIMIZER.NAME.lower()
  88. optimizer = None
  89. if opt_lower == 'sgd':
  90. optimizer = optim.SGD(parameters, momentum=config.TRAIN.OPTIMIZER.MOMENTUM, nesterov=True,
  91. lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
  92. elif opt_lower == 'adamw':
  93. optimizer = optim.AdamW(parameters, eps=config.TRAIN.OPTIMIZER.EPS, betas=config.TRAIN.OPTIMIZER.BETAS,
  94. lr=config.TRAIN.BASE_LR, weight_decay=config.TRAIN.WEIGHT_DECAY)
  95. else:
  96. raise NotImplementedError
  97. return optimizer
  98. def get_pretrain_param_groups(model, skip_list=(), skip_keywords=()):
  99. has_decay = []
  100. no_decay = []
  101. has_decay_name = []
  102. no_decay_name = []
  103. for name, param in model.named_parameters():
  104. if not param.requires_grad:
  105. continue
  106. if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
  107. check_keywords_in_name(name, skip_keywords):
  108. no_decay.append(param)
  109. no_decay_name.append(name)
  110. else:
  111. has_decay.append(param)
  112. has_decay_name.append(name)
  113. return [{'params': has_decay},
  114. {'params': no_decay, 'weight_decay': 0.}]
  115. def get_swin_layer(name, num_layers, depths):
  116. if name in ("mask_token"):
  117. return 0
  118. elif name.startswith("patch_embed"):
  119. return 0
  120. elif name.startswith("layers"):
  121. layer_id = int(name.split('.')[1])
  122. block_id = name.split('.')[3]
  123. if block_id == 'reduction' or block_id == 'norm':
  124. return sum(depths[:layer_id + 1])
  125. layer_id = sum(depths[:layer_id]) + int(block_id)
  126. return layer_id + 1
  127. else:
  128. return num_layers - 1
  129. def get_finetune_param_groups(model, lr, weight_decay, get_layer_func, scales, skip_list=(), skip_keywords=()):
  130. parameter_group_names = {}
  131. parameter_group_vars = {}
  132. for name, param in model.named_parameters():
  133. if not param.requires_grad:
  134. continue
  135. if len(param.shape) == 1 or name.endswith(".bias") or (name in skip_list) or \
  136. check_keywords_in_name(name, skip_keywords):
  137. group_name = "no_decay"
  138. this_weight_decay = 0.
  139. else:
  140. group_name = "decay"
  141. this_weight_decay = weight_decay
  142. if get_layer_func is not None:
  143. layer_id = get_layer_func(name)
  144. group_name = "layer_%d_%s" % (layer_id, group_name)
  145. else:
  146. layer_id = None
  147. if group_name not in parameter_group_names:
  148. if scales is not None:
  149. scale = scales[layer_id]
  150. else:
  151. scale = 1.
  152. parameter_group_names[group_name] = {
  153. "group_name": group_name,
  154. "weight_decay": this_weight_decay,
  155. "params": [],
  156. "lr": lr * scale,
  157. "lr_scale": scale,
  158. }
  159. parameter_group_vars[group_name] = {
  160. "group_name": group_name,
  161. "weight_decay": this_weight_decay,
  162. "params": [],
  163. "lr": lr * scale,
  164. "lr_scale": scale
  165. }
  166. parameter_group_vars[group_name]["params"].append(param)
  167. parameter_group_names[group_name]["params"].append(name)
  168. return list(parameter_group_vars.values())