transformer_blocks.py

import copy
import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import Tensor, nn

# modified from https://github.com/microsoft/X-Decoder/blob/main/xdecoder/body/transformer_blocks.py  # noqa
"""Transformer class.

Copy-paste from torch.nn.Transformer with modifications:
    * positional encodings are passed in MHattention
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
"""


class Conv2d(torch.nn.Conv2d):
    """A wrapper around :class:`torch.nn.Conv2d` that supports extra features:
    an optional normalization layer and activation function applied after the
    convolution."""

    def __init__(self, *args, **kwargs):
        """Extra keyword arguments supported in addition to those in
        `torch.nn.Conv2d`:

        Args:
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation
                function

        It assumes that the norm layer is applied before the activation.
        """
        norm = kwargs.pop('norm', None)
        activation = kwargs.pop('activation', None)
        super().__init__(*args, **kwargs)

        self.norm = norm
        self.activation = activation

    def forward(self, x):
        x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
                     self.dilation, self.groups)
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x
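

# Usage sketch (illustrative only, not from the original code; the channel
# count and GroupNorm settings are assumptions):
#
#   conv = Conv2d(256, 256, kernel_size=3, padding=1,
#                 norm=nn.GroupNorm(32, 256), activation=F.relu)
#   feat = torch.randn(2, 256, 32, 32)
#   out = conv(feat)  # conv -> norm -> activation, shape (2, 256, 32, 32)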


class PositionEmbeddingSine(nn.Module):
    """This is a more standard version of the position embedding, very similar
    to the one used by the Attention Is All You Need paper, generalized to
    work on images."""

    def __init__(self,
                 num_pos_feats=64,
                 temperature=10000,
                 normalize=False,
                 scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError('normalize should be True if scale is passed')
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, x, mask=None):
        if mask is None:
            mask = torch.zeros((x.size(0), x.size(2), x.size(3)),
                               device=x.device,
                               dtype=torch.bool)
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=x.dtype)
        x_embed = not_mask.cumsum(2, dtype=x.dtype)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(
            self.num_pos_feats, dtype=x.dtype, device=x.device)
        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
            dim=4).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
            dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos

    def __repr__(self, _repr_indent=4):
        head = 'Positional encoding ' + self.__class__.__name__
        body = [
            'num_pos_feats: {}'.format(self.num_pos_feats),
            'temperature: {}'.format(self.temperature),
            'normalize: {}'.format(self.normalize),
            'scale: {}'.format(self.scale),
        ]
        lines = [head] + [' ' * _repr_indent + line for line in body]
        return '\n'.join(lines)
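

# Usage sketch (illustrative; the feature-map shape is an assumption). With
# num_pos_feats = d_model // 2, the output channel count matches d_model:
#
#   pos_enc = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
#   feat = torch.randn(2, 256, 32, 32)   # (B, C, H, W)
#   pos = pos_enc(feat)                  # (B, 2 * num_pos_feats, H, W) = (2, 256, 32, 32)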


class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(
        self,
        src,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        output = src
        for layer in self.layers:
            output = layer(
                output,
                src_mask=mask,
                src_key_padding_mask=src_key_padding_mask,
                pos=pos)
        if self.norm is not None:
            output = self.norm(output)
        return output


class TransformerEncoderLayer(nn.Module):

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation='relu',
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(
            q,
            k,
            value=src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(
            q,
            k,
            value=src2,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
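

# Usage sketch (illustrative; shapes are assumptions). nn.MultiheadAttention
# is used without batch_first, so inputs are sequence-first, i.e. (L, B, C):
#
#   layer = TransformerEncoderLayer(d_model=256, nhead=8, normalize_before=True)
#   encoder = TransformerEncoder(layer, num_layers=6, norm=nn.LayerNorm(256))
#   src = torch.randn(1024, 2, 256)      # H*W flattened tokens, batch 2
#   pos = torch.randn(1024, 2, 256)      # e.g. flattened PositionEmbeddingSine output
#   memory = encoder(src, pos=pos)       # (1024, 2, 256)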


class SelfAttentionLayer(nn.Module):

    def __init__(self,
                 d_model,
                 nhead,
                 dropout=0.0,
                 activation='relu',
                 normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self,
                     tgt,
                     tgt_mask: Optional[Tensor] = None,
                     tgt_key_padding_mask: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(
            q,
            k,
            value=tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm(tgt)
        return tgt

    def forward_pre(self,
                    tgt,
                    tgt_mask: Optional[Tensor] = None,
                    tgt_key_padding_mask: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(
            q,
            k,
            value=tgt2,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout(tgt2)
        return tgt

    def forward(self,
                tgt,
                tgt_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, tgt_mask, tgt_key_padding_mask,
                                    query_pos)
        return self.forward_post(tgt, tgt_mask, tgt_key_padding_mask,
                                 query_pos)
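

# Usage sketch (illustrative; 100 queries and 256 channels are assumptions):
#
#   self_attn = SelfAttentionLayer(d_model=256, nhead=8)
#   queries = torch.randn(100, 2, 256)       # (num_queries, B, C)
#   query_pos = torch.randn(100, 2, 256)     # e.g. learned query embeddings
#   queries = self_attn(queries, query_pos=query_pos)   # (100, 2, 256)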


class CrossAttentionLayer(nn.Module):

    def __init__(self,
                 d_model,
                 nhead,
                 dropout=0.0,
                 activation='relu',
                 normalize_before=False):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(
            d_model, nhead, dropout=dropout)

        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self,
                     tgt,
                     memory,
                     memory_mask: Optional[Tensor] = None,
                     memory_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None,
                     query_pos: Optional[Tensor] = None):
        tgt2, avg_attn = self.multihead_attn(
            query=self.with_pos_embed(tgt, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm(tgt)
        return tgt, avg_attn

    def forward_pre(self,
                    tgt,
                    memory,
                    memory_mask: Optional[Tensor] = None,
                    memory_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None,
                    query_pos: Optional[Tensor] = None):
        tgt2 = self.norm(tgt)
        tgt2, avg_attn = self.multihead_attn(
            query=self.with_pos_embed(tgt2, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask)
        tgt = tgt + self.dropout(tgt2)
        return tgt, avg_attn

    def forward(self,
                tgt,
                memory,
                memory_mask: Optional[Tensor] = None,
                memory_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None,
                query_pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(tgt, memory, memory_mask,
                                    memory_key_padding_mask, pos, query_pos)
        return self.forward_post(tgt, memory, memory_mask,
                                 memory_key_padding_mask, pos, query_pos)
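

# Usage sketch (illustrative; shapes are assumptions). Unlike the other
# layers, this one also returns the head-averaged attention weights:
#
#   cross_attn = CrossAttentionLayer(d_model=256, nhead=8)
#   queries = torch.randn(100, 2, 256)       # (num_queries, B, C)
#   memory = torch.randn(1024, 2, 256)       # flattened image features (H*W, B, C)
#   queries, avg_attn = cross_attn(queries, memory)
#   # queries: (100, 2, 256), avg_attn: (2, 100, 1024)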


class FFNLayer(nn.Module):

    def __init__(self,
                 d_model,
                 dim_feedforward=2048,
                 dropout=0.0,
                 activation='relu',
                 normalize_before=False):
        super().__init__()
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm = nn.LayerNorm(d_model)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

        self._reset_parameters()

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(self, tgt):
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm(tgt)
        return tgt

    def forward_pre(self, tgt):
        tgt2 = self.norm(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout(tgt2)
        return tgt

    def forward(self, tgt):
        if self.normalize_before:
            return self.forward_pre(tgt)
        return self.forward_post(tgt)
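

# Usage sketch (illustrative):
#
#   ffn = FFNLayer(d_model=256, dim_feedforward=2048)
#   queries = ffn(torch.randn(100, 2, 256))  # residual MLP block, shape preserved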


class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
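

# Usage sketch (illustrative; a 3-layer head is an assumption). ReLU is
# applied after every layer except the last:
#
#   mlp = MLP(input_dim=256, hidden_dim=256, output_dim=256, num_layers=3)
#   out = mlp(torch.randn(2, 100, 256))      # (2, 100, 256)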


def get_norm(norm, out_channels):
    """
    Args:
        norm (str or callable): either one of 'BN' or 'GN', or a callable
            that takes a channel number and returns the normalization layer
            as a nn.Module.
        out_channels (int): number of channels of the input to normalize.

    Returns:
        nn.Module or None: the normalization layer
    """
    if norm is None:
        return None
    if isinstance(norm, str):
        if len(norm) == 0:
            return None
        norm = {
            'BN': nn.BatchNorm2d,
            'GN': lambda channels: nn.GroupNorm(32, channels),
        }[norm]
    return norm(out_channels)
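

# Usage sketch (illustrative):
#
#   get_norm('BN', 256)   # nn.BatchNorm2d(256)
#   get_norm('GN', 256)   # nn.GroupNorm(32, 256)
#   get_norm('', 256)     # None (empty string disables normalization)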


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


def _get_activation_fn(activation):
    """Return an activation function given a string."""
    if activation == 'relu':
        return F.relu
    if activation == 'gelu':
        return F.gelu
    if activation == 'glu':
        return F.glu
    raise RuntimeError(
        f'activation should be relu/gelu/glu, not {activation}.')
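

if __name__ == '__main__':
    # Minimal smoke-test sketch (not part of the original file). The shapes
    # and hyper-parameters below are assumptions chosen for illustration.
    d_model, nhead, batch = 256, 8, 2
    feat = torch.randn(batch, d_model, 16, 16)    # (B, C, H, W)

    # Sine positional encoding over the feature map, then flatten to (L, B, C).
    pos_map = PositionEmbeddingSine(d_model // 2, normalize=True)(feat)
    src = feat.flatten(2).permute(2, 0, 1)        # (H*W, B, C)
    pos = pos_map.flatten(2).permute(2, 0, 1)     # (H*W, B, C)

    # Encode the flattened features.
    encoder = TransformerEncoder(
        TransformerEncoderLayer(d_model, nhead), num_layers=2)
    memory = encoder(src, pos=pos)                # (H*W, B, C)

    # One decoder-style round: cross-attention, self-attention, FFN.
    queries = torch.randn(100, batch, d_model)    # (num_queries, B, C)
    queries, avg_attn = CrossAttentionLayer(d_model, nhead)(
        queries, memory, pos=pos)
    queries = SelfAttentionLayer(d_model, nhead)(queries)
    queries = FFNLayer(d_model)(queries)
    print(queries.shape, avg_attn.shape)          # (100, 2, 256), (2, 100, 256)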