# Copyright (c) 2024, Tri Dao.
# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
- import math
- import torch
- import torch.nn.functional as F
- import triton
- import triton.language as tl
- from einops import rearrange
def rms_norm_ref(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True):
    """Pure-PyTorch reference for (optionally gated, optionally grouped) RMSNorm.

    Args:
        x: input tensor; normalization runs over its last dimension.
        weight: scale applied after normalization (always converted to float32).
        bias: optional shift added after scaling (converted to float32), or None.
        z: optional gate tensor. With ``norm_before_gate`` the result is
            ``norm(x) * silu(z)``; otherwise it is ``norm(x * silu(z))``.
        eps: added to the mean of squares before the square root.
        group_size: if given, the last dimension is split into groups of this
            many elements and each group is normalized independently.
        norm_before_gate: selects where the gate is applied (see ``z``).
        upcast: if True, ``x`` and ``z`` are converted to float32 first.

    Returns:
        The normalized tensor, cast back to the dtype ``x`` had on entry.
    """
    dtype = x.dtype
    weight = weight.float()
    bias = bias.float() if bias is not None else None
    if upcast:
        x = x.float()
        z = z.float() if z is not None else z
    if z is not None and not norm_before_gate:
        # Gate first, then normalize the gated activations.
        x = x * F.silu(z)
    if group_size is None:
        rstd = 1 / torch.sqrt(x.square().mean(dim=-1, keepdim=True) + eps)
        x_hat = x * rstd
    else:
        # Statistics are computed per group of `group_size` trailing elements.
        x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
        rstd = 1 / torch.sqrt(x_group.square().mean(dim=-1, keepdim=True) + eps)
        x_hat = rearrange(x_group * rstd, "... g d -> ... (g d)")
    # Shared affine step; same operation order as before: (x * rstd) * weight + bias.
    out = x_hat * weight
    if bias is not None:
        out = out + bias
    if z is not None and norm_before_gate:
        out *= F.silu(z)
    return out.to(dtype)
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
@triton.jit
def _layer_norm_fwd_1pass_kernel(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    Z,  # pointer to the other branch
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_z_row,
    M,  # number of rows in X
    N,  # number of columns handled by one program (one group)
    eps,  # epsilon to avoid division by zero
    BLOCK_N: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_Z: tl.constexpr,
    NORM_BEFORE_GATE: tl.constexpr,
    IS_RMS_NORM: tl.constexpr,
):
    # One program normalizes the N columns of group `program_id(1)` in row
    # `program_id(0)`. Statistics are written at index `group * M + row`.
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    group = tl.program_id(1)
    X += row * stride_x_row + group * N
    Y += row * stride_y_row + group * N
    if HAS_Z:
        Z += row * stride_z_row + group * N
    if not IS_RMS_NORM:
        Mean += group * M
    Rstd += group * M
    W += group * N
    if HAS_BIAS:
        B += group * N
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32)
    if HAS_Z and not NORM_BEFORE_GATE:
        # Padded lanes (cols >= N) must be a defined value: x feeds the
        # unmasked tl.sum() below, and 0 * <unspecified> could otherwise be NaN.
        z = tl.load(Z + cols, mask=cols < N, other=0.).to(tl.float32)
        x *= z * tl.sigmoid(z)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        # Zero the padded lanes so (0 - mean) does not pollute the variance.
        xbar = tl.where(cols < N, x - mean, 0.)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        xbar = tl.where(cols < N, x, 0.)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation
    mask = cols < N
    w = tl.load(W + cols, mask=mask).to(tl.float32)
    if HAS_BIAS:
        b = tl.load(B + cols, mask=mask).to(tl.float32)
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
    y = x_hat * w + b if HAS_BIAS else x_hat * w
    if HAS_Z and NORM_BEFORE_GATE:
        # Gate after normalization: y * silu(z).
        z = tl.load(Z + cols, mask=mask).to(tl.float32)
        y *= z * tl.sigmoid(z)
    # Write output (padded lanes are discarded by the mask)
    tl.store(Y + cols, y, mask=mask)
def _layer_norm_fwd(x, weight, bias, eps, z=None, out=None, group_size=None, norm_before_gate=True, is_rms_norm=False):
    """Validate the operands and launch the fused forward kernel on a 2D input.

    Args:
        x: 2D input of shape (M, N) with unit stride along the last dimension.
        weight: 1D scale of shape (N,).
        bias: optional 1D shift of shape (N,), or None.
        eps: epsilon forwarded to the kernel.
        z: optional gate of shape (M, N), or None.
        out: optional preallocated output with the same shape as ``x``.
        group_size: width of each normalization group; defaults to N.
        norm_before_gate: forwarded to the kernel as NORM_BEFORE_GATE.
        is_rms_norm: forwarded to the kernel as IS_RMS_NORM.

    Returns:
        ``(out, mean, rstd)``. ``mean`` is None when ``is_rms_norm`` is set;
        otherwise ``mean`` and ``rstd`` are float32 vectors of length
        ``ngroups * M``.

    Raises:
        RuntimeError: if ``group_size`` exceeds ``65536 // x.element_size()``.
    """
    M, N = x.shape
    group_size = N if group_size is None else group_size
    assert N % group_size == 0
    ngroups = N // group_size

    # The kernel addresses columns as `ptr + cols`, so the last dimension of
    # every operand has to be unit-stride.
    assert x.stride(-1) == 1
    if z is not None:
        assert z.stride(-1) == 1
        assert z.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)

    # Use the caller's output buffer when given, otherwise allocate one.
    if out is None:
        out = torch.empty_like(x)
    else:
        assert out.shape == x.shape
    assert out.stride(-1) == 1

    stats_shape = (ngroups * M, )
    mean = None if is_rms_norm else torch.empty(stats_shape, dtype=torch.float32, device=x.device)
    rstd = torch.empty(stats_shape, dtype=torch.float32, device=x.device)

    # Less than 64KB per feature: enqueue fused kernel
    max_fused_size = 65536 // x.element_size()
    block_n = min(max_fused_size, triton.next_power_of_2(group_size))
    if group_size > block_n:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    # heuristics for number of warps
    warps = min(max(block_n // 256, 1), 8)

    z_row_stride = 0 if z is None else z.stride(0)
    launch_grid = (M, ngroups)
    with torch.cuda.device(x.device.index):
        _layer_norm_fwd_1pass_kernel[launch_grid](
            x, out, weight, bias, z, mean, rstd,
            x.stride(0), out.stride(0), z_row_stride,
            M, group_size, eps,
            BLOCK_N=block_n,
            NORM_BEFORE_GATE=norm_before_gate,
            IS_RMS_NORM=is_rms_norm,
            num_warps=warps,
        )
    return out, mean, rstd
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
@triton.jit
def _layer_norm_bwd_kernel(
    X,  # pointer to the input
    W,  # pointer to the weights
    B,  # pointer to the biases
    Z,  # pointer to the other branch
    Y,  # pointer to the output to be recomputed
    DY,  # pointer to the output gradient
    DX,  # pointer to the input gradient
    DW,  # pointer to the partial sum of weights gradient
    DB,  # pointer to the partial sum of biases gradient
    DZ,  # pointer to the other branch
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_z_row,
    stride_y_row,
    stride_dy_row,
    stride_dx_row,
    stride_dz_row,
    stride_dw_row,
    stride_db_row,
    M,  # number of rows in X
    N,  # number of columns handled by one program (one group)
    eps,  # epsilon to avoid division by zero (not read by this kernel)
    rows_per_program,
    NORM_BEFORE_GATE: tl.constexpr,
    IS_RMS_NORM: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    HAS_Z: tl.constexpr,
    RECOMPUTE_OUTPUT: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    # Each program walks `rows_per_program` consecutive rows of one group,
    # writes dx (and dz / recomputed y) per row, and accumulates dw/db over
    # those rows; the partial sums land in row `row_block_id` of DW/DB.
    # Map the program id to the elements of X, DX, and DY it should compute.
    row_block_id = tl.program_id(0)
    group = tl.program_id(1)
    row_start = row_block_id * rows_per_program
    cols = tl.arange(0, BLOCK_N)
    mask = cols < N
    X += row_start * stride_x_row + group * N
    if HAS_Z:
        Z += row_start * stride_z_row + group * N
        DZ += row_start * stride_dz_row + group * N
    DY += row_start * stride_dy_row + group * N
    DX += row_start * stride_dx_row + group * N
    if RECOMPUTE_OUTPUT:
        Y += row_start * stride_y_row + group * N
    if not IS_RMS_NORM:
        Mean += group * M
    Rstd += group * M
    W += group * N
    # Padded lanes of w must be a defined value: wdy = w * dy feeds the
    # unmasked reductions for c1/c2 below, and 0 * <unspecified> could be NaN.
    w = tl.load(W + cols, mask=mask, other=0.).to(tl.float32)
    if (RECOMPUTE_OUTPUT or HAS_Z) and HAS_BIAS:
        B += group * N
        b = tl.load(B + cols, mask=mask, other=0.).to(tl.float32)
    dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAS_BIAS:
        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
    # The last row block may be short (or empty) when M is not a multiple of
    # rows_per_program.
    row_end = min((row_block_id + 1) * rows_per_program, M)
    for row in range(row_start, row_end):
        # Load data to SRAM
        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
        dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
        if not IS_RMS_NORM:
            mean = tl.load(Mean + row)
        if HAS_Z and not NORM_BEFORE_GATE:
            # Forward normalized x * silu(z); rebuild that gated input here.
            z = tl.load(Z + cols, mask=mask, other=0.).to(tl.float32)
            x_og = x
            x = x_og * z * tl.sigmoid(z)
        rstd = tl.load(Rstd + row)
        # Compute dx
        xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
        xhat = tl.where(mask, xhat, 0.)
        if HAS_Z and NORM_BEFORE_GATE:
            # Forward was y * silu(z): emit dz and fold silu(z) into dy.
            z = tl.load(Z + cols, mask=mask, other=0.).to(tl.float32)
            z_sigmoid = tl.sigmoid(z)
            y = xhat * w + b if HAS_BIAS else xhat * w
            if RECOMPUTE_OUTPUT:
                tl.store(Y + cols, y * z * z_sigmoid, mask=mask)
            # d silu(z) / dz = sigmoid(z) * (1 + z * (1 - sigmoid(z)))
            dz = dy * y * z_sigmoid * (1 + z * (1 - z_sigmoid))
            tl.store(DZ + cols, dz, mask=mask)
            dy *= z * z_sigmoid
        else:
            if RECOMPUTE_OUTPUT:
                y = xhat * w + b if HAS_BIAS else xhat * w
                tl.store(Y + cols, y, mask=mask)
        wdy = w * dy
        c1 = tl.sum(xhat * wdy, axis=0) / N
        if not IS_RMS_NORM:
            c2 = tl.sum(wdy, axis=0) / N
            dx = (wdy - (xhat * c1 + c2)) * rstd
        else:
            dx = (wdy - xhat * c1) * rstd
        dw += dy * xhat
        if HAS_BIAS:
            db += dy
        if HAS_Z and not NORM_BEFORE_GATE:
            # dx so far is the gradient w.r.t. the gated input x_og * silu(z);
            # split it into dz and the gradient w.r.t. x_og.
            z_sigmoid = tl.sigmoid(z)
            dz = dx * x_og * z_sigmoid * (1 + z * (1 - z_sigmoid))
            tl.store(DZ + cols, dz, mask=mask)
            dx *= z * z_sigmoid
        # Write dx
        tl.store(DX + cols, dx, mask=mask)
        # Advance all per-row pointers to the next row.
        X += stride_x_row
        if HAS_Z:
            Z += stride_z_row
            DZ += stride_dz_row
        if RECOMPUTE_OUTPUT:
            Y += stride_y_row
        DY += stride_dy_row
        DX += stride_dx_row
    tl.store(DW + row_block_id * stride_dw_row + group * N + cols, dw, mask=mask)
    if HAS_BIAS:
        tl.store(DB + row_block_id * stride_db_row + group * N + cols, db, mask=mask)
def _layer_norm_bwd(dy, x, weight, bias, eps, mean, rstd, z=None, group_size=None,
                    norm_before_gate=True, is_rms_norm=False, recompute_output=False, dz=None, out=None):
    """Validate the operands, launch the backward kernel and reduce dw/db.

    Args:
        dy: output gradient of shape (M, N), unit stride along the last dim.
        x: 2D forward input of shape (M, N), unit stride along the last dim.
        weight: 1D scale of shape (N,).
        bias: optional 1D shift of shape (N,), or None.
        eps: epsilon forwarded to the kernel.
        mean: per-row means forwarded to the kernel (None is what the forward
            wrapper in this file produces for RMSNorm).
        rstd: per-row reciprocal standard deviations forwarded to the kernel.
        z: optional gate of shape (M, N), or None.
        group_size: width of each normalization group; defaults to N.
        norm_before_gate: forwarded to the kernel as NORM_BEFORE_GATE.
        is_rms_norm: forwarded to the kernel as IS_RMS_NORM.
        recompute_output: if True, the kernel also rewrites the forward output.
        dz: optional preallocated gate gradient; requires ``z``.
        out: optional preallocated buffer for the recomputed output.

    Returns:
        ``(dx, dw, db, dz)``, or ``(dx, dw, db, dz, out)`` when
        ``recompute_output`` is set. ``db`` is None without ``bias`` and
        ``dz`` is None without ``z``.

    Raises:
        RuntimeError: if ``group_size`` exceeds ``65536 // x.element_size()``.
    """
    M, N = x.shape
    if group_size is None:
        group_size = N
    assert N % group_size == 0
    ngroups = N // group_size
    assert x.stride(-1) == 1
    assert dy.stride(-1) == 1
    assert dy.shape == (M, N)
    if z is not None:
        assert z.stride(-1) == 1
        assert z.shape == (M, N)
    assert weight.shape == (N,)
    assert weight.stride(-1) == 1
    if bias is not None:
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    # allocate output
    dx = torch.empty_like(x)
    if dz is not None:
        assert z is not None
        assert dz.shape == z.shape
        assert dz.stride(-1) == 1
    else:
        dz = torch.empty_like(z) if z is not None else None
    if recompute_output:
        if out is None:
            out = torch.empty_like(x)
        assert out.shape == x.shape
        # The kernel writes the recomputed output as `Y + cols`, i.e. with
        # unit column stride; reject buffers laid out differently, matching
        # the check done by the forward wrapper.
        assert out.stride(-1) == 1
    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
    if group_size > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    # heuristics for number of warps
    num_warps = min(max(BLOCK_N // 256, 1), 8)
    sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
    # If group size is small (e.g., 64), we're only using 1 warp. So having just 108 programs
    # would limit the occupancy.
    nrow_groups = math.ceil(sm_count * math.ceil(4 / num_warps) / ngroups)
    # One row of partial dw/db sums per row block; reduced over dim 0 below.
    _dw = torch.empty((nrow_groups, N), dtype=torch.float32, device=weight.device)
    _db = torch.empty((nrow_groups, N), dtype=torch.float32, device=bias.device) if bias is not None else None
    rows_per_program = math.ceil(M / nrow_groups)
    grid = (nrow_groups, ngroups)
    with torch.cuda.device(x.device.index):
        _layer_norm_bwd_kernel[grid](x, weight, bias, z, out if recompute_output else None,
                                     dy, dx, _dw, _db, dz, mean, rstd,
                                     x.stride(0),
                                     z.stride(0) if z is not None else 0,
                                     0 if not recompute_output else out.stride(0),
                                     dy.stride(0), dx.stride(0),
                                     dz.stride(0) if dz is not None else 0,
                                     _dw.stride(0),
                                     _db.stride(0) if _db is not None else 0,
                                     M, group_size, eps,
                                     rows_per_program,
                                     BLOCK_N=BLOCK_N,
                                     NORM_BEFORE_GATE=norm_before_gate,
                                     IS_RMS_NORM=is_rms_norm,
                                     num_warps=num_warps)
    dw = _dw.sum(0).to(weight.dtype)
    db = _db.sum(0).to(bias.dtype) if bias is not None else None
    return (dx, dw, db, dz) if not recompute_output else (dx, dw, db, dz, out)
class LayerNormFn(torch.autograd.Function):
    """Autograd function that dispatches to `_layer_norm_fwd` / `_layer_norm_bwd`."""

    @staticmethod
    def forward(ctx, x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True,
                is_rms_norm=False):
        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
        """
        x_shape_og = x.shape
        # Collapse all leading dimensions so the launcher sees a 2D tensor whose
        # last dimension is unit-stride.
        x = x.reshape(-1, x.shape[-1])
        x = x if x.stride(-1) == 1 else x.contiguous()
        if z is not None:
            assert z.shape == x_shape_og
            z = z.reshape(-1, z.shape[-1])
            z = z if z.stride(-1) == 1 else z.contiguous()
        weight = weight.contiguous()
        bias = bias.contiguous() if bias is not None else bias
        y, mean, rstd = _layer_norm_fwd(
            x, weight, bias, eps,
            z=z,
            group_size=group_size,
            norm_before_gate=norm_before_gate,
            is_rms_norm=is_rms_norm,
        )
        ctx.save_for_backward(x, weight, bias, mean, rstd, z)
        # Non-tensor state needed to replay the configuration in backward.
        ctx.x_shape_og = x_shape_og
        ctx.eps = eps
        ctx.group_size = group_size
        ctx.norm_before_gate = norm_before_gate
        ctx.is_rms_norm = is_rms_norm
        return y.reshape(x_shape_og)

    @staticmethod
    def backward(ctx, dy):
        """Return gradients for (x, weight, bias, z) and None for the four non-tensor arguments."""
        x, weight, bias, mean, rstd, z = ctx.saved_tensors
        dy = dy.reshape(-1, dy.shape[-1])
        dy = dy if dy.stride(-1) == 1 else dy.contiguous()
        assert dy.shape == x.shape
        dx, dw, db, dz = _layer_norm_bwd(
            dy, x, weight, bias, ctx.eps, mean, rstd, z, ctx.group_size,
            ctx.norm_before_gate, ctx.is_rms_norm,
        )
        grad_x = dx.reshape(ctx.x_shape_og)
        grad_z = None if dz is None else dz.reshape(ctx.x_shape_og)
        return grad_x, dw, db, grad_z, None, None, None, None
def layernorm_fn(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, is_rms_norm=False):
    """Apply `LayerNormFn` with the given arguments, passed positionally in the order of `LayerNormFn.forward`."""
    fn_args = (x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm)
    return LayerNormFn.apply(*fn_args)
def rmsnorm_fn(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True):
    """Apply `LayerNormFn` with `is_rms_norm` fixed to True."""
    is_rms_norm = True
    fn_args = (x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm)
    return LayerNormFn.apply(*fn_args)
class LayerNorm(torch.nn.Module):
    """Module wrapper around `layernorm_fn` with a learnable weight and bias."""

    def __init__(self, hidden_size, eps=1e-5, group_size=None, norm_before_gate=True, device=None, dtype=None):
        """If group_size is not None, we do GroupNorm with each group having group_size elements.
        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
        """
        super().__init__()
        self.eps = eps
        param_opts = {"device": device, "dtype": dtype}
        # Allocated uninitialized; reset_parameters() fills in the values.
        self.weight = torch.nn.Parameter(torch.empty(hidden_size, **param_opts))
        self.bias = torch.nn.Parameter(torch.empty(hidden_size, **param_opts))
        self.group_size = group_size
        self.norm_before_gate = norm_before_gate
        self.reset_parameters()

    def reset_parameters(self):
        """Set the weight to ones and the bias to zeros."""
        torch.nn.init.ones_(self.weight)
        torch.nn.init.zeros_(self.bias)

    def forward(self, x, z=None):
        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
        """
        return layernorm_fn(
            x,
            self.weight,
            self.bias,
            z=z,
            group_size=self.group_size,
            eps=self.eps,
            norm_before_gate=self.norm_before_gate,
        )
class RMSNorm(torch.nn.Module):
    """Module wrapper around `rmsnorm_fn` with a learnable weight and no bias."""

    def __init__(self, hidden_size, eps=1e-5, group_size=None, norm_before_gate=True, device=None, dtype=None):
        """If group_size is not None, we do GroupNorm with each group having group_size elements.
        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
        """
        super().__init__()
        self.eps = eps
        param_opts = {"device": device, "dtype": dtype}
        # Allocated uninitialized; reset_parameters() fills in the values.
        self.weight = torch.nn.Parameter(torch.empty(hidden_size, **param_opts))
        # Registered as None so `self.bias` exists but is not a parameter.
        self.register_parameter("bias", None)
        self.group_size = group_size
        self.norm_before_gate = norm_before_gate
        self.reset_parameters()

    def reset_parameters(self):
        """Set the weight to ones."""
        torch.nn.init.ones_(self.weight)

    def forward(self, x, z=None):
        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))
        """
        return rmsnorm_fn(
            x,
            self.weight,
            self.bias,
            z=z,
            eps=self.eps,
            group_size=self.group_size,
            norm_before_gate=self.norm_before_gate,
        )
|