\[y = w \odot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + b\]
\[ \mu = \frac{1}{m} \sum_{i=1}^{m} x_i, \quad \sigma = \sqrt{\frac{1}{m} \sum_{i=1}^{m} (x_i - \mu)^2} \]
\(\mu\) is the mean and \(\sigma^2\) is the variance.
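As a concrete example (illustrative numbers, with \(\epsilon \approx 0\)), take a single vector \(x = (1, 2, 3)\) with \(w = 1\) and \(b = 0\):

\[ \mu = 2, \quad \sigma^2 = \frac{(1-2)^2 + (2-2)^2 + (3-2)^2}{3} = \frac{2}{3}, \quad y = \frac{x - 2}{\sqrt{2/3}} \approx (-1.22,\ 0,\ 1.22) \]

Before the scale \(w\) and shift \(b\) are applied, the normalized values always have zero mean and unit variance.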
Layer Norm is mainly used to improve the training stability and convergence speed of neural networks.
// input/output shapes
int B, int T, int C
// input
const float *inp
// mean and reciprocal of the standard deviation (rstd); these buffers are
// passed in uninitialized, filled during the forward pass, and the cached
// values are reused in the backward pass
float *mean, float *rstd
// trainable weight and bias parameters
const float *weight, const float *bias
// output
float *out
void layernorm_forward_cpu(float* out, float* mean, float* rstd,
                           const float* inp, const float* weight, const float* bias,
                           int B, int T, int C) {
    float eps = 1e-5f;
    for (int b = 0; b < B; b++) {
        for (int t = 0; t < T; t++) {
            // seek to the input position inp[b,t,:]
            const float* x = inp + b * T * C + t * C;
            // calculate the mean
            float m = 0.0f;
            for (int i = 0; i < C; i++) {
                m += x[i];
            }
            m = m / C;
            // calculate the variance (without any bias correction)
            float v = 0.0f;
            for (int i = 0; i < C; i++) {
                float xshift = x[i] - m;
                v += xshift * xshift;
            }
            v = v / C;
            // calculate the rstd
            float s = 1.0f / sqrtf(v + eps);
            // seek to the output position in out[b,t,:]
            float* out_bt = out + b * T * C + t * C;
            for (int i = 0; i < C; i++) {
                float n = (s * (x[i] - m)); // normalized output
                float o = n * weight[i] + bias[i]; // scale and shift it
                out_bt[i] = o; // write
            }
            // cache the mean and rstd for the backward pass later
            mean[b * T + t] = m;
            rstd[b * T + t] = s;
        }
    }
}
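A minimal usage sketch of the CPU version follows; the sizes, random input, and identity weight/bias here are illustrative assumptions, not part of the original:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    int B = 2, T = 4, C = 8; // hypothetical sizes
    float* inp    = (float*)malloc(B * T * C * sizeof(float));
    float* out    = (float*)malloc(B * T * C * sizeof(float));
    float* mean   = (float*)malloc(B * T * sizeof(float));
    float* rstd   = (float*)malloc(B * T * sizeof(float));
    float* weight = (float*)malloc(C * sizeof(float));
    float* bias   = (float*)malloc(C * sizeof(float));
    for (int i = 0; i < B * T * C; i++) inp[i] = (float)rand() / RAND_MAX;
    for (int i = 0; i < C; i++) { weight[i] = 1.0f; bias[i] = 0.0f; }

    layernorm_forward_cpu(out, mean, rstd, inp, weight, bias, B, T, C);

    // with w = 1 and b = 0, each row of out should have ~zero mean and ~unit variance
    printf("out[0] = %f, mean[0] = %f, rstd[0] = %f\n", out[0], mean[0], rstd[0]);

    free(inp); free(out); free(mean); free(rstd); free(weight); free(bias);
    return 0;
}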
Since all of LayerNorm's computation happens along the C dimension, the work can be parallelized over the [B, T] dimensions with CUDA; the key parts of the implementation are shown below. Here block_size is a tunable parameter and grid_size is N/block_size rounded up (with N = B * T), so the total thread count satisfies grid_size * block_size >= N. layernorm_forward_kernel1 is essentially the inner loop of the CPU implementation.
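The launcher uses a ceil_div helper for the rounded-up division; if it is not already defined elsewhere in the codebase, a minimal sketch (an assumption, not shown in the original) is:

// integer division rounded up, so that grid_size * block_size >= N
static inline int ceil_div(int a, int b) {
    return (a + b - 1) / b;
}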
__global__ void layernorm_forward_kernel1(float* out, float* mean, float* rstd,
                                          const float* inp, const float* weight, const float* bias,
                                          int N, int C) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) return;
    // seek to the input position inp[idx,:]
    const float* x = inp + idx * C;
    float* out_x = out + idx * C;
    // compute mean
    float m = 0.0f;
    for (int i = 0; i < C; i++) {
        m += x[i];
    }
    m /= C;
    // compute variance
    float v = 0.0f;
    for (int i = 0; i < C; i++) {
        float xshift = x[i] - m;
        v += xshift * xshift;
    }
    v /= C;
    // compute rstd
    float s = rsqrtf(v + 1e-5f);
    // normalize, scale, and shift
    for (int i = 0; i < C; i++) {
        float n = (x[i] - m) * s;
        out_x[i] = n * weight[i] + bias[i];
    }
    // store mean and rstd for the backward pass
    mean[idx] = m;
    rstd[idx] = s;
}
void layernorm_forward1(float* out, float* mean, float* rstd,
                        const float* inp, const float* weight, const float* bias,
                        int B, int T, int C,
                        const int block_size) {
    const int N = B * T;
    const int grid_size = ceil_div(N, block_size);
    layernorm_forward_kernel1<<<grid_size, block_size>>>(out, mean, rstd, inp, weight, bias, N, C);
    cudaCheck(cudaGetLastError());
}
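A usage sketch for the CUDA version, reusing the host buffers from the CPU sketch above; the device buffer setup and the block_size of 256 are assumptions, and cudaCheck is assumed to be the codebase's error-checking macro, as used in the launcher:

// hypothetical driver: allocate device buffers, copy inputs, launch, copy results back
float *d_out, *d_mean, *d_rstd, *d_inp, *d_weight, *d_bias;
cudaCheck(cudaMalloc(&d_out,    B * T * C * sizeof(float)));
cudaCheck(cudaMalloc(&d_mean,   B * T * sizeof(float)));
cudaCheck(cudaMalloc(&d_rstd,   B * T * sizeof(float)));
cudaCheck(cudaMalloc(&d_inp,    B * T * C * sizeof(float)));
cudaCheck(cudaMalloc(&d_weight, C * sizeof(float)));
cudaCheck(cudaMalloc(&d_bias,   C * sizeof(float)));
cudaCheck(cudaMemcpy(d_inp,    inp,    B * T * C * sizeof(float), cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_weight, weight, C * sizeof(float), cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_bias,   bias,   C * sizeof(float), cudaMemcpyHostToDevice));

layernorm_forward1(d_out, d_mean, d_rstd, d_inp, d_weight, d_bias, B, T, C, 256);

cudaCheck(cudaMemcpy(out, d_out, B * T * C * sizeof(float), cudaMemcpyDeviceToHost));

Note that in this kernel each thread processes one full row of C elements serially, mirroring the CPU inner loop one-to-one; this keeps the code simple, but the per-row mean and variance reductions themselves are not parallelized.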