GitHub Repository: ai-forever/sber-swap
Path: blob/main/apex/csrc/layer_norm_cuda_kernel.cu
#include "ATen/ATen.h"
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDAContext.h"
#include <THC/THCDeviceUtils.cuh>

#include <cuda.h>
#include <cuda_runtime.h>

#include "type_shim.h"

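// Single-pass (Welford) update of running statistics: given a new sample `curr`,
// the running mean `mu`, the running sum of squared deviations `sigma2` (not yet
// divided by the count), and the sample count, it applies
//   count  <- count + 1
//   mu     <- mu + (curr - mu) / count
//   sigma2 <- sigma2 + (curr - mu_old) * (curr - mu_new)
// which is numerically more stable than accumulating sum and sum-of-squares.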
template<typename U> __device__
void cuWelfordOnlineSum(
  const U curr,
  U& mu,
  U& sigma2,
  U& count)
{
  count = count + U(1);
  U delta = curr - mu;
  U lmean = mu + delta / count;
  mu = lmean;
  U delta2 = curr - lmean;
  sigma2 = sigma2 + delta * delta2;
}

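// Chan et al. parallel merge of two Welford partial results (A = this thread's
// running statistics, B = another partial result): with nA, nB the two counts
// and delta = muB - muA,
//   mu     <- (nA*muA + nB*muB) / (nA + nB)
//   sigma2 <- sigma2A + sigma2B + delta^2 * nA * nB / (nA + nB)
// The nX > 0 guard avoids a division by zero when both partials are empty.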
template<typename U> __device__
void cuChanOnlineSum(
  const U muB,
  const U sigma2B,
  const U countB,
  U& mu,
  U& sigma2,
  U& count)
{
  U delta = muB - mu;
  U nA = count;
  U nB = countB;
  count = count + countB;
  U nX = count;
  if (nX > U(0)) {
    nA = nA / nX;
    nB = nB / nX;
    mu = nA*mu + nB*muB;
    sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX;
  } else {
    mu = U(0);
    sigma2 = U(0);
  }
}

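// Computes the mean and (biased) variance of row i1 of a contiguous [n1, n2]
// tensor.  Each thread walks the row with a stride of 4*blockDim.x*blockDim.y
// elements, accumulating private Welford statistics; the partials are then
// merged lane-to-lane with warp shuffles and, when blockDim.y > 1, across
// warps through the shared-memory buffer `buf`.  On exit `sigma2` holds the
// sum of squared deviations divided by n2.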
template<typename T, typename U> __device__
void cuWelfordMuSigma2(
  const T* __restrict__ vals,
  const int n1,
  const int n2,
  const int i1,
  U& mu,
  U& sigma2,
  U* buf)
{
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  U count = U(0);
  mu = U(0);
  sigma2 = U(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const T* lvals = vals + i1*n2;
    int l = 4*thrx;
    for (; l+3 < n2; l+=4*numx) {
      for (int k = 0; k < 4; ++k) {
        U curr = static_cast<U>(lvals[l+k]);
        cuWelfordOnlineSum<U>(curr,mu,sigma2,count);
      }
    }
    for (; l < n2; ++l) {
      U curr = static_cast<U>(lvals[l]);
      cuWelfordOnlineSum<U>(curr,mu,sigma2,count);
    }
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x+(1<<l))&31;
      U muB = WARP_SHFL(mu, srcLaneB);
      U countB = WARP_SHFL(count, srcLaneB);
      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
      cuChanOnlineSum<U>(muB,sigma2B,countB,mu,sigma2,count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      U* ubuf = (U*)buf;
      U* ibuf = (U*)(ubuf + blockDim.y);
      for (int offset = blockDim.y/2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2*wrt_y] = mu;
          ubuf[2*wrt_y+1] = sigma2;
          ibuf[wrt_y] = count;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          U muB = ubuf[2*threadIdx.y];
          U sigma2B = ubuf[2*threadIdx.y+1];
          U countB = ibuf[threadIdx.y];
          cuChanOnlineSum<U>(muB,sigma2B,countB,mu,sigma2,count);
        }
        __syncthreads();
      }
      // threadIdx.x == 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        ubuf[0] = mu;
        ubuf[1] = sigma2;
      }
      __syncthreads();
      mu = ubuf[0];
      sigma2 = ubuf[1]/U(n2);
      // don't care about final value of count, we know count == n2
    } else {
      mu = WARP_SHFL(mu, 0);
      sigma2 = WARP_SHFL(sigma2/U(n2), 0);
    }
  }
}

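// Specialization for at::Half inputs: statistics are accumulated in float.
// The row pointer is first brought to 32-bit alignment (thread 0 consumes the
// leading element if needed), after which pairs of halves are loaded as
// __half2 and converted with __half22float2, so each thread processes 8
// elements per iteration using vectorized loads.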
template<> __device__
void cuWelfordMuSigma2(
  const at::Half* __restrict__ vals,
  const int n1,
  const int n2,
  const int i1,
  float& mu,
  float& sigma2,
  float* buf)
{
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  float count = 0.0f;
  mu = float(0);
  sigma2 = float(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const at::Half* lvals = vals + i1*n2;
    int l = 8*thrx;
    if ((((size_t)lvals)&3) != 0) {
      // 16 bit alignment
      // first thread consumes first point
      if (thrx == 0) {
        float curr = static_cast<float>(lvals[0]);
        cuWelfordOnlineSum(curr,mu,sigma2,count);
      }
      ++l;
    }
    // at this point, lvals[l] are 32 bit aligned for all threads.
    for (; l+7 < n2; l+=8*numx) {
      for (int k = 0; k < 8; k+=2) {
        float2 curr = __half22float2(*((__half2*)(lvals+l+k)));
        cuWelfordOnlineSum(curr.x,mu,sigma2,count);
        cuWelfordOnlineSum(curr.y,mu,sigma2,count);
      }
    }
    for (; l < n2; ++l) {
      float curr = static_cast<float>(lvals[l]);
      cuWelfordOnlineSum(curr,mu,sigma2,count);
    }
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x+(1<<l))&31;
      float muB = WARP_SHFL(mu, srcLaneB);
      float countB = WARP_SHFL(count, srcLaneB);
      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
      cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      float* ubuf = (float*)buf;
      float* ibuf = (float*)(ubuf + blockDim.y);
      for (int offset = blockDim.y/2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2*wrt_y] = mu;
          ubuf[2*wrt_y+1] = sigma2;
          ibuf[wrt_y] = count;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          float muB = ubuf[2*threadIdx.y];
          float sigma2B = ubuf[2*threadIdx.y+1];
          float countB = ibuf[threadIdx.y];
          cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count);
        }
        __syncthreads();
      }
      // threadIdx.x == 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        ubuf[0] = mu;
        ubuf[1] = sigma2;
      }
      __syncthreads();
      mu = ubuf[0];
      sigma2 = ubuf[1]/float(n2);
      // don't care about final value of count, we know count == n2
    } else {
      mu = WARP_SHFL(mu, 0);
      sigma2 = WARP_SHFL(sigma2/float(n2), 0);
    }
  }
}

template<typename U> U rsqrt(U v) {
  return U(1) / sqrt(v);
}
template<> float rsqrt(float v) {
  return rsqrtf(v);
}
template<> double rsqrt(double v) {
  return rsqrt(v);
}

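// Dynamic shared memory is accessed through a type-specialized SharedMemory
// helper because a templated kernel cannot simply declare
// `extern __shared__ T buf[]` for every element type; the per-type
// specializations below give float and double each their own named extern
// array (half statistics are accumulated in float, so no half specialization
// is needed).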
namespace {
// This is the un-specialized struct.  Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
//  template <typename T>
//  struct SharedMemory
//  {
//      // Ensure that we won't compile any un-specialized types
//      __device__ T *getPointer()
//      {
//          extern __device__ void error(void);
//          error();
//          return NULL;
//      }
//  };
// https://github.com/NVIDIA/apex/issues/246
template <typename T>
struct SharedMemory;

template <>
struct SharedMemory <float>
{
  __device__ float *getPointer()
  {
    extern __shared__ float s_float[];
    return s_float;
  }
};

template <>
struct SharedMemory <double>
{
  __device__ double *getPointer()
  {
    extern __shared__ double s_double[];
    return s_double;
  }
};
}

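// Forward kernel: for each row i1 (blockIdx.y strides over the n1 rows), the
// block computes mu and sigma2 with cuWelfordMuSigma2 and writes
//   output[i1, i2] = gamma[i2] * (vals[i1, i2] - mu) * rsqrt(sigma2 + epsilon) + beta[i2]
// (or just the normalized value when gamma/beta are NULL).  The per-row mean
// and inverse standard deviation are saved to `mean` and `invvar` for the
// backward pass.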
template<typename T, typename U> __global__
void cuApplyLayerNorm(
  T* __restrict__ output_vals,
  U* __restrict__ mean,
  U* __restrict__ invvar,
  const T* __restrict__ vals,
  const int n1,
  const int n2,
  const U epsilon,
  const T* __restrict__ gamma,
  const T* __restrict__ beta
  )
{
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensors are contiguous
  //
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu,sigma2;
    cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf);
    const T* lvals = vals + i1*n2;
    T* ovals = output_vals + i1*n2;
    U c_invvar = rsqrt(sigma2 + epsilon);
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != NULL && beta != NULL) {
      for (int i = thrx; i < n2; i+=numx) {
        U curr = static_cast<U>(lvals[i]);
        ovals[i] = gamma[i] * static_cast<T>(c_invvar * (curr - mu)) + beta[i];
      }
    } else {
      for (int i = thrx; i < n2; i+=numx) {
        U curr = static_cast<U>(lvals[i]);
        ovals[i] = static_cast<T>(c_invvar * (curr - mu));
      }
    }
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      mean[i1] = mu;
      invvar[i1] = c_invvar;
    }
  }
}

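// Helpers for the gamma/beta backward pass: for its assigned row i1, each
// thread stages blockDim.y consecutive elements into the shared-memory warp
// buffers,
//   warp_buf1 <- dout[i1, i2]                                              (for grad_beta)
//   warp_buf2 <- dout[i1, i2] * (input[i1, i2] - mean[i1]) * invvar[i1]    (for grad_gamma)
// cuLoadWriteStridedInputs initializes the buffers (zero-padding out-of-range
// elements); cuLoadAddStridedInputs accumulates subsequent tiles of rows.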
template<typename T, typename U> __device__
void cuLoadWriteStridedInputs(
  const int i1_block,
  const int thr_load_row_off,
  const int thr_load_col_off,
  const int i2_off,
  const int row_stride,
  U* warp_buf1,
  U* warp_buf2,
  const T* input,
  const T* dout,
  const int i1_end,
  const int n2,
  const U* __restrict__ mean,
  const U* __restrict__ invvar
  )
{
  int i1 = i1_block+thr_load_row_off;
  if (i1 < i1_end) {
    U curr_mean = mean[i1];
    U curr_invvar = invvar[i1];
    for (int k = 0; k < blockDim.y; ++k) {
      int i2 = i2_off + k;
      int load_idx = i1*n2+i2;
      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
      if (i2<n2) {
        U curr_input = static_cast<U>(input[load_idx]);
        U curr_dout = static_cast<U>(dout[load_idx]);
        warp_buf1[write_idx] = curr_dout;
        warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_invvar;
      } else {
        warp_buf1[write_idx] = U(0);
        warp_buf2[write_idx] = U(0);
      }
    }
  } else {
    for (int k = 0; k < blockDim.y; ++k) {
      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
      warp_buf1[write_idx] = U(0);
      warp_buf2[write_idx] = U(0);
    }
  }
}

template<typename T, typename U> __device__
void cuLoadAddStridedInputs(
  const int i1_block,
  const int thr_load_row_off,
  const int thr_load_col_off,
  const int i2_off,
  const int row_stride,
  U* warp_buf1,
  U* warp_buf2,
  const T* input,
  const T* dout,
  const int i1_end,
  const int n2,
  const U* __restrict__ mean,
  const U* __restrict__ invvar
  )
{
  int i1 = i1_block+thr_load_row_off;
  if (i1 < i1_end) {
    U curr_mean = mean[i1];
    U curr_invvar = invvar[i1];
    for (int k = 0; k < blockDim.y; ++k) {
      int i2 = i2_off + k;
      int load_idx = i1*n2+i2;
      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
      if (i2<n2) {
        U curr_input = static_cast<U>(input[load_idx]);
        U curr_dout = static_cast<U>(dout[load_idx]);
        warp_buf1[write_idx] += curr_dout;
        warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_invvar;
      }
    }
  }
}

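// Stage one of the weight gradients: each block reduces its assigned slice of
// rows and writes one partial sum per column i2,
//   part_grad_beta[blockIdx.y, i2]  = sum over assigned i1 of dout[i1, i2]
//   part_grad_gamma[blockIdx.y, i2] = sum over assigned i1 of dout[i1, i2] * (input[i1, i2] - mean[i1]) * invvar[i1]
// row_stride = blockDim.x + 1 pads the shared-memory rows to reduce bank conflicts.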
template<typename T, typename U> __global__
void cuComputePartGradGammaBeta(
  const T* __restrict__ dout,
  const T* __restrict__ input,
  const int n1,
  const int n2,
  const U* __restrict__ mean,
  const U* __restrict__ invvar,
  U epsilon,
  U* part_grad_gamma,
  U* part_grad_beta)
{
  const int numsegs_n1 = (n1+blockDim.y*blockDim.y-1) / (blockDim.y*blockDim.y);
  const int segs_per_block = (numsegs_n1 + gridDim.y - 1) / gridDim.y;
  const int i1_beg = blockIdx.y * segs_per_block * blockDim.y*blockDim.y;
  const int i1_beg_plus_one = (blockIdx.y+1) * segs_per_block * blockDim.y*blockDim.y;
  const int i1_end = i1_beg_plus_one < n1 ? i1_beg_plus_one : n1;
  const int row_stride = blockDim.x+1;
  const int thr_load_col_off = (threadIdx.x*blockDim.y)&(blockDim.x-1);
  const int thr_load_row_off = (threadIdx.x*blockDim.y)/blockDim.x + threadIdx.y*blockDim.y;
  const int i2_off = blockIdx.x * blockDim.x + thr_load_col_off;
  SharedMemory<U> shared;
  U* buf = shared.getPointer(); // buf has at least blockDim.x * blockDim.y * blockDim.y + (blockDim.y - 1)*(blockDim.x/blockDim.y) elements
  U* warp_buf1 = (U*)buf;
  U* warp_buf2 = warp_buf1 + blockDim.y * blockDim.y * row_stride;
  // compute partial sums from strided inputs
  // do this to increase number of loads in flight
  cuLoadWriteStridedInputs(i1_beg,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar);
  for (int i1_block = i1_beg+blockDim.y*blockDim.y; i1_block < i1_end; i1_block+=blockDim.y*blockDim.y) {
    cuLoadAddStridedInputs(i1_block,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar);
  }
  __syncthreads();
  // inter-warp reductions
  // sum within each warp
  U acc1 = U(0);
  U acc2 = U(0);
  for (int k = 0; k < blockDim.y; ++k) {
    int row1 = threadIdx.y + k*blockDim.y;
    int idx1 = row1*row_stride + threadIdx.x;
    acc1 += warp_buf1[idx1];
    acc2 += warp_buf2[idx1];
  }
  warp_buf1[threadIdx.y*row_stride+threadIdx.x] = acc1;
  warp_buf2[threadIdx.y*row_stride+threadIdx.x] = acc2;
  __syncthreads();
  // sum all warps
  for (int offset = blockDim.y/2; offset > 1; offset /= 2) {
    if (threadIdx.y < offset) {
      int row1 = threadIdx.y;
      int row2 = threadIdx.y + offset;
      int idx1 = row1*row_stride + threadIdx.x;
      int idx2 = row2*row_stride + threadIdx.x;
      warp_buf1[idx1] += warp_buf1[idx2];
      warp_buf2[idx1] += warp_buf2[idx2];
    }
    __syncthreads();
  }
  int i2 = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadIdx.y == 0 && i2 < n2) {
    int row1 = threadIdx.y;
    int row2 = threadIdx.y + 1;
    int idx1 = row1*row_stride + threadIdx.x;
    int idx2 = row2*row_stride + threadIdx.x;
    part_grad_beta[blockIdx.y*n2+i2] = warp_buf1[idx1] + warp_buf1[idx2];
    part_grad_gamma[blockIdx.y*n2+i2] = warp_buf2[idx1] + warp_buf2[idx2];
  }
}

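// Stage two: reduce the part_size partial rows down to the final gradients,
//   grad_gamma[i2] = sum_i dout[i, i2] * (input[i, i2] - mean[i]) * invvar[i]
//   grad_beta[i2]  = sum_i dout[i, i2]
// Each warp first sums part_size / blockDim.y partial rows sequentially, then
// the warps are combined through shared memory.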
template<typename T, typename U> __global__
void cuComputeGradGammaBeta(
  const U* part_grad_gamma,
  const U* part_grad_beta,
  const int part_size,
  const int n1,
  const int n2,
  T* grad_gamma,
  T* grad_beta)
{
  // sum partial gradients for gamma and beta
  SharedMemory<U> shared;
  U* buf = shared.getPointer();
  int i2 = blockIdx.x * blockDim.x + threadIdx.x;
  if (i2 < n2) {
    // each warp does sequential reductions until reduced part_size is num_warps
    int num_warp_reductions = part_size / blockDim.y;
    U sum_gamma = U(0);
    U sum_beta = U(0);
    const U* part_grad_gamma_ptr = part_grad_gamma + threadIdx.y * num_warp_reductions * n2 + i2;
    const U* part_grad_beta_ptr = part_grad_beta + threadIdx.y * num_warp_reductions * n2 + i2;
    for (int warp_offset = 0; warp_offset < num_warp_reductions; ++warp_offset) {
      sum_gamma += part_grad_gamma_ptr[warp_offset*n2];
      sum_beta += part_grad_beta_ptr[warp_offset*n2];
    }
    // inter-warp reductions
    const int nbsize3 = blockDim.x * blockDim.y / 2;
    for (int offset = blockDim.y/2; offset >= 1; offset /= 2) {
      // top half write to shared memory
      if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
        const int write_idx = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
        buf[write_idx] = sum_gamma;
        buf[write_idx+nbsize3] = sum_beta;
      }
      __syncthreads();
      // bottom half sums
      if (threadIdx.y < offset) {
        const int read_idx = threadIdx.y * blockDim.x + threadIdx.x;
        sum_gamma += buf[read_idx];
        sum_beta += buf[read_idx+nbsize3];
      }
      __syncthreads();
    }
    // write out fully summed gradients
    if (threadIdx.y == 0) {
      grad_gamma[i2] = sum_gamma;
      grad_beta[i2] = sum_beta;
    }
  }
}

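// Input gradient of layer norm.  With xhat = (x - mu) * invvar and H = n2,
//   grad_input = (invvar / H) * (H * dy * gamma - sum(dy * gamma) - xhat * sum(dy * gamma * xhat))
// where both sums run over the normalized dimension (gamma is treated as 1
// when it is NULL).  The two sums are built per thread, reduced within each
// warp with WARP_SHFL_XOR, and combined across warps through shared memory.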
template<typename T, typename U> __global__
void cuComputeGradInput(
  const T* __restrict__ dout,
  const T* __restrict__ input,
  const int n1,
  const int n2,
  const U* __restrict__ mean,
  const U* __restrict__ invvar,
  U epsilon,
  const T* gamma,
  T* grad_input)
{
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    U sum_loss1 = U(0);
    U sum_loss2 = U(0);
    const U c_mean = mean[i1];
    const U c_invvar = invvar[i1];
    const T* k_input = input + i1*n2;
    const T* k_dout = dout + i1*n2;
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != NULL) {
      int l = 4*thrx;
      for (; l+3 < n2; l+=4*numx) {
        for (int k = 0; k < 4; ++k) {
          const U c_h = static_cast<U>(k_input[l+k]);
          const U c_loss = static_cast<U>(k_dout[l+k]);
          sum_loss1 += c_loss * gamma[l+k];
          sum_loss2 += c_loss * gamma[l+k] * (c_h - c_mean) * c_invvar;
        }
      }
      for (; l < n2; ++l) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        sum_loss1 += c_loss * gamma[l];
        sum_loss2 += c_loss * gamma[l] * (c_h - c_mean) * c_invvar;
      }
    } else {
      int l = 4*thrx;
      for (; l+3 < n2; l+=4*numx) {
        for (int k = 0; k < 4; ++k) {
          const U c_h = static_cast<U>(k_input[l+k]);
          const U c_loss = static_cast<U>(k_dout[l+k]);
          sum_loss1 += c_loss;
          sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
        }
      }
      for (; l < n2; ++l) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        sum_loss1 += c_loss;
        sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
      }
    }
    // intra-warp reductions
    for (int mask = blockDim.x/2; mask > 0; mask /= 2) {
      sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask);
      sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask);
    }
    // inter-warp reductions
    if (blockDim.y > 1) {
      SharedMemory<U> shared;
      U* buf = shared.getPointer();
      for (int offset = blockDim.y/2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
          const int wrt_i = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
          buf[2*wrt_i] = sum_loss1;
          buf[2*wrt_i+1] = sum_loss2;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.y < offset) {
          const int read_i = threadIdx.y * blockDim.x + threadIdx.x;
          sum_loss1 += buf[2*read_i];
          sum_loss2 += buf[2*read_i+1];
        }
        __syncthreads();
      }
      if (threadIdx.y == 0) {
        buf[2*threadIdx.x] = sum_loss1;
        buf[2*threadIdx.x+1] = sum_loss2;
      }
      __syncthreads();
      if (threadIdx.y != 0) {
        sum_loss1 = buf[2*threadIdx.x];
        sum_loss2 = buf[2*threadIdx.x+1];
      }
    }
    // all threads now have the two sums over l
    U fH = (U)n2;
    U term1 = (U(1) / fH) * c_invvar;
    T* k_grad_input = grad_input + i1*n2;
    if (gamma != NULL) {
      for (int l = thrx; l < n2; l+=numx) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        U f_grad_input = fH * c_loss * gamma[l];
        f_grad_input -= sum_loss1;
        f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
        f_grad_input *= term1;
        k_grad_input[l] = static_cast<T>(f_grad_input);
      }
    } else {
      for (int l = thrx; l < n2; l+=numx) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        U f_grad_input = fH * c_loss;
        f_grad_input -= sum_loss1;
        f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
        f_grad_input *= term1;
        k_grad_input[l] = static_cast<T>(f_grad_input);
      }
    }
  }
}

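// Host-side forward launcher: each block of 32x4 threads cooperates on one row
// at a time, with blockIdx.y striding over the n1 rows up to the device's
// maximum grid height.  The dynamic shared-memory size covers the per-warp
// reduction buffer used by cuWelfordMuSigma2 when threads.y > 1.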
template<typename T, typename U>
void HostApplyLayerNorm(
  T* output,
  U* mean,
  U* invvar,
  const T* input,
  int n1,
  int n2,
  double epsilon,
  const T* gamma,
  const T* beta
  )
{
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const dim3 threads(32,4,1);
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
  int nshared =
      threads.y > 1 ?
      threads.y*sizeof(U)+(threads.y/2)*sizeof(U) :
      0;
  cuApplyLayerNorm<<<blocks, threads, nshared, stream>>>(
    output,
    mean,
    invvar,
    input,
    n1,n2,
    U(epsilon),
    gamma,beta);
}

void cuda_layer_norm(
  at::Tensor* output,
  at::Tensor* mean,
  at::Tensor* invvar,
  at::Tensor* input,
  int n1,
  int n2,
#ifdef VERSION_GE_1_1
  at::IntArrayRef normalized_shape,
#else
  at::IntList normalized_shape,
#endif
  at::Tensor* gamma,
  at::Tensor* beta,
  double epsilon)
{
  using namespace at;
  DISPATCH_DOUBLE_FLOAT_AND_HALF(input->scalar_type(), 0, "layer_norm_cuda_kernel",
    using accscalar_t = at::acc_type<scalar_t_0, true>;
    HostApplyLayerNorm(
      output->DATA_PTR<scalar_t_0>(),
      mean->DATA_PTR<accscalar_t>(),
      invvar->DATA_PTR<accscalar_t>(),
      input->DATA_PTR<scalar_t_0>(),
      n1,n2,
      epsilon,
      gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
      beta != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL);
  )
}

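// Host-side backward launcher.  When gamma/beta are present, the weight
// gradients are computed in two stages: cuComputePartGradGammaBeta reduces the
// rows into part_size = 16 partial buffers (kept in float when the input is
// half precision), and cuComputeGradGammaBeta collapses them into grad_gamma
// and grad_beta.  grad_input is then computed by cuComputeGradInput with one
// 32x4 block per row slot, as in the forward launcher.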
template<typename T, typename U>
void HostLayerNormGradient(
  const T* dout,
  const U* mean,
  const U* invvar,
  at::Tensor* input,
  int n1,
  int n2,
  const T* gamma,
  const T* beta,
  double epsilon,
  T* grad_input,
  T* grad_gamma,
  T* grad_beta
  )
{
  auto stream = at::cuda::getCurrentCUDAStream().stream();

  if (gamma != NULL && beta != NULL) {
    // compute grad_gamma(j) and grad_beta(j)
    const int part_size = 16;
    const dim3 threads2(32,4,1);
    const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1);
    const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
    const int nshared2_b = threads2.x * threads2.y * sizeof(U);
    const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b;
    at::Tensor part_grad_gamma = at::empty({part_size,n2}, input->options().dtype(input->scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input->scalar_type()));
    at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
    cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
      dout,
      input->DATA_PTR<T>(),
      n1,n2,
      mean,
      invvar,
      U(epsilon),
      part_grad_gamma.DATA_PTR<U>(),
      part_grad_beta.DATA_PTR<U>());

    const dim3 threads3(32,8,1);
    const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1);
    const int nshared3 = threads3.x * threads3.y * sizeof(U);
    cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
      part_grad_gamma.DATA_PTR<U>(),
      part_grad_beta.DATA_PTR<U>(),
      part_size,
      n1,n2,
      grad_gamma,
      grad_beta);
  }

  // compute grad_input
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
  const dim3 threads1(32,4,1);
  int nshared =
      threads1.y > 1 ?
      threads1.y*threads1.x*sizeof(U) :
      0;
  cuComputeGradInput<<<blocks1, threads1, nshared, stream>>>(
    dout,
    input->DATA_PTR<T>(),
    n1,n2,
    mean,
    invvar,
    U(epsilon),
    gamma,
    grad_input);
}

void cuda_layer_norm_gradient(
  at::Tensor* dout,
  at::Tensor* mean,
  at::Tensor* invvar,
  at::Tensor* input,
  int n1,
  int n2,
#ifdef VERSION_GE_1_1
  at::IntArrayRef normalized_shape,
#else
  at::IntList normalized_shape,
#endif
  at::Tensor* gamma,
  at::Tensor* beta,
  double epsilon,
  at::Tensor* grad_input,
  at::Tensor* grad_gamma,
  at::Tensor* grad_beta)
{
  using namespace at;
  DISPATCH_FLOAT_AND_HALF(input->scalar_type(), 0, "cuComputeGradInput",
    using accscalar_t = at::acc_type<scalar_t_0, true>;
    HostLayerNormGradient(
      dout->DATA_PTR<scalar_t_0>(),
      mean->DATA_PTR<accscalar_t>(),
      invvar->DATA_PTR<accscalar_t>(),
      input,
      n1,n2,
      // TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta
      // if gamma Tensor is NULL on input.
      gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
      gamma != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL,
      epsilon,
      grad_input->DATA_PTR<scalar_t_0>(),
      gamma != NULL ? grad_gamma->DATA_PTR<scalar_t_0>() : NULL,
      gamma != NULL ? grad_beta->DATA_PTR<scalar_t_0>() : NULL);
  )
}