GitHub Repository: ai-forever/sber-swap
Path: blob/main/apex/csrc/layer_norm_cuda_kernel.cu
#include "ATen/ATen.h"
#include "ATen/AccumulateType.h"
#include "ATen/cuda/CUDAContext.h"
#include <THC/THCDeviceUtils.cuh>

#include <cuda.h>
#include <cuda_runtime.h>

#include "type_shim.h"

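// Single-pass (Welford) update of running statistics: given a new sample `curr`,
// the running mean `mu`, the running sum of squared deviations `sigma2` (not yet
// divided by the count), and the sample count, it applies
//   count  <- count + 1
//   mu     <- mu + (curr - mu) / count
//   sigma2 <- sigma2 + (curr - mu_old) * (curr - mu_new)
// which is numerically more stable than accumulating sum and sum-of-squares.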
template<typename U> __device__
void cuWelfordOnlineSum(
  const U curr,
  U& mu,
  U& sigma2,
  U& count)
{
  count = count + U(1);
  U delta = curr - mu;
  U lmean = mu + delta / count;
  mu = lmean;
  U delta2 = curr - lmean;
  sigma2 = sigma2 + delta * delta2;
}

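// Chan et al. parallel merge of two Welford partial results (A = this thread's
// running statistics, B = another partial result): with nA, nB the two counts
// and delta = muB - muA,
//   mu     <- (nA*muA + nB*muB) / (nA + nB)
//   sigma2 <- sigma2A + sigma2B + delta^2 * nA * nB / (nA + nB)
// The nX > 0 guard avoids a division by zero when both partials are empty.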
template<typename U> __device__
void cuChanOnlineSum(
  const U muB,
  const U sigma2B,
  const U countB,
  U& mu,
  U& sigma2,
  U& count)
{
  U delta = muB - mu;
  U nA = count;
  U nB = countB;
  count = count + countB;
  U nX = count;
  if (nX > U(0)) {
    nA = nA / nX;
    nB = nB / nX;
    mu = nA*mu + nB*muB;
    sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX;
  } else {
    mu = U(0);
    sigma2 = U(0);
  }
}

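// Computes the mean and (biased) variance of row i1 of a contiguous [n1, n2]
// tensor.  Each thread walks the row with a stride of 4*blockDim.x*blockDim.y
// elements, accumulating private Welford statistics; the partials are then
// merged lane-to-lane with warp shuffles and, when blockDim.y > 1, across
// warps through the shared-memory buffer `buf`.  On exit `sigma2` holds the
// sum of squared deviations divided by n2.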
template<typename T, typename U> __device__
void cuWelfordMuSigma2(
  const T* __restrict__ vals,
  const int n1,
  const int n2,
  const int i1,
  U& mu,
  U& sigma2,
  U* buf)
{
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  U count = U(0);
  mu = U(0);
  sigma2 = U(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const T* lvals = vals + i1*n2;
    int l = 4*thrx;
    for (; l+3 < n2; l+=4*numx) {
      for (int k = 0; k < 4; ++k) {
        U curr = static_cast<U>(lvals[l+k]);
        cuWelfordOnlineSum<U>(curr,mu,sigma2,count);
      }
    }
    for (; l < n2; ++l) {
      U curr = static_cast<U>(lvals[l]);
      cuWelfordOnlineSum<U>(curr,mu,sigma2,count);
    }
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x+(1<<l))&31;
      U muB = WARP_SHFL(mu, srcLaneB);
      U countB = WARP_SHFL(count, srcLaneB);
      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
      cuChanOnlineSum<U>(muB,sigma2B,countB,mu,sigma2,count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      U* ubuf = (U*)buf;
      U* ibuf = (U*)(ubuf + blockDim.y);
      for (int offset = blockDim.y/2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2*wrt_y] = mu;
          ubuf[2*wrt_y+1] = sigma2;
          ibuf[wrt_y] = count;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          U muB = ubuf[2*threadIdx.y];
          U sigma2B = ubuf[2*threadIdx.y+1];
          U countB = ibuf[threadIdx.y];
          cuChanOnlineSum<U>(muB,sigma2B,countB,mu,sigma2,count);
        }
        __syncthreads();
      }
      // threadIdx.x == 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        ubuf[0] = mu;
        ubuf[1] = sigma2;
      }
      __syncthreads();
      mu = ubuf[0];
      sigma2 = ubuf[1]/U(n2);
      // don't care about final value of count, we know count == n2
    } else {
      mu = WARP_SHFL(mu, 0);
      sigma2 = WARP_SHFL(sigma2/U(n2), 0);
    }
  }
}

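// Specialization for at::Half inputs: statistics are accumulated in float.
// The row pointer is first brought to 32-bit alignment (thread 0 consumes the
// leading element if needed), after which pairs of halves are loaded as
// __half2 and converted with __half22float2, so each thread processes 8
// elements per iteration using vectorized loads.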
template<> __device__
void cuWelfordMuSigma2(
  const at::Half* __restrict__ vals,
  const int n1,
  const int n2,
  const int i1,
  float& mu,
  float& sigma2,
  float* buf)
{
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
  // compute variance and mean over n2
  float count = 0.0f;
  mu = float(0);
  sigma2 = float(0);
  if (i1 < n1) {
    // one warp normalizes one n1 index,
    // synchronization is implicit
    // initialize with standard Welford algorithm
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    const at::Half* lvals = vals + i1*n2;
    int l = 8*thrx;
    if ((((size_t)lvals)&3) != 0) {
      // 16 bit alignment
      // first thread consumes first point
      if (thrx == 0) {
        float curr = static_cast<float>(lvals[0]);
        cuWelfordOnlineSum(curr,mu,sigma2,count);
      }
      ++l;
    }
    // at this point, lvals[l] are 32 bit aligned for all threads.
    for (; l+7 < n2; l+=8*numx) {
      for (int k = 0; k < 8; k+=2) {
        float2 curr = __half22float2(*((__half2*)(lvals+l+k)));
        cuWelfordOnlineSum(curr.x,mu,sigma2,count);
        cuWelfordOnlineSum(curr.y,mu,sigma2,count);
      }
    }
    for (; l < n2; ++l) {
      float curr = static_cast<float>(lvals[l]);
      cuWelfordOnlineSum(curr,mu,sigma2,count);
    }
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x+(1<<l))&31;
      float muB = WARP_SHFL(mu, srcLaneB);
      float countB = WARP_SHFL(count, srcLaneB);
      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
      cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
    if (blockDim.y > 1) {
      float* ubuf = (float*)buf;
      float* ibuf = (float*)(ubuf + blockDim.y);
      for (int offset = blockDim.y/2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.x == 0 && threadIdx.y >= offset && threadIdx.y < 2*offset) {
          const int wrt_y = threadIdx.y - offset;
          ubuf[2*wrt_y] = mu;
          ubuf[2*wrt_y+1] = sigma2;
          ibuf[wrt_y] = count;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.x == 0 && threadIdx.y < offset) {
          float muB = ubuf[2*threadIdx.y];
          float sigma2B = ubuf[2*threadIdx.y+1];
          float countB = ibuf[threadIdx.y];
          cuChanOnlineSum(muB,sigma2B,countB,mu,sigma2,count);
        }
        __syncthreads();
      }
      // threadIdx.x == 0 && threadIdx.y == 0 only thread that has correct values
      if (threadIdx.x == 0 && threadIdx.y == 0) {
        ubuf[0] = mu;
        ubuf[1] = sigma2;
      }
      __syncthreads();
      mu = ubuf[0];
      sigma2 = ubuf[1]/float(n2);
      // don't care about final value of count, we know count == n2
    } else {
      mu = WARP_SHFL(mu, 0);
      sigma2 = WARP_SHFL(sigma2/float(n2), 0);
    }
  }
}

template<typename U> U rsqrt(U v) {
  return U(1) / sqrt(v);
}
template<> float rsqrt(float v) {
  return rsqrtf(v);
}
template<> double rsqrt(double v) {
  return rsqrt(v);
}

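// Dynamic shared memory is accessed through a type-specialized SharedMemory
// helper because a templated kernel cannot simply declare
// `extern __shared__ T buf[]` for every element type; the per-type
// specializations below give float and double each their own named extern
// array (half statistics are accumulated in float, so no half specialization
// is needed).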
namespace {
// This is the un-specialized struct.  Note that we prevent instantiation of this
// struct by putting an undefined symbol in the function body so it won't compile.
//  template <typename T>
//  struct SharedMemory
//  {
//      // Ensure that we won't compile any un-specialized types
//      __device__ T *getPointer()
//      {
//          extern __device__ void error(void);
//          error();
//          return NULL;
//      }
//  };
// https://github.com/NVIDIA/apex/issues/246
template <typename T>
struct SharedMemory;

template <>
struct SharedMemory <float>
{
  __device__ float *getPointer()
  {
    extern __shared__ float s_float[];
    return s_float;
  }
};

template <>
struct SharedMemory <double>
{
  __device__ double *getPointer()
  {
    extern __shared__ double s_double[];
    return s_double;
  }
};
}

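// Forward kernel: for each row i1 (blockIdx.y strides over the n1 rows), the
// block computes mu and sigma2 with cuWelfordMuSigma2 and writes
//   output[i1, i2] = gamma[i2] * (vals[i1, i2] - mu) * rsqrt(sigma2 + epsilon) + beta[i2]
// (or just the normalized value when gamma/beta are NULL).  The per-row mean
// and inverse standard deviation are saved to `mean` and `invvar` for the
// backward pass.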
template<typename T, typename U> __global__
void cuApplyLayerNorm(
  T* __restrict__ output_vals,
  U* __restrict__ mean,
  U* __restrict__ invvar,
  const T* __restrict__ vals,
  const int n1,
  const int n2,
  const U epsilon,
  const T* __restrict__ gamma,
  const T* __restrict__ beta
  )
{
  // Assumptions:
  // 1) blockDim.x == warpSize
  // 2) Tensors are contiguous
  //
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu,sigma2;
    cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf);
    const T* lvals = vals + i1*n2;
    T* ovals = output_vals + i1*n2;
    U c_invvar = rsqrt(sigma2 + epsilon);
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != NULL && beta != NULL) {
      for (int i = thrx; i < n2; i+=numx) {
        U curr = static_cast<U>(lvals[i]);
        ovals[i] = gamma[i] * static_cast<T>(c_invvar * (curr - mu)) + beta[i];
      }
    } else {
      for (int i = thrx; i < n2; i+=numx) {
        U curr = static_cast<U>(lvals[i]);
        ovals[i] = static_cast<T>(c_invvar * (curr - mu));
      }
    }
    if (threadIdx.x == 0 && threadIdx.y == 0) {
      mean[i1] = mu;
      invvar[i1] = c_invvar;
    }
  }
}

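// Helpers for the gamma/beta backward pass: for its assigned row i1, each
// thread stages blockDim.y consecutive elements into the shared-memory warp
// buffers,
//   warp_buf1 <- dout[i1, i2]                                              (for grad_beta)
//   warp_buf2 <- dout[i1, i2] * (input[i1, i2] - mean[i1]) * invvar[i1]    (for grad_gamma)
// cuLoadWriteStridedInputs initializes the buffers (zero-padding out-of-range
// elements); cuLoadAddStridedInputs accumulates subsequent tiles of rows.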
template<typename T, typename U> __device__
void cuLoadWriteStridedInputs(
  const int i1_block,
  const int thr_load_row_off,
  const int thr_load_col_off,
  const int i2_off,
  const int row_stride,
  U* warp_buf1,
  U* warp_buf2,
  const T* input,
  const T* dout,
  const int i1_end,
  const int n2,
  const U* __restrict__ mean,
  const U* __restrict__ invvar
  )
{
  int i1 = i1_block+thr_load_row_off;
  if (i1 < i1_end) {
    U curr_mean = mean[i1];
    U curr_invvar = invvar[i1];
    for (int k = 0; k < blockDim.y; ++k) {
      int i2 = i2_off + k;
      int load_idx = i1*n2+i2;
      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
      if (i2<n2) {
        U curr_input = static_cast<U>(input[load_idx]);
        U curr_dout = static_cast<U>(dout[load_idx]);
        warp_buf1[write_idx] = curr_dout;
        warp_buf2[write_idx] = curr_dout * (curr_input - curr_mean) * curr_invvar;
      } else {
        warp_buf1[write_idx] = U(0);
        warp_buf2[write_idx] = U(0);
      }
    }
  } else {
    for (int k = 0; k < blockDim.y; ++k) {
      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
      warp_buf1[write_idx] = U(0);
      warp_buf2[write_idx] = U(0);
    }
  }
}

template<typename T, typename U> __device__
void cuLoadAddStridedInputs(
  const int i1_block,
  const int thr_load_row_off,
  const int thr_load_col_off,
  const int i2_off,
  const int row_stride,
  U* warp_buf1,
  U* warp_buf2,
  const T* input,
  const T* dout,
  const int i1_end,
  const int n2,
  const U* __restrict__ mean,
  const U* __restrict__ invvar
  )
{
  int i1 = i1_block+thr_load_row_off;
  if (i1 < i1_end) {
    U curr_mean = mean[i1];
    U curr_invvar = invvar[i1];
    for (int k = 0; k < blockDim.y; ++k) {
      int i2 = i2_off + k;
      int load_idx = i1*n2+i2;
      int write_idx = thr_load_row_off*row_stride+thr_load_col_off+k;
      if (i2<n2) {
        U curr_input = static_cast<U>(input[load_idx]);
        U curr_dout = static_cast<U>(dout[load_idx]);
        warp_buf1[write_idx] += curr_dout;
        warp_buf2[write_idx] += curr_dout * (curr_input - curr_mean) * curr_invvar;
      }
    }
  }
}

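// Stage one of the weight gradients: each block reduces its assigned slice of
// rows and writes one partial sum per column i2,
//   part_grad_beta[blockIdx.y, i2]  = sum over assigned i1 of dout[i1, i2]
//   part_grad_gamma[blockIdx.y, i2] = sum over assigned i1 of dout[i1, i2] * (input[i1, i2] - mean[i1]) * invvar[i1]
// row_stride = blockDim.x + 1 pads the shared-memory rows to reduce bank conflicts.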
template<typename T, typename U> __global__
void cuComputePartGradGammaBeta(
  const T* __restrict__ dout,
  const T* __restrict__ input,
  const int n1,
  const int n2,
  const U* __restrict__ mean,
  const U* __restrict__ invvar,
  U epsilon,
  U* part_grad_gamma,
  U* part_grad_beta)
{
  const int numsegs_n1 = (n1+blockDim.y*blockDim.y-1) / (blockDim.y*blockDim.y);
  const int segs_per_block = (numsegs_n1 + gridDim.y - 1) / gridDim.y;
  const int i1_beg = blockIdx.y * segs_per_block * blockDim.y*blockDim.y;
  const int i1_beg_plus_one = (blockIdx.y+1) * segs_per_block * blockDim.y*blockDim.y;
  const int i1_end = i1_beg_plus_one < n1 ? i1_beg_plus_one : n1;
  const int row_stride = blockDim.x+1;
  const int thr_load_col_off = (threadIdx.x*blockDim.y)&(blockDim.x-1);
  const int thr_load_row_off = (threadIdx.x*blockDim.y)/blockDim.x + threadIdx.y*blockDim.y;
  const int i2_off = blockIdx.x * blockDim.x + thr_load_col_off;
  SharedMemory<U> shared;
  U* buf = shared.getPointer(); // buf has at least blockDim.x * blockDim.y * blockDim.y + (blockDim.y - 1)*(blockDim.x/blockDim.y) elements
  U* warp_buf1 = (U*)buf;
  U* warp_buf2 = warp_buf1 + blockDim.y * blockDim.y * row_stride;
  // compute partial sums from strided inputs
  // do this to increase number of loads in flight
  cuLoadWriteStridedInputs(i1_beg,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar);
  for (int i1_block = i1_beg+blockDim.y*blockDim.y; i1_block < i1_end; i1_block+=blockDim.y*blockDim.y) {
    cuLoadAddStridedInputs(i1_block,thr_load_row_off,thr_load_col_off,i2_off,row_stride,warp_buf1,warp_buf2,input,dout,i1_end,n2,mean,invvar);
  }
  __syncthreads();
  // inter-warp reductions
  // sum within each warp
  U acc1 = U(0);
  U acc2 = U(0);
  for (int k = 0; k < blockDim.y; ++k) {
    int row1 = threadIdx.y + k*blockDim.y;
    int idx1 = row1*row_stride + threadIdx.x;
    acc1 += warp_buf1[idx1];
    acc2 += warp_buf2[idx1];
  }
  warp_buf1[threadIdx.y*row_stride+threadIdx.x] = acc1;
  warp_buf2[threadIdx.y*row_stride+threadIdx.x] = acc2;
  __syncthreads();
  // sum all warps
  for (int offset = blockDim.y/2; offset > 1; offset /= 2) {
    if (threadIdx.y < offset) {
      int row1 = threadIdx.y;
      int row2 = threadIdx.y + offset;
      int idx1 = row1*row_stride + threadIdx.x;
      int idx2 = row2*row_stride + threadIdx.x;
      warp_buf1[idx1] += warp_buf1[idx2];
      warp_buf2[idx1] += warp_buf2[idx2];
    }
    __syncthreads();
  }
  int i2 = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadIdx.y == 0 && i2 < n2) {
    int row1 = threadIdx.y;
    int row2 = threadIdx.y + 1;
    int idx1 = row1*row_stride + threadIdx.x;
    int idx2 = row2*row_stride + threadIdx.x;
    part_grad_beta[blockIdx.y*n2+i2] = warp_buf1[idx1] + warp_buf1[idx2];
    part_grad_gamma[blockIdx.y*n2+i2] = warp_buf2[idx1] + warp_buf2[idx2];
  }
}

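// Stage two: reduce the part_size partial rows down to the final gradients,
//   grad_gamma[i2] = sum_i dout[i, i2] * (input[i, i2] - mean[i]) * invvar[i]
//   grad_beta[i2]  = sum_i dout[i, i2]
// Each warp first sums part_size / blockDim.y partial rows sequentially, then
// the warps are combined through shared memory.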
template<typename T, typename U> __global__
void cuComputeGradGammaBeta(
  const U* part_grad_gamma,
  const U* part_grad_beta,
  const int part_size,
  const int n1,
  const int n2,
  T* grad_gamma,
  T* grad_beta)
{
  // sum partial gradients for gamma and beta
  SharedMemory<U> shared;
  U* buf = shared.getPointer();
  int i2 = blockIdx.x * blockDim.x + threadIdx.x;
  if (i2 < n2) {
    // each warp does sequential reductions until reduced part_size is num_warps
    int num_warp_reductions = part_size / blockDim.y;
    U sum_gamma = U(0);
    U sum_beta = U(0);
    const U* part_grad_gamma_ptr = part_grad_gamma + threadIdx.y * num_warp_reductions * n2 + i2;
    const U* part_grad_beta_ptr = part_grad_beta + threadIdx.y * num_warp_reductions * n2 + i2;
    for (int warp_offset = 0; warp_offset < num_warp_reductions; ++warp_offset) {
      sum_gamma += part_grad_gamma_ptr[warp_offset*n2];
      sum_beta += part_grad_beta_ptr[warp_offset*n2];
    }
    // inter-warp reductions
    const int nbsize3 = blockDim.x * blockDim.y / 2;
    for (int offset = blockDim.y/2; offset >= 1; offset /= 2) {
      // top half write to shared memory
      if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
        const int write_idx = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
        buf[write_idx] = sum_gamma;
        buf[write_idx+nbsize3] = sum_beta;
      }
      __syncthreads();
      // bottom half sums
      if (threadIdx.y < offset) {
        const int read_idx = threadIdx.y * blockDim.x + threadIdx.x;
        sum_gamma += buf[read_idx];
        sum_beta += buf[read_idx+nbsize3];
      }
      __syncthreads();
    }
    // write out fully summed gradients
    if (threadIdx.y == 0) {
      grad_gamma[i2] = sum_gamma;
      grad_beta[i2] = sum_beta;
    }
  }
}

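// Input gradient of layer norm.  With xhat = (x - mu) * invvar and H = n2,
//   grad_input = (invvar / H) * (H * dy * gamma - sum(dy * gamma) - xhat * sum(dy * gamma * xhat))
// where both sums run over the normalized dimension (gamma is treated as 1
// when it is NULL).  The two sums are built per thread, reduced within each
// warp with WARP_SHFL_XOR, and combined across warps through shared memory.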
template<typename T, typename U> __global__
void cuComputeGradInput(
  const T* __restrict__ dout,
  const T* __restrict__ input,
  const int n1,
  const int n2,
  const U* __restrict__ mean,
  const U* __restrict__ invvar,
  U epsilon,
  const T* gamma,
  T* grad_input)
{
  for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) {
    U sum_loss1 = U(0);
    U sum_loss2 = U(0);
    const U c_mean = mean[i1];
    const U c_invvar = invvar[i1];
    const T* k_input = input + i1*n2;
    const T* k_dout = dout + i1*n2;
    const int numx = blockDim.x * blockDim.y;
    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
    if (gamma != NULL) {
      int l = 4*thrx;
      for (; l+3 < n2; l+=4*numx) {
        for (int k = 0; k < 4; ++k) {
          const U c_h = static_cast<U>(k_input[l+k]);
          const U c_loss = static_cast<U>(k_dout[l+k]);
          sum_loss1 += c_loss * gamma[l+k];
          sum_loss2 += c_loss * gamma[l+k] * (c_h - c_mean) * c_invvar;
        }
      }
      for (; l < n2; ++l) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        sum_loss1 += c_loss * gamma[l];
        sum_loss2 += c_loss * gamma[l] * (c_h - c_mean) * c_invvar;
      }
    } else {
      int l = 4*thrx;
      for (; l+3 < n2; l+=4*numx) {
        for (int k = 0; k < 4; ++k) {
          const U c_h = static_cast<U>(k_input[l+k]);
          const U c_loss = static_cast<U>(k_dout[l+k]);
          sum_loss1 += c_loss;
          sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
        }
      }
      for (; l < n2; ++l) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        sum_loss1 += c_loss;
        sum_loss2 += c_loss * (c_h - c_mean) * c_invvar;
      }
    }
    // intra-warp reductions
    for (int mask = blockDim.x/2; mask > 0; mask /= 2) {
      sum_loss1 += WARP_SHFL_XOR(sum_loss1, mask);
      sum_loss2 += WARP_SHFL_XOR(sum_loss2, mask);
    }
    // inter-warp reductions
    if (blockDim.y > 1) {
      SharedMemory<U> shared;
      U* buf = shared.getPointer();
      for (int offset = blockDim.y/2; offset > 0; offset /= 2) {
        // upper half of warps write to shared
        if (threadIdx.y >= offset && threadIdx.y < 2*offset) {
          const int wrt_i = (threadIdx.y - offset) * blockDim.x + threadIdx.x;
          buf[2*wrt_i] = sum_loss1;
          buf[2*wrt_i+1] = sum_loss2;
        }
        __syncthreads();
        // lower half merges
        if (threadIdx.y < offset) {
          const int read_i = threadIdx.y * blockDim.x + threadIdx.x;
          sum_loss1 += buf[2*read_i];
          sum_loss2 += buf[2*read_i+1];
        }
        __syncthreads();
      }
      if (threadIdx.y == 0) {
        buf[2*threadIdx.x] = sum_loss1;
        buf[2*threadIdx.x+1] = sum_loss2;
      }
      __syncthreads();
      if (threadIdx.y != 0) {
        sum_loss1 = buf[2*threadIdx.x];
        sum_loss2 = buf[2*threadIdx.x+1];
      }
    }
    // all threads now have the two sums over l
    U fH = (U)n2;
    U term1 = (U(1) / fH) * c_invvar;
    T* k_grad_input = grad_input + i1*n2;
    if (gamma != NULL) {
      for (int l = thrx; l < n2; l+=numx) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        U f_grad_input = fH * c_loss * gamma[l];
        f_grad_input -= sum_loss1;
        f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
        f_grad_input *= term1;
        k_grad_input[l] = static_cast<T>(f_grad_input);
      }
    } else {
      for (int l = thrx; l < n2; l+=numx) {
        const U c_h = static_cast<U>(k_input[l]);
        const U c_loss = static_cast<U>(k_dout[l]);
        U f_grad_input = fH * c_loss;
        f_grad_input -= sum_loss1;
        f_grad_input -= (c_h - c_mean) * c_invvar * sum_loss2;
        f_grad_input *= term1;
        k_grad_input[l] = static_cast<T>(f_grad_input);
      }
    }
  }
}

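// Host-side forward launcher: each block of 32x4 threads cooperates on one row
// at a time, with blockIdx.y striding over the n1 rows up to the device's
// maximum grid height.  The dynamic shared-memory size covers the per-warp
// reduction buffer used by cuWelfordMuSigma2 when threads.y > 1.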
template<typename T, typename U>
void HostApplyLayerNorm(
  T* output,
  U* mean,
  U* invvar,
  const T* input,
  int n1,
  int n2,
  double epsilon,
  const T* gamma,
  const T* beta
  )
{
  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const dim3 threads(32,4,1);
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
  int nshared =
      threads.y > 1 ?
      threads.y*sizeof(U)+(threads.y/2)*sizeof(U) :
      0;
  cuApplyLayerNorm<<<blocks, threads, nshared, stream>>>(
    output,
    mean,
    invvar,
    input,
    n1,n2,
    U(epsilon),
    gamma,beta);
}

void cuda_layer_norm(
  at::Tensor* output,
  at::Tensor* mean,
  at::Tensor* invvar,
  at::Tensor* input,
  int n1,
  int n2,
#ifdef VERSION_GE_1_1
  at::IntArrayRef normalized_shape,
#else
  at::IntList normalized_shape,
#endif
  at::Tensor* gamma,
  at::Tensor* beta,
  double epsilon)
{
  using namespace at;
  DISPATCH_DOUBLE_FLOAT_AND_HALF(input->scalar_type(), 0, "layer_norm_cuda_kernel",
    using accscalar_t = at::acc_type<scalar_t_0, true>;
    HostApplyLayerNorm(
      output->DATA_PTR<scalar_t_0>(),
      mean->DATA_PTR<accscalar_t>(),
      invvar->DATA_PTR<accscalar_t>(),
      input->DATA_PTR<scalar_t_0>(),
      n1,n2,
      epsilon,
      gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
      beta != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL);
  )
}

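// Host-side backward launcher.  When gamma/beta are present, the weight
// gradients are computed in two stages: cuComputePartGradGammaBeta reduces the
// rows into part_size = 16 partial buffers (kept in float when the input is
// half precision), and cuComputeGradGammaBeta collapses them into grad_gamma
// and grad_beta.  grad_input is then computed by cuComputeGradInput with one
// 32x4 block per row slot, as in the forward launcher.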
template<typename T, typename U>
void HostLayerNormGradient(
  const T* dout,
  const U* mean,
  const U* invvar,
  at::Tensor* input,
  int n1,
  int n2,
  const T* gamma,
  const T* beta,
  double epsilon,
  T* grad_input,
  T* grad_gamma,
  T* grad_beta
  )
{
  auto stream = at::cuda::getCurrentCUDAStream().stream();

  if (gamma != NULL && beta != NULL) {
    // compute grad_gamma(j) and grad_beta(j)
    const int part_size = 16;
    const dim3 threads2(32,4,1);
    const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1);
    const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
    const int nshared2_b = threads2.x * threads2.y * sizeof(U);
    const int nshared2 = nshared2_a > nshared2_b ? nshared2_a : nshared2_b;
    at::Tensor part_grad_gamma = at::empty({part_size,n2}, input->options().dtype(input->scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input->scalar_type()));
    at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
    cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
      dout,
      input->DATA_PTR<T>(),
      n1,n2,
      mean,
      invvar,
      U(epsilon),
      part_grad_gamma.DATA_PTR<U>(),
      part_grad_beta.DATA_PTR<U>());

    const dim3 threads3(32,8,1);
    const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1);
    const int nshared3 = threads3.x * threads3.y * sizeof(U);
    cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
      part_grad_gamma.DATA_PTR<U>(),
      part_grad_beta.DATA_PTR<U>(),
      part_size,
      n1,n2,
      grad_gamma,
      grad_beta);
  }

  // compute grad_input
  const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
  const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
  const dim3 threads1(32,4,1);
  int nshared =
      threads1.y > 1 ?
      threads1.y*threads1.x*sizeof(U) :
      0;
  cuComputeGradInput<<<blocks1, threads1, nshared, stream>>>(
    dout,
    input->DATA_PTR<T>(),
    n1,n2,
    mean,
    invvar,
    U(epsilon),
    gamma,
    grad_input);
}

void cuda_layer_norm_gradient(
  at::Tensor* dout,
  at::Tensor* mean,
  at::Tensor* invvar,
  at::Tensor* input,
  int n1,
  int n2,
#ifdef VERSION_GE_1_1
  at::IntArrayRef normalized_shape,
#else
  at::IntList normalized_shape,
#endif
  at::Tensor* gamma,
  at::Tensor* beta,
  double epsilon,
  at::Tensor* grad_input,
  at::Tensor* grad_gamma,
  at::Tensor* grad_beta)
{
  using namespace at;
  DISPATCH_FLOAT_AND_HALF(input->scalar_type(), 0, "cuComputeGradInput",
    using accscalar_t = at::acc_type<scalar_t_0, true>;
    HostLayerNormGradient(
      dout->DATA_PTR<scalar_t_0>(),
      mean->DATA_PTR<accscalar_t>(),
      invvar->DATA_PTR<accscalar_t>(),
      input,
      n1,n2,
      // TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta
      // if gamma Tensor is NULL on input.
      gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
      gamma != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL,
      epsilon,
      grad_input->DATA_PTR<scalar_t_0>(),
      gamma != NULL ? grad_gamma->DATA_PTR<scalar_t_0>() : NULL,
      gamma != NULL ? grad_beta->DATA_PTR<scalar_t_0>() : NULL);
  )
}