__global__ void gemm(float *A, float *B, float *C, unsigned int N) {
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (unsigned int i = 0; i < N; i++) {
        sum += A[row * N + i] * B[col + N * i]; // multiply and accumulate
    }
    C[row * N + col] = sum;
}
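A minimal host-side launch sketch (not from the original code): the 16×16 block size is an arbitrary choice, d_A, d_B, d_C are assumed device pointers, and N is assumed to be a multiple of the block dimensions, since the kernel has no boundary checks.

// Illustrative launch: one thread per element of C.
dim3 block(16, 16);
dim3 grid(N / block.x, N / block.y);  // assumes N is a multiple of 16
gemm<<<grid, block>>>(d_A, d_B, d_C, N);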
Example:
C = A · B
C[0][0] = A[0][0] · B[0][0] + A[0][1] · B[1][0]
The operand pairs (A[0][0], B[0][0]) and (A[0][1], B[1][0]) are shared within the same thread block to decrease global memory reads.

__global__ void tiled_gemm(float *A, float *B, float *C, unsigned int N) {
    __shared__ float A_s[TILE_DIM][TILE_DIM];
    __shared__ float B_s[TILE_DIM][TILE_DIM];
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (unsigned int t = 0; t < N / TILE_DIM; ++t) {
        // Load tile to shared memory
        A_s[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_DIM + threadIdx.x];
        B_s[threadIdx.y][threadIdx.x] = B[col + (t * TILE_DIM + threadIdx.y) * N];
        __syncthreads(); // wait for others to finish before computing
        // Compute with tile
        for (unsigned int i = 0; i < TILE_DIM; ++i) {
            sum += A_s[threadIdx.y][i] * B_s[i][threadIdx.x];
        }
        __syncthreads(); // wait for others to finish computing before loading
    }
    C[row * N + col] = sum;
}
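The tiled kernel only produces correct results when the thread block shape matches the tile. A hedged launch sketch, assuming TILE_DIM is defined as 16, N is a multiple of TILE_DIM, and d_A, d_B, d_C are device pointers:

#define TILE_DIM 16                     // must equal the block dimensions below
dim3 block(TILE_DIM, TILE_DIM);         // one thread per tile element
dim3 grid(N / TILE_DIM, N / TILE_DIM);  // assumes N is a multiple of TILE_DIM
tiled_gemm<<<grid, block>>>(d_A, d_B, d_C, N);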
__shared__ float partialSums[]; // assumes blockDim.x elements, one per thread
unsigned int t = threadIdx.x;
for (int stride = 1; stride <= blockDim.x / 2; stride *= 2) {
    __syncthreads(); // make the previous step's partial sums visible
    if (t % (2 * stride) == 0) {
        partialSums[t] += partialSums[t + stride];
    }
}
__shared__ float partialSums[]; // assumes 2 * blockDim.x elements
unsigned int t = threadIdx.x;
for (int stride = blockDim.x; stride > 0; stride >>= 1) {
    __syncthreads(); // wait for the previous step to complete
    if (t < stride) {
        partialSums[t] += partialSums[t + stride];
    }
}
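Both fragments assume partialSums has already been filled from global memory. A sketch of that load phase for the second (convergent) version, which as written reduces 2 * blockDim.x elements per block; BLOCK_DIM, input, and segment are illustrative names that are not part of the original snippet:

// Illustrative load phase: each block covers 2 * BLOCK_DIM consecutive elements.
__shared__ float partialSums[2 * BLOCK_DIM];
unsigned int segment = 2 * blockDim.x * blockIdx.x;
unsigned int t = threadIdx.x;
partialSums[t] = input[segment + t];
partialSums[t + blockDim.x] = input[segment + t + blockDim.x];

No extra __syncthreads() is needed after the load because the reduction loop synchronizes at the top of each iteration.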
__global__ void reduce_kernel(int *input, int *partialSums, unsigned int N) {
    // Load data into shared memory
    ...
    // Reduction tree in shared memory
    for (unsigned int stride = BLOCK_DIM/2; stride > WARP_SIZE; stride /= 2) {
        if (threadIdx.x < stride) {
            input_s[threadIdx.x] += input_s[threadIdx.x + stride];
        }
        __syncthreads();
    }
    // Reduction tree with shuffle instructions
    float sum;
    if (threadIdx.x < WARP_SIZE) {
        sum = input_s[threadIdx.x] + input_s[threadIdx.x + WARP_SIZE];
        for (unsigned int stride = WARP_SIZE/2; stride > 0; stride /= 2) {
            sum += __shfl_down_sync(0xffffffff, sum, stride);
        }
    }
    // Store partial sum
    if (threadIdx.x == 0) {
        partialSums[blockIdx.x] = sum;
    }
}
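The kernel leaves one partial sum per block in partialSums. A hedged sketch of one way the host can finish the reduction (numBlocks and d_partialSums are assumed to come from the launch code; launching a second reduction kernel over partialSums is the usual alternative):

// Illustrative host-side epilogue: copy per-block sums back and add them up.
int *h_partialSums = (int *)malloc(numBlocks * sizeof(int));
cudaMemcpy(h_partialSums, d_partialSums, numBlocks * sizeof(int),
           cudaMemcpyDeviceToHost);
int total = 0;
for (unsigned int b = 0; b < numBlocks; ++b) {
    total += h_partialSums[b];
}
free(h_partialSums);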
__global__ void reduce_kernel(int *input, int *partialSums, unsigned int N) {
    __shared__ int input_s[BLOCK_SIZE];

    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; // Global thread index
    const unsigned int gridSize = gridDim.x * blockDim.x;   // Total number of threads
    int mySum = 0;                                          // Local (per-thread) sum
    while (i < N) {
        mySum += input[i];
        i += gridSize;
    }
    input_s[threadIdx.x] = mySum;

    ...
}
void histogram_calc(unsigned int *histo,
                    unsigned int *input,
                    unsigned int input_size) {
    int i = 0;
    while (i < input_size) {
        unsigned int val = input[i];
        histo[val] += 1;
        i++;
    }
}
__global__ void histogram_calc(unsigned int *histo,
                               unsigned int *input,
                               unsigned int input_size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    while (i < input_size) {
        unsigned int val = input[i];
        atomicAdd(&histo[val], 1);
        i += stride;
    }
}
Privatization is an optimization technique where multiple private copies of an output are maintained and updated locally; the global copy is updated only on completion.
Coarsening: each block processes several chunks of the input. The kernel below combines both techniques.
__global__ void histogram_kernel(unsigned int *histo, unsigned int *input, unsigned int input_size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;  // Thread index
    int stride = blockDim.x * gridDim.x;              // Total number of threads
    __shared__ unsigned int histo_s[BINS];            // Private per-block sub-histogram

    // Sub-histogram initialization
    for (int i = threadIdx.x; i < BINS; i += blockDim.x) {
        histo_s[i] = 0;
    }
    __syncthreads(); // Intra-block synchronization

    // Main loop to compute per-block sub-histograms
    for (int i = tid; i < input_size; i += stride) {
        unsigned int val = input[i];  // Global memory read (coalesced)
        atomicAdd(&histo_s[val], 1);  // Atomic addition in shared memory
    }
    __syncthreads(); // Intra-block synchronization

    // Merge per-block sub-histograms and write to global memory
    for (int i = threadIdx.x; i < BINS; i += blockDim.x) {
        // Atomic addition in global memory
        atomicAdd(histo + i, histo_s[i]);
    }
}
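A hedged launch sketch for the privatized kernel: BINS must match the histogram size, d_histo and d_input are assumed device pointers, and the fixed grid of 64 blocks is an arbitrary coarsening choice, so each block loops over several chunks of the input.

#define BINS 256                      // e.g., one bin per byte value
unsigned int numBlocks = 64;          // fewer blocks than chunks => coarsening
unsigned int blockSize = 256;
cudaMemset(d_histo, 0, BINS * sizeof(unsigned int));
histogram_kernel<<<numBlocks, blockSize>>>(d_histo, d_input, input_size);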
Convolution applies a filter (also called a mask or kernel) on each element of the input (e.g., a signal, an image, a frame) to obtain a new value, which is a weighted sum of a set of neighboring input elements.
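For example, with input N = [1, 2, 3, 4, 5] and a 3-element mask M = [0.25, 0.5, 0.25] (values chosen only for illustration):
P[2] = 0.25 · N[1] + 0.5 · N[2] + 0.25 · N[3] = 0.25 · 2 + 0.5 · 3 + 0.25 · 4 = 3.0
At the boundaries, missing neighbors ("ghost cells") are treated as 0, e.g. P[0] = 0.25 · 0 + 0.5 · 1 + 0.25 · 2 = 1.0.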
__global__ void convolution_1D_basic_kernel(float *N, float *M, float *P,
                                            int Mask_Width, int Width) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // Index of output element
    float Pvalue = 0;
    int N_start_point = i - (Mask_Width/2);         // Index of first neighbor
    for (int j = 0; j < Mask_Width; j++) {
        // Check the boundaries
        if (N_start_point + j >= 0 && N_start_point + j < Width) {
            Pvalue += N[N_start_point + j] * M[j];  // Multiply and accumulate
        }
    }
    P[i] = Pvalue;  // Store output element
}
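A hedged launch sketch for the 1D convolution kernel (d_N, d_M, d_P are assumed device pointers). The kernel writes P[i] without checking i < Width, so the sketch assumes Width is a multiple of the block size; otherwise a guard on i would be needed.

// Illustrative launch: one thread per output element.
int blockSize = 256;
int gridSize = Width / blockSize;  // assumes Width is a multiple of blockSize
convolution_1D_basic_kernel<<<gridSize, blockSize>>>(d_N, d_M, d_P, Mask_Width, Width);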