! Read/write per-thread: registers and local memory
! Read/write per-block: shared memory
! Read/write per-grid: global memory
! Read-only per-grid: constant memory
[Figure: each thread has private registers and local memory; each block has its own shared memory; the host reads and writes global and constant memory]
CUDA Variable Type Qualifiers
Variable declaration            Memory    Scope   Lifetime
------------------------------  --------  ------  -----------
int var;                        register  thread  thread
int array_var[10];              local     thread  thread
__shared__ int shared_var;      shared    block   block
__device__ int global_var;      global    grid    application
__constant__ int constant_var;  constant  grid    application
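A minimal sketch putting each qualifier in context (the kernel name and the data flow are illustrative, not from the lecture):

__device__   int global_var;        // global memory, grid scope, application lifetime
__constant__ int constant_var;      // constant memory, grid scope, application lifetime

__global__ void qualifier_demo(int *output)
{
  int var;                          // register: private to each thread
  int array_var[10];                // local memory: private to each thread
  __shared__ int shared_var;        // shared memory: one copy per block

  var = constant_var;               // every thread reads the same constant
  array_var[threadIdx.x % 10] = var;
  if(threadIdx.x == 0)
    shared_var = global_var;        // one thread writes the per-block copy
  __syncthreads();                  // make shared_var visible to the whole block
  output[threadIdx.x] = shared_var + array_var[threadIdx.x % 10];
}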
// i is this thread's global index: i = blockDim.x*blockIdx.x + threadIdx.x
if(i > 0)
{
  // each thread loads two elements from global memory
  int x_i = input[i];
  int x_i_minus_one = input[i-1];
  // ... and computes result[i] = x_i - x_i_minus_one
}

! What are the bandwidth requirements of this kernel? Two loads per thread.
! How many times does this kernel load input[i]? Twice: once by thread i
  (as x_i), and again by thread i+1 (as x_i_minus_one).
! Idea: eliminate the redundancy by sharing loaded data between threads.
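One way to realize this idea is to stage each block's elements in __shared__ memory so a thread's neighbor reloads nothing; a sketch following the lecture's adjacent_difference example (BLOCK_SIZE is an assumed compile-time block size):

#define BLOCK_SIZE 256

__global__ void adj_diff(int *result, int *input)
{
  __shared__ int s_data[BLOCK_SIZE];      // one element per thread in the block

  int tx = threadIdx.x;
  int i  = blockDim.x*blockIdx.x + tx;

  // each thread loads exactly one element into __shared__ memory
  s_data[tx] = input[i];
  __syncthreads();                        // wait until the whole tile is loaded

  if(tx > 0)
    result[i] = s_data[tx] - s_data[tx-1];  // neighbor is in __shared__
  else if(i > 0)
    result[i] = s_data[tx] - input[i-1];    // neighbor lives in the previous block
}

Each element of input is now loaded from global memory once per block instead of twice.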
__device__ int my_global_variable;

__global__ void foo(int *input)
{
  __shared__ int my_shared_variable;

  int *ptr = 0;
  if(input[threadIdx.x] % 2)
    ptr = &my_global_variable;
  else
    ptr = &my_shared_variable;

  // where does ptr point?
}
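The answer is data-dependent, so the compiler cannot resolve the memory space statically. On devices with generic addressing (sm_20 and later) the hardware resolves each access at run time, and CUDA's address-space predicates can inspect a pointer; a minimal sketch reusing the variables above:

__global__ void inspect(int *input)
{
  __shared__ int my_shared_variable;

  int *ptr = (input[threadIdx.x] % 2) ? &my_global_variable
                                      : &my_shared_variable;

  // __isGlobal()/__isShared() report which space ptr refers to at run time
  if(__isGlobal(ptr))
    *ptr += 1;   // ptr aliases my_global_variable
  else
    *ptr -= 1;   // ptr aliases my_shared_variable
}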
! Question: how do we combine the per-block partial sums
  Σ_0, Σ_1, ..., Σ_i, ... into a single total?
Hierarchical Atomics
__global__ void sum(int *input, int *result)
{
  __shared__ int partial_sum;
  int i = blockDim.x*blockIdx.x + threadIdx.x;
  if(threadIdx.x == 0)
    partial_sum = 0;    // thread 0 initializes the per-block partial sum
  __syncthreads();
  // each thread updates the partial sum
  atomicAdd(&partial_sum, input[i]);
  __syncthreads();
  if(threadIdx.x == 0)
    atomicAdd(result, partial_sum);  // fold the block's sum into *result
}
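A hypothetical host-side launch (buffer names and sizes are illustrative, and h_input is an assumed host array); *result must start at zero because every block accumulates into it:

const int N = 1024, BLOCK_SIZE = 256;   // assumes N % BLOCK_SIZE == 0

int *d_input, *d_result;
cudaMalloc(&d_input,  N * sizeof(int));
cudaMalloc(&d_result, sizeof(int));
cudaMemset(d_result, 0, sizeof(int));   // the running total starts at zero
cudaMemcpy(d_input, h_input, N * sizeof(int), cudaMemcpyHostToDevice);

// one atomicAdd per thread on the block's partial sum,
// then one atomicAdd per block on the global result
sum<<<N / BLOCK_SIZE, BLOCK_SIZE>>>(d_input, d_result);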
! Generalize the adjacent_difference example: matrix multiplication
! AB = A * B
! Each element AB_ij = dot(row(A,i), col(B,j))
! Parallelization strategy
  ! Thread → AB_ij
  ! 2D kernel
First Implementation
__global__ void mat_mul(float *a, float *b,
                        float *ab, int width)
{
  // calculate the row & col index of the element
  int row = blockIdx.y*blockDim.y + threadIdx.y;
  int col = blockIdx.x*blockDim.x + threadIdx.x;

  // dot product between row(a,row) and col(b,col)
  float result = 0;
  for(int k = 0; k < width; ++k)
    result += a[row*width + k] * b[k*width + col];

  ab[row*width+col] = result;
}
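For context, a hypothetical 2D launch for this kernel (d_a, d_b, d_ab are assumed device buffers, and width is assumed to be a multiple of TILE_WIDTH):

dim3 block(TILE_WIDTH, TILE_WIDTH);               // one thread per element of ab
dim3 grid(width/TILE_WIDTH, width/TILE_WIDTH);
mat_mul<<<grid, block>>>(d_a, d_b, d_ab, width);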
How will this perform? Each element of a and b is fetched from global memory width times.
Tiled Multiply
! Partition the dot products into phases over TILE_WIDTH x TILE_WIDTH tiles
! Each thread still accumulates its element in a register: float result = 0;
[Figure: the width x width matrices partitioned into TILE_WIDTH x TILE_WIDTH tiles]
Better Implementation
// loop over the tiles of the input in phases
for(int p = 0; p < width/TILE_WIDTH; ++p)
{
  // collaboratively load one tile of a and one tile of b into __shared__
  s_a[ty][tx] = a[row*width + (p*TILE_WIDTH + tx)];
  s_b[ty][tx] = b[(p*TILE_WIDTH + ty)*width + col];
  __syncthreads();

  // dot product between the row of s_a and the column of s_b
  for(int k = 0; k < TILE_WIDTH; ++k)
    result += s_a[ty][k] * s_b[k][tx];
  __syncthreads();
}
ab[row*width+col] = result;
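For reference, a complete version with the declarations the fragment assumes; a sketch, assuming square matrices with width a multiple of TILE_WIDTH:

#define TILE_WIDTH 16

__global__ void mat_mul(float *a, float *b, float *ab, int width)
{
  // tiles of a and b staged in __shared__ memory
  __shared__ float s_a[TILE_WIDTH][TILE_WIDTH];
  __shared__ float s_b[TILE_WIDTH][TILE_WIDTH];

  int tx = threadIdx.x, ty = threadIdx.y;
  int row = blockIdx.y*blockDim.y + ty;
  int col = blockIdx.x*blockDim.x + tx;
  float result = 0;

  for(int p = 0; p < width/TILE_WIDTH; ++p)
  {
    // each thread loads one element of each tile
    s_a[ty][tx] = a[row*width + (p*TILE_WIDTH + tx)];
    s_b[ty][tx] = b[(p*TILE_WIDTH + ty)*width + col];
    __syncthreads();

    // accumulate this tile's contribution to the dot product
    for(int k = 0; k < TILE_WIDTH; ++k)
      result += s_a[ty][k] * s_b[k][tx];
    __syncthreads();
  }

  ab[row*width+col] = result;
}

Each element of a and b is now loaded from global memory width/TILE_WIDTH times instead of width times.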
Use of Barriers in mat_mul
! The first __syncthreads() ensures every element of the tile has been loaded into __shared__ memory before any thread reads it
! The second __syncthreads() ensures every thread has finished reading the tile before the next phase overwrites it