GPU computing with CUDA
Lecture 3: CUDA Memories
Hardware Implementation of CUDA Memories

[Figure: memory hierarchy of a grid; each block has its own shared memory, each thread its own registers; the host and all blocks access global and constant memory]

! Each thread can:
  ! Read/write per-thread registers
  ! Read/write per-thread local memory
  ! Read/write per-block shared memory
  ! Read/write per-grid global memory
  ! Read-only per-grid constant memory
CUDA Variable Type Qualifiers
Variable declaration            Memory     Scope    Lifetime
int var;                        register   thread   thread
int array_var[10];              local      thread   thread
__shared__ int shared_var;      shared     block    block
__device__ int global_var;      global     grid     application
__constant__ int constant_var;  constant   grid     application
! automatic scalar variables without a qualifier reside in a register
  ! the compiler will spill them to thread-local memory if it runs out of registers
! automatic array variables without a qualifier reside in thread-local memory
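A minimal sketch (not from the slides) putting each qualifier in context; the variable and kernel names here are illustrative only:

__device__   int global_var;           // global memory, application lifetime
__constant__ int constant_var = 13;    // constant memory, read-only in kernels

__global__ void qualifiers_demo(int *out)
{
  int var;                             // automatic scalar: lives in a register
  int array_var[10];                   // automatic array: thread-local memory
  __shared__ int shared_var;           // one copy per block, in shared memory

  var = constant_var;                  // read per-grid constant memory
  array_var[threadIdx.x % 10] = var;
  if(threadIdx.x == 0)
    shared_var = global_var;           // read per-grid global memory
  __syncthreads();
  out[threadIdx.x] = shared_var + array_var[threadIdx.x % 10];
}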
CUDA Variable Type Performance
Variable declaration            Memory     Penalty
int var;                        register   1x
int array_var[10];              local      100x
__shared__ int shared_var;      shared     1x
__device__ int global_var;      global     100x
__constant__ int constant_var;  constant   1x
! scalar variables reside in fast, on-chip registers
! shared variables reside in fast, on-chip memories
! thread-local arrays & global variables reside in uncached, off-chip memory
! constant variables reside in cached, off-chip memory
CUDA Variable Type Scale
Variable declaration            Instances   Visibility
int var;                        100,000s    1
int array_var[10];              100,000s    1
__shared__ int shared_var;      100s        100s
__device__ int global_var;      1           100,000s
__constant__ int constant_var;  1           100,000s
! 100,000s of per-thread variables, each R/W by 1 thread
! 100s of shared variables, each R/W by 100s of threads
! 1 global variable, R/W by 100,000s of threads
! 1 constant variable, readable by 100,000s of threads
Where to declare variables?
Can the host access it?

  Yes → declare outside of any function:
    __constant__ int constant_var;
    __device__ int global_var;

  No → declare inside the kernel:
    int var;
    int array_var[10];
    __shared__ int shared_var;
Example – thread-local variables
// motivate per-thread variables with
// Ten Nearest Neighbors application
__global__ void ten_nn(float2 *result, float2 *ps, float2 *qs, size_t num_qs)
{
  // p goes in a register
  float2 p = ps[threadIdx.x];
  // per-thread heap goes in off-chip memory
  float2 heap[10];
  // read through num_qs points, maintaining
  // the nearest 10 qs to p in the heap
  ...
  // write out the contents of heap to result
  ...
}
Example – shared variables
// motivate shared variables with
// Adjacent Difference application:
// compute result[i] = input[i] - input[i-1]
__global__ void adj_diff_naive(int *result, int *input)
{
  // compute this thread's global index
  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
  if(i > 0)
  {
    // each thread loads two elements from global memory
    int x_i = input[i];
    int x_i_minus_one = input[i-1];
    result[i] = x_i - x_i_minus_one;
  }
}
Example – shared variables
// motivate shared variables with
// Adjacent Difference application:
// compute result[i] = input[i] - input[i-1]
__global__ void adj_diff_naive(int *result, int *input)
{
  // compute this thread's global index
  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
  if(i > 0)
  {
    // what are the bandwidth requirements of this kernel?
    int x_i = input[i];
    int x_i_minus_one = input[i-1];   // two loads
    result[i] = x_i - x_i_minus_one;
  }
}
Example – shared variables
// motivate shared variables with
// Adjacent Difference application:
// compute result[i] = input[i] - input[i-1]
__global__ void adj_diff_naive(int *result, int *input)
{
  // compute this thread's global index
  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
  if(i > 0)
  {
    // How many times does this kernel load input[i]?
    int x_i = input[i];             // once by thread i
    int x_i_minus_one = input[i-1]; // again by thread i+1
    result[i] = x_i - x_i_minus_one;
  }
}
Example – shared variables
// motivate shared variables with
// Adjacent Difference application:
// compute result[i] = input[i] - input[i-1]
__global__ void adj_diff_naive(int *result, int *input)
{
  // compute this thread's global index
  unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
  if(i > 0)
  {
    // Idea: eliminate the redundancy by sharing data
    int x_i = input[i];
    int x_i_minus_one = input[i-1];
    result[i] = x_i - x_i_minus_one;
  }
}
Example – shared variables
// optimized version of adjacent difference
__global__ void adj_diff(int *result, int *input)
{
  // shorthand for threadIdx.x
  int tx = threadIdx.x;
  // allocate a __shared__ array, one element per thread
  __shared__ int s_data[BLOCK_SIZE];
  // each thread reads one element into s_data
  unsigned int i = blockDim.x * blockIdx.x + tx;
  s_data[tx] = input[i];
  // avoid race condition: ensure all loads
  // complete before continuing
  __syncthreads();
  ...
}
Example – shared variables
// optimized version of adjacent difference
__global__ void adj_diff(int *result, int *input)
{
  ...
  if(tx > 0)
    result[i] = s_data[tx] - s_data[tx-1];
  else if(i > 0)
  {
    // handle thread block boundary
    result[i] = s_data[tx] - input[i-1];
  }
}
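For reference, a sketch combining the two fragments above into one complete kernel, assuming the launch uses blockDim.x == BLOCK_SIZE and the input length is a multiple of BLOCK_SIZE:

__global__ void adj_diff(int *result, int *input)
{
  int tx = threadIdx.x;
  __shared__ int s_data[BLOCK_SIZE];

  // each thread loads one element into shared memory
  unsigned int i = blockDim.x * blockIdx.x + tx;
  s_data[tx] = input[i];
  __syncthreads();

  if(tx > 0)
    result[i] = s_data[tx] - s_data[tx-1];
  else if(i > 0)
  {
    // first thread in the block reads its left neighbor from global memory
    result[i] = s_data[tx] - input[i-1];
  }
}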
Example – shared variables
// when the size of the array isn't known at compile time...
__global__ void adj_diff(int *result, int *input)
{
  // use extern to indicate a __shared__ array will be
  // allocated dynamically at kernel launch time
  extern __shared__ int s_data[];
  ...
}

// pass the size of the per-block array, in bytes, as the third
// argument to the triple chevrons
adj_diff<<<num_blocks, block_size, block_size * sizeof(int)>>>(r, i);
About Pointers
! Yes, you can use them!
! You can point into any memory space:

__device__ int my_global_variable;
__constant__ int my_constant_variable = 13;

__global__ void foo(void)
{
  __shared__ int my_shared_variable;

  int *ptr_to_global = &my_global_variable;
  const int *ptr_to_constant = &my_constant_variable;
  int *ptr_to_shared = &my_shared_variable;
  ...
  *ptr_to_global = *ptr_to_shared;
}
About Pointers
! The address obtained by taking the address of a
__device__, __shared__ or __constant__ variable
can only be used in device code.
! The address of a __device__ or __constant__
variable obtained through cudaGetSymbolAddress()
can only be used in host code.
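A host-side sketch of the second point, using the runtime call cudaGetSymbolAddress with the __device__ variable from the earlier example; error checking is omitted:

#include <cuda_runtime.h>

__device__ int my_global_variable;

int main(void)
{
  // obtain a device address for the __device__ variable; usable only on the host
  void *dev_ptr = 0;
  cudaGetSymbolAddress(&dev_ptr, my_global_variable);

  // the host can now copy to/from that address...
  int value = 42;
  cudaMemcpy(dev_ptr, &value, sizeof(int), cudaMemcpyHostToDevice);

  // ...or pass it to a kernel as an ordinary pointer argument
  return 0;
}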
Don't confuse the compiler!

__device__ int my_global_variable;

__global__ void foo(int *input)
{
  __shared__ int my_shared_variable;

  int *ptr = 0;
  if(input[threadIdx.x] % 2)
    ptr = &my_global_variable;
  else
    ptr = &my_shared_variable;
  // where does ptr point?
}
Warning: Cannot tell what pointer points to, assuming global
memory space
Advice
! Prefer dereferencing pointers in simple, regular
access patterns
! Avoid propagating pointers
! Avoid pointers to pointers
! The GPU would rather not pointer chase
! Linked lists will not perform well
! Pay attention to compiler warning messages
! Warning: Cannot tell what pointer points to,
assuming global memory space
! Crash waiting to happen
A Common Programming Strategy
! Global memory resides in device memory (DRAM)
! Much slower access than shared memory
! Tile data to take advantage of fast shared memory:
  ! Generalize from the adjacent_difference example
  ! Divide and conquer
A Common Programming Strategy
! Partition data into subsets that fit into shared memory
! Handle each data subset with one thread block
! Load the subset from global memory to shared memory, using multiple threads
  to exploit memory-level parallelism
! Perform the computation on the subset from shared memory
! Copy the result from shared memory back to global memory
A Common Programming Strategy
! Carefully partition data according to access patterns:
  ! Read-only → __constant__ memory (fast)
  ! R/W & shared within block → __shared__ memory (fast)
  ! R/W within each thread → registers (fast)
  ! Indexed R/W within each thread → local memory (slow)
  ! R/W inputs/results → cudaMalloc'ed global memory (slow)
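A generic sketch of the tiling strategy above (not from the slides): process() is a placeholder for the per-tile computation, one tile of TILE elements is handled per block, and the launch is assumed to use blockDim.x == TILE:

#define TILE 256

__device__ int process(int x) { return x * 2; }   // placeholder computation

__global__ void tile_kernel(int *result, const int *input)
{
  __shared__ int tile[TILE];

  // 1. cooperatively load one tile from global into shared memory
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  tile[threadIdx.x] = input[i];
  __syncthreads();

  // 2. compute on the tile in fast shared memory
  int r = process(tile[threadIdx.x]);

  // 3. write the result back to global memory
  result[i] = r;
}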
Communication Through Memory
! Question:
__global__ void race(void)
{
  __shared__ int my_shared_variable;
  my_shared_variable = threadIdx.x;

  // what is the value of
  // my_shared_variable?
}
Communication Through Memory
! This is a race condition
! The result is undefined
! The order in which threads access the variable is
undefined without explicit coordination
! Use barriers (e.g., __syncthreads) or atomic
operations (e.g., atomicAdd) to enforce well-defined
semantics
Communication Through Memory
! Use __syncthreads to ensure data is ready for
access
__global__ void share_data(int *input)
{
  __shared__ int data[BLOCK_SIZE];
  data[threadIdx.x] = input[threadIdx.x];
  __syncthreads();
  // the state of the entire data array
  // is now well-defined for all threads
  // in this block
}
Communication Through Memory
! Use atomic operations to ensure exclusive access to
a variable
// assume *result is initialized to 0
__global__ void sum(int *input, int *result)
{
  atomicAdd(result, input[threadIdx.x]);
  // after this kernel exits, the value of
  // *result will be the sum of the input
}
Resource Contention
! Atomic operations aren't cheap!
! They imply serialized access to a variable

__global__ void sum(int *input, int *result)
{
  atomicAdd(result, input[threadIdx.x]);
}

...
// how many threads will contend
// for exclusive access to result?
sum<<<B, N/B>>>(input, result);
Hierarchical Atomics

[Diagram: per-block partial sums Σ0, Σ1, ..., Σi combine into the total sum Σ]

! Divide & Conquer
  ! Per-thread atomicAdd to a __shared__ partial sum
  ! Per-block atomicAdd to the total sum
Hierarchical Atomics
__global__ void sum(int *input, int *result)
{
  __shared__ int partial_sum;

  // thread 0 is responsible for
  // initializing partial_sum
  if(threadIdx.x == 0)
    partial_sum = 0;
  __syncthreads();
  ...
}
Hierarchical Atomics
__global__ void sum(int *input, int *result)
{
  ...
  // each thread updates the partial sum
  atomicAdd(&partial_sum, input[threadIdx.x]);
  __syncthreads();

  // thread 0 updates the total sum
  if(threadIdx.x == 0)
    atomicAdd(result, partial_sum);
}
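Combining the two fragments above, a sketch of the complete kernel; the global index used here (rather than just threadIdx.x) is an assumption needed for the multi-block launch sum<<<B, N/B>>> shown on the contention slide:

// assume *result is initialized to 0 before launch
__global__ void sum(int *input, int *result)
{
  __shared__ int partial_sum;

  // thread 0 initializes the per-block partial sum
  if(threadIdx.x == 0)
    partial_sum = 0;
  __syncthreads();

  // each thread folds its element into the block's partial sum
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
  atomicAdd(&partial_sum, input[i]);
  __syncthreads();

  // thread 0 folds the partial sum into the global total
  if(threadIdx.x == 0)
    atomicAdd(result, partial_sum);
}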
Advice
! Use barriers such as __syncthreads to wait until
__shared__ data is ready
! Prefer barriers to atomics when data access patterns
are regular or predictable
! Prefer atomics to barriers when data access patterns
are sparse or unpredictable
! Atomics to __shared__ variables are much faster
than atomics to global variables
! Don't synchronize or serialize unnecessarily
Matrix Multiplication Example
! Generalize the adjacent_difference example
! AB = A * B
  ! Each element ABij = dot(row(A,i), col(B,j))
! Parallelization strategy
  ! Thread → ABij
  ! 2D kernel
First Implementation
__global__ void mat_mul(float *a, float *b, float *ab, int width)
{
  // calculate the row & col index of the element
  int row = blockIdx.y*blockDim.y + threadIdx.y;
  int col = blockIdx.x*blockDim.x + threadIdx.x;

  float result = 0;

  // do dot product between row of a and col of b
  for(int k = 0; k < width; ++k)
    result += a[row*width+k] * b[k*width+col];

  ab[row*width+col] = result;
}
How will this perform?

Question                                            Answer
How many loads per term of the dot product?         2 floats (a & b) = 8 bytes
How many floating point operations?                 2 (multiply & add)
Global memory access to flop ratio (GMAC)           8 bytes / 2 ops = 4 B/op
Peak fp performance of GeForce GTX 480?             1.35 TFLOPS
Bandwidth required to reach peak fp performance     GMAC * peak FLOPS = 4 * 1.35 = 5.4 TB/s
Actual memory bandwidth of GeForce GTX 480?         177 GB/s
Upper bound on performance of our implementation    actual BW / GMAC = 177 / 4 ≈ 44 GFLOPS
Idea: Use __shared__ memory to reuse global data

! Each input element is read by width threads
! Load each element into __shared__ memory and have several threads
  use the local version to reduce the memory bandwidth
Tiled Multiply

[Figure: both matrices partitioned into TILE_WIDTH x TILE_WIDTH tiles]

! Partition the kernel loop into phases
! Load a tile of both matrices into __shared__ memory each phase
! Each phase, each thread computes a partial result
Better Implementation
__global__ void mat_mul(float *a, float *b, float *ab, int width)
{
  // shorthand
  int tx = threadIdx.x, ty = threadIdx.y;
  int bx = blockIdx.x,  by = blockIdx.y;

  // allocate tiles in __shared__ memory
  __shared__ float s_a[TILE_WIDTH][TILE_WIDTH];
  __shared__ float s_b[TILE_WIDTH][TILE_WIDTH];

  // calculate the row & col index
  int row = by*blockDim.y + ty;
  int col = bx*blockDim.x + tx;

  float result = 0;
Better Implementation
  // loop over the tiles of the input in phases
  for(int p = 0; p < width/TILE_WIDTH; ++p)
  {
    // collaboratively load tiles into __shared__
    s_a[ty][tx] = a[row*width + (p*TILE_WIDTH + tx)];
    s_b[ty][tx] = b[(p*TILE_WIDTH + ty)*width + col];
    __syncthreads();

    // dot product between row of s_a and col of s_b
    for(int k = 0; k < TILE_WIDTH; ++k)
      result += s_a[ty][k] * s_b[k][tx];
    __syncthreads();
  }

  ab[row*width+col] = result;
}
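A possible host-side launch for the tiled kernel, matching the size considerations below; d_a, d_b, d_ab are assumed device allocations and width is assumed to be a multiple of TILE_WIDTH:

#define TILE_WIDTH 16

// hypothetical host-side helper: launch the tiled kernel on device pointers
void launch_mat_mul(float *d_a, float *d_b, float *d_ab, int width)
{
  // one thread per output element; one TILE_WIDTH x TILE_WIDTH block per output tile
  dim3 block(TILE_WIDTH, TILE_WIDTH);
  dim3 grid(width / TILE_WIDTH, width / TILE_WIDTH);
  mat_mul<<<grid, block>>>(d_a, d_b, d_ab, width);
}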
Use of Barriers in mat_mul
! Two barriers per phase:
! __syncthreads after all data is loaded into __shared__
memory
! __syncthreads after all data is read from __shared__
memory
! Note that the second __syncthreads in phase p guards the load in phase p+1
! Use barriers to guard data:
  ! Guard against using uninitialized data
  ! Guard against clobbering live data
First Order Size Considerations
! Each thread block should have many threads
  ! TILE_WIDTH = 16 → 16*16 = 256 threads per block
! There should be many thread blocks
  ! 1024*1024 matrices → 64*64 = 4096 thread blocks
  ! TILE_WIDTH = 16 → gives each SM 4 blocks, 1024 threads
  ! Full occupancy
! Each thread block performs 2 * 256 = 512 loads of 4 B each
  for 256 * (2 * 16) = 8,192 fp ops (0.25 B/op)
  ! Compare to 4 B/op for the naive implementation
TILE_SIZE Effects

[Chart: measured performance for different TILE_SIZE values]
Memory Resources as Limit to Parallelism

Resource            Per GTX 480 SM    For full occupancy on GTX 480
Registers           32768             <= 32768 / 1024 threads = 32 per thread
__shared__ memory   48 KB             <= 48 KB / 8 blocks = 6 KB per block
! Effective use of the different memory resources reduces the number of
  accesses to global memory
! These resources are finite!
! The more memory locations each thread requires → the fewer threads an SM
  can accommodate
Final Thoughts
! Effective use of CUDA memory hierarchy decreases
bandwidth consumption to increase throughput
! Use __shared__ memory to eliminate redundant
loads from global memory
! Use __syncthreads barriers to protect __shared__ data
! Use atomics if access patterns are sparse or unpredictable
! Optimization comes with a development cost
! Memory resources ultimately limit parallelism