0% found this document useful (0 votes)
12 views28 pages

3-computation

The document provides a series of CUDA programming exercises and examples, including converting sequential C code to CUDA, handling memory between CPU and GPU, and executing kernels. It emphasizes the need for separate memory management for CPU and GPU, and outlines typical CUDA program flow. Additionally, it includes classwork and homework assignments to reinforce the concepts learned.

Uploaded by

webbstu1
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views28 pages

3-computation

The document provides a series of CUDA programming exercises and examples, including converting sequential C code to CUDA, handling memory between CPU and GPU, and executing kernels. It emphasizes the need for separate memory management for CPU and GPU, and outlines typical CUDA program flow. Additionally, it includes classwork and homework assignments to reinforce the concepts learned.

Uploaded by

webbstu1
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 28

CUDA Programming

Recap

Write a CUDA code corresponding to the
following sequential C code.
Note that there is no loop here.
#include <cuda.h>
#include <stdio.h>
#define N 100
#define N 100 __global__ void fun() {
int main() { printf("%d\n", threadIdx.x *
int i; threadIdx.x);
for (i = 0; i < N; ++i) }
printf("%d\n", i * i); int main() {
return 0; fun<<<1, N>>>();
} cudaDeviceSynchronize();
return 0;
2
}
Classwork

Write a CUDA code corresponding to the
following sequential C code.
#include <stdio.h>
#define N 100
int main() {
int a[N], i;
for (i = 0; i < N; ++i)
a[i] = i * i;
return 0;
}

3
Classwork

Write a CUDA code corresponding to the
following sequential C code.
#include <stdio.h>
#include <stdio.h> #include <cuda.h>
#define N 100
#define N 100 __global__ void fun(int *a) {
int main() { a[threadIdx.x] = threadIdx.x * threadIdx.x;
}
int a[N], i; int main() {
int a[N], *da;
for (i = 0; i < N; ++i) int i;
a[i] = i * i; cudaMalloc(&da, N * sizeof(int));
fun<<<1, N>>>(da);
return 0; cudaMemcpy(a, da, N * sizeof(int),
} cudaMemcpyDeviceToHost);
for (i = 0; i < N; ++i)
printf("%d\n", a[i]);
return 0; 4
}
Classwork

Write a CUDA code corresponding to the
following sequential C code.
#include <stdio.h>
#include <stdio.h> #include <cuda.h>
#define N 100
#define N 100 __global__ void fun(int *a) {
int main() { a[threadIdx.x] = threadIdx.x * threadIdx.x;
}
int a[N], i; int main() {
int a[N], *da;
for (i = 0; i < N; ++i) int i;
a[i] = i * i; cudaMalloc(&da, N * sizeof(int));
fun<<<1, N>>>(da);
return 0; cudaMemcpy(a, da, N * sizeof(int),
} cudaMemcpyDeviceToHost);
for (i = 0; i < N; ++i)
printf("%d\n", a[i]);
Observation return 0; 5
No cudaDeviceSynchronize required. }
Hello World with a Global.
#include <stdio.h>
#include <cuda.h>
const char *msg = "Hello World.\n";
__global__ void dkernel() {
// no-op
}
int main() {
printf(msg);
return 0;
}

6
GPU Hello World with a Global.
#include <stdio.h>
#include <cuda.h>
const char *msg = "Hello World.\n";
__global__ void dkernel() {
printf(msg);
}
int main() {
dkernel<<<1, 32>>>();
cudaDeviceSynchronize();
return 0;
}

7
GPU Hello World with a Global.
#include <stdio.h>
#include <cuda.h>
const char *msg = "Hello World.\n";
__global__ void dkernel() {
printf(msg);
}
int main() {
dkernel<<<1, 32>>>();
cudaDeviceSynchronize();
return 0;
}

Compile: nvcc hello.cu 8


error: identifier "msg" is undefined in device code
GPU Hello World with a Global.
#include <stdio.h>
#include <cuda.h>
Takeaway
const char *msg = "Hello World.\n";
__global__ void dkernel() { CPU and GPU
printf(msg); memories are
separate
} (for discrete GPUs).
int main() {
dkernel<<<1, 32>>>();
cudaDeviceSynchronize();
return 0;
}

Compile: nvcc hello.cu 9


error: identifier "msg" is undefined in device code
GPU Hello World with a Global.
#include <stdio.h>
#include <cuda.h>
Takeaway
#define msg "Hello World.\n"
__global__ void dkernel() { CPU and GPU
printf(msg); memories are
separate
} (for discrete GPUs).
int main() {
dkernel<<<1, 32>>>();
cudaDeviceSynchronize();
return 0;
}

10
GPU Hello World with a Global.
#include <stdio.h>
#include <cuda.h>
Takeaway
#define msg "Hello World.\n"
__global__ void dkernel() { CPU and GPU
printf(msg); memories are
separate
} (for discrete GPUs).
int main() {
dkernel<<<1, 32>>>();
#define msg "Hello World.\n"
cudaDeviceSynchronize();
is okay.
return 0;
} Compile: nvcc hello.cu
Run: ./a.out
Hello World.
11
Hello World.
...
Separate Memories
D R A M D R A M

PCI Express
Bus
CPU GPU

12
Separate Memories
D R A M D R A M

PCI Express
Bus
CPU GPU


CPU and its associated (discrete) GPUs have
separate physical memory (RAM).

A variable in CPU memory cannot be accessed
directly in a GPU kernel.

13
Separate Memories
D R A M D R A M

PCI Express
Bus
CPU GPU


CPU and its associated (discrete) GPUs have
separate physical memory (RAM).

A variable in CPU memory cannot be accessed
directly in a GPU kernel.

A programmer needs to maintain copies of variables.

It is the programmer's responsibility to keep them in sync. 14
Typical CUDA Program Flow

CPU
CPU GPU
GPU

Load data
into CPU 1
memory.

File
System

15
Typical CUDA Program Flow
Copy data from CPU
to GPU memory.

CPU
CPU GPU
GPU

Load data
into CPU 1
memory.

File
System

16
Typical CUDA Program Flow
Copy data from CPU
to GPU memory.
Execute
2 3 GPU
kernel.

CPU
CPU GPU
GPU

Load data
into CPU 1
memory.

File
System

17
Typical CUDA Program Flow
Copy data from CPU
to GPU memory.
Execute
2 3 GPU
kernel.

CPU
CPU GPU
GPU

Load data 4
into CPU 1
memory. Copy results from
GPU to CPU memory.
File
System

18
Typical CUDA Program Flow
Copy data from CPU
to GPU memory.
Execute
Use 5 2 3 GPU
results on kernel.
CPU.
CPU
CPU GPU
GPU

Load data 4
into CPU 1
memory. Copy results from
GPU to CPU memory.
File
System

19
Typical CUDA Program Flow
1 Load data into CPU memory.
- fread / rand
2 Copy data from CPU to GPU memory.
- cudaMemcpy(..., cudaMemcpyHostToDevice)
3 Call GPU kernel.
- mykernel<<<x, y>>>(...)
4 Copy results from GPU to CPU memory.
- cudaMemcpy(..., cudaMemcpyDeviceToHost)
5 Use results on CPU. 20
Typical CUDA Program Flow

2 Copy data from CPU to GPU memory.


- cudaMemcpy(..., cudaMemcpyHostToDevice)

This means we need two copies of the same


variable – one on CPU another on GPU.
e.g., int *cpuarr, *gpuarr;
Matrix cpumat, gpumat;
21
Graph cpug, gpug;
CPU-GPU Communication
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel(char *arr, int arrlen) {
unsigned id = threadIdx.x;
if (id < arrlen) {
++arr[id];
}
}

int main() {
char cpuarr[] = "Gdkkn\x1fVnqkc-",
*gpuarr;

cudaMalloc(&gpuarr, sizeof(char) * (1 + strlen(cpuarr)));


cudaMemcpy(gpuarr, cpuarr, sizeof(char) * (1 + strlen(cpuarr)), cudaMemcpyHostToDevice);
dkernel<<<1, 32>>>(gpuarr, strlen(cpuarr) + 1 );
cudaDeviceSynchronize(); // unnecessary, but okay.
cudaMemcpy(cpuarr, gpuarr, sizeof(char) * (1 + strlen(cpuarr)), cudaMemcpyDeviceToHost);
printf(cpuarr);

return 0;
} 22
CPU-GPU Communication
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel(char *arr, int arrlen) {
unsigned id = threadIdx.x;
if (id < arrlen) {
++arr[id];
}
}

int main() {
char cpuarr[] = "Gdkkn\x1fVnqkc-",
*gpuarr;

cudaMalloc(&gpuarr, sizeof(char) * (1 + strlen(cpuarr)));


cudaMemcpy(gpuarr, cpuarr, sizeof(char) * (1 + strlen(cpuarr)), cudaMemcpyHostToDevice);
dkernel<<<1, 32>>>(gpuarr, strlen(cpuarr));
cudaDeviceSynchronize(); // unnecessary, but okay.
cudaMemcpy(cpuarr, gpuarr, sizeof(char) * (1 + strlen(cpuarr)), cudaMemcpyDeviceToHost);
printf(cpuarr);

return 0;
} 23
Classwork
1. Write a CUDA program to initialize an array of
size 32 to all zeros in parallel.

24
Classwork
1. Write a CUDA program to initialize an array of
size 32 to all zeros in parallel.
2. Change the array size to 1024.

25
Classwork
1. Write a CUDA program to initialize an array of
size 32 to all zeros in parallel.
2. Change the array size to 1024.
3. Create another kernel that adds i to array[i].

26
Classwork
1. Write a CUDA program to initialize an array of
size 32 to all zeros in parallel.
2. Change the array size to 1024.
3. Create another kernel that adds i to array[i].
4. Change the array size to 8000.
5. Check if answer to problem 3 still works.

27
Homework (z = x² + y³)


Read a sequence of integers from a file.

Square each number.

Read another sequence of integers from
another file.

Cube each number.

Sum the two sequences element-wise, store in
the third sequence.

Print the computed sequence.

28

You might also like