3-computation

Recap
● Write a CUDA code corresponding to the following sequential C code.
  Note that there is no loop here.

Sequential C code:
#include <stdio.h>
#define N 100
int main() {
    int i;
    for (i = 0; i < N; ++i)
        printf("%d\n", i * i);
    return 0;
}

CUDA code:
#include <stdio.h>
#include <cuda.h>
#define N 100
__global__ void fun() {
    printf("%d\n", threadIdx.x * threadIdx.x);
}
int main() {
    fun<<<1, N>>>();
    cudaDeviceSynchronize();
    return 0;
}
2
Classwork
●
Write a CUDA code corresponding to the
following sequential C code.
#include <stdio.h>
#define N 100
/* Fill a local array with the squares 0*0 .. (N-1)*(N-1), then exit.
   (Sequential reference version for the CUDA classwork.) */
int main() {
    int a[N];
    int i = 0;
    while (i < N) {
        a[i] = i * i;
        ++i;
    }
    return 0;
}
3
Classwork
●
Write a CUDA code corresponding to the
following sequential C code.
Sequential C code:
#include <stdio.h>
#define N 100
int main() {
    int a[N], i;
    for (i = 0; i < N; ++i)
        a[i] = i * i;
    return 0;
}

CUDA code:
#include <stdio.h>
#include <cuda.h>
#define N 100
__global__ void fun(int *a) {
    a[threadIdx.x] = threadIdx.x * threadIdx.x;
}
int main() {
    int a[N], *da;
    int i;
    cudaMalloc(&da, N * sizeof(int));
    fun<<<1, N>>>(da);
    cudaMemcpy(a, da, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (i = 0; i < N; ++i)
        printf("%d\n", a[i]);
    return 0;
}
4
Classwork
●
Write a CUDA code corresponding to the
following sequential C code.
Sequential C code:
#include <stdio.h>
#define N 100
int main() {
    int a[N], i;
    for (i = 0; i < N; ++i)
        a[i] = i * i;
    return 0;
}

CUDA code:
#include <stdio.h>
#include <cuda.h>
#define N 100
__global__ void fun(int *a) {
    a[threadIdx.x] = threadIdx.x * threadIdx.x;
}
int main() {
    int a[N], *da;
    int i;
    cudaMalloc(&da, N * sizeof(int));
    fun<<<1, N>>>(da);
    cudaMemcpy(a, da, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (i = 0; i < N; ++i)
        printf("%d\n", a[i]);
    return 0;
}

Observation: No cudaDeviceSynchronize required — the blocking
cudaMemcpy already waits for the kernel to finish.
5
Hello World with a Global.
#include <stdio.h>
#include <cuda.h>

/* Host-side message; this global lives in CPU memory. */
const char *msg = "Hello World.\n";

/* Empty kernel: defined here but never launched in this example. */
__global__ void dkernel() {
    // no-op
}

int main() {
    /* Fix: never pass a variable as the printf format string
       (-Wformat-security hazard if the string ever contains '%');
       print it as data via "%s" instead. Output is unchanged. */
    printf("%s", msg);
    return 0;
}
6
GPU Hello World with a Global.
// GPU Hello World using a file-scope (host) global.
#include <stdio.h>
#include <cuda.h>
// NOTE(review): msg is allocated in CPU memory.
const char *msg = "Hello World.\n";
__global__ void dkernel() {
// BUG (intentional teaching example — see the later "Takeaway" and
// "Separate Memories" slides): a kernel cannot dereference this
// CPU-memory pointer; CPU and discrete-GPU memories are separate.
printf(msg);
}
int main() {
// One block of 32 threads; every thread executes the kernel body.
dkernel<<<1, 32>>>();
// Block the host until the kernel finishes (also flushes device printf).
cudaDeviceSynchronize();
return 0;
}
7
GPU Hello World with a Global.
// GPU Hello World, fixed: use a macro instead of a host global, so the
// preprocessor substitutes the string literal directly into the kernel.
// (CPU and GPU memories are separate; a kernel must not dereference a
// pointer to CPU memory, which the previous version did.)
#include <stdio.h>
#include <cuda.h>
#define msg "Hello World.\n"
__global__ void dkernel() {
    printf(msg);
}
int main() {
    // One block of 32 threads; each thread prints the message once.
    dkernel<<<1, 32>>>();
    // Wait for the kernel so its printf output is flushed before exit.
    cudaDeviceSynchronize();
    return 0;
}
10
GPU Hello World with a Global.
#include <stdio.h>
#include <cuda.h>
#define msg "Hello World.\n"
__global__ void dkernel() {
    printf(msg);
}
int main() {
    dkernel<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}

Takeaway: CPU and GPU memories are separate (for discrete GPUs).
#define msg "Hello World.\n" is okay, because the preprocessor
substitutes the string literal into the kernel at compile time.
Compile: nvcc hello.cu
Run: ./a.out
Hello World.
Hello World.
...
11
Separate Memories
D R A M D R A M
PCI Express
Bus
CPU GPU
12
Separate Memories
D R A M D R A M
PCI Express
Bus
CPU GPU
●
CPU and its associated (discrete) GPUs have
separate physical memory (RAM).
●
A variable in CPU memory cannot be accessed
directly in a GPU kernel.
13
Separate Memories
D R A M D R A M
PCI Express
Bus
CPU GPU
●
CPU and its associated (discrete) GPUs have
separate physical memory (RAM).
●
A variable in CPU memory cannot be accessed
directly in a GPU kernel.
●
A programmer needs to maintain copies of variables.
●
It is the programmer's responsibility to keep them in sync. 14
Typical CUDA Program Flow
CPU
CPU GPU
GPU
Load data
into CPU 1
memory.
File
System
15
Typical CUDA Program Flow
Copy data from CPU
to GPU memory.
CPU
CPU GPU
GPU
Load data
into CPU 1
memory.
File
System
16
Typical CUDA Program Flow
Copy data from CPU
to GPU memory.
Execute
2 3 GPU
kernel.
CPU
CPU GPU
GPU
Load data
into CPU 1
memory.
File
System
17
Typical CUDA Program Flow
Copy data from CPU
to GPU memory.
Execute
2 3 GPU
kernel.
CPU
CPU GPU
GPU
Load data 4
into CPU 1
memory. Copy results from
GPU to CPU memory.
File
System
18
Typical CUDA Program Flow
Copy data from CPU
to GPU memory.
Execute
Use 5 2 3 GPU
results on kernel.
CPU.
CPU
CPU GPU
GPU
Load data 4
into CPU 1
memory. Copy results from
GPU to CPU memory.
File
System
19
Typical CUDA Program Flow
1 Load data into CPU memory.
- fread / rand
2 Copy data from CPU to GPU memory.
- cudaMemcpy(..., cudaMemcpyHostToDevice)
3 Call GPU kernel.
- mykernel<<<x, y>>>(...)
4 Copy results from GPU to CPU memory.
- cudaMemcpy(..., cudaMemcpyDeviceToHost)
5 Use results on CPU. 20
Typical CUDA Program Flow
int main() {
    /* Host buffer holding an encoded message: "Hello World." with
       every byte shifted down by one. gpuarr is reserved for the
       device-side copy; nothing is allocated or launched yet. */
    char cpuarr[] = "Gdkkn\x1fVnqkc-";
    char *gpuarr;
    return 0;
}
CPU-GPU Communication
#include <stdio.h>
#include <cuda.h>
// Increment each byte of arr in place, one thread per element
// (e.g. turns "Gdkkn\x1fVnqkc-" into "Hello World.").
// Indexing uses threadIdx.x only, so a single-block launch is assumed;
// threads with an index past arrlen do nothing.
__global__ void dkernel(char *arr, int arrlen) {
    unsigned tid = threadIdx.x;
    if (tid >= (unsigned)arrlen)
        return;
    arr[tid] += 1;
}
int main() {
    /* cpuarr is "Hello World." with each byte decremented by one;
       incrementing every byte — what dkernel (above) does — would
       restore it, but the kernel is not launched in this snippet.
       gpuarr is declared for the device copy, unused so far. */
    char cpuarr[] = "Gdkkn\x1fVnqkc-";
    char *gpuarr;
    return 0;
}
Classwork
1. Write a CUDA program to initialize an array of
size 32 to all zeros in parallel.
24
Classwork
1. Write a CUDA program to initialize an array of
size 32 to all zeros in parallel.
2. Change the array size to 1024.
25
Classwork
1. Write a CUDA program to initialize an array of
size 32 to all zeros in parallel.
2. Change the array size to 1024.
3. Create another kernel that adds i to array[i].
26
Classwork
1. Write a CUDA program to initialize an array of
size 32 to all zeros in parallel.
2. Change the array size to 1024.
3. Create another kernel that adds i to array[i].
4. Change the array size to 8000.
5. Check if answer to problem 3 still works.
27
Homework (z = x² + y³)
●
Read a sequence of integers from a file.
●
Square each number.
●
Read another sequence of integers from
another file.
●
Cube each number.
●
Sum the two sequences element-wise, store in
the third sequence.
●
Print the computed sequence.
28