Doug Miles, PGI Compilers & Tools, NVIDIA
High Performance Computing Advisory Council
February 21, 2018
ACCELERATING HPC APPLICATIONS
ON NVIDIA GPUS WITH OPENACC
2
PGI — THE NVIDIA HPC SDK
Fortran, C & C++ Compilers
Optimizing, SIMD Vectorizing, OpenMP
Accelerated Computing Features
CUDA Fortran, OpenACC Directives
Multi-Platform Solution
X86-64 and OpenPOWER Multicore CPUs
NVIDIA Tesla GPUs
Supported on Linux, macOS, Windows
MPI/OpenMP/OpenACC Tools
Debugger
Performance Profiler
Interoperable with DDT, TotalView
3
Programming GPU-Accelerated Systems
Separate CPU System and GPU Memories
[Diagram: GPU developer view with System Memory and GPU Memory connected by PCIe]
4
Programming GPU-Accelerated Systems
Separate CPU System and GPU Memories
[Diagram: GPU developer view with System Memory and GPU Memory connected by NVLink]
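With physically separate memories, the developer allocates a device copy and moves data across the interconnect explicitly. A minimal CUDA Fortran sketch of that pattern (array names are illustrative; the next slide shows a complete kernel built the same way):

    real, allocatable         :: a(:)      ! host array in System Memory
    real, device, allocatable :: a_dev(:)  ! device array in GPU Memory
    integer :: n
    n = 1000000
    allocate(a(n), a_dev(n))
    a = 1.0
    a_dev = a        ! host-to-device transfer over PCIe or NVLink
    ! ... launch kernels that read and write a_dev ...
    a = a_dev        ! device-to-host transfer of the results
    deallocate(a_dev, a)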
5
CUDA FORTRAN

Tesla Code
attributes(global) subroutine mm_kernel( A, B, C, N, M, L )
  real :: A(N,M), B(M,L), C(N,L), Cij
  integer, value :: N, M, L
  integer :: i, j, kb, k, tx, ty
  real, shared :: Asub(16,16), Bsub(16,16)
  tx = threadidx%x
  ty = threadidx%y
  i = (blockidx%x-1) * 16 + tx
  j = (blockidx%y-1) * 16 + ty
  Cij = 0.0
  do kb = 1, M, 16
    Asub(tx,ty) = A(i,kb+ty-1)
    Bsub(tx,ty) = B(kb+tx-1,j)
    call syncthreads()
    do k = 1, 16
      Cij = Cij + Asub(tx,k) * Bsub(k,ty)
    enddo
    call syncthreads()
  enddo
  C(i,j) = Cij
end subroutine mm_kernel

CPU Code
real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
. . .
allocate (Adev(N,M), Bdev(M,L), Cdev(N,L))
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
call mm_kernel <<<dim3(N/16,L/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L )
C(1:N,1:L) = Cdev
deallocate ( Adev, Bdev, Cdev )
. . .
6
CUDA FORTRAN
module madd_device_module
  use cudafor
contains
  subroutine madd_dev(a,b,c,sum,n1,n2)
    real, dimension(:,:), device :: a,b,c
    real :: sum
    integer :: n1,n2
    integer :: i,j
    type(dim3) :: grid, block
    !$cuf kernel do (2) <<<(*,*),(32,4)>>>
    do j = 1,n2
      do i = 1,n1
        a(i,j) = b(i,j) + c(i,j)
        sum = sum + a(i,j)
      enddo
    enddo
  end subroutine
end module
Equivalent
hand-written
CUDA kernels
module madd_device_module
use cudafor
implicit none
contains
attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2)
real, dimension(:,:) :: a,b,c
real, dimension(:) :: blocksum
integer, value :: n1,n2
integer :: i,j,tindex,tneighbor,bindex
real :: mysum
real, shared :: bsum(256)
! Do this thread's work
mysum = 0.0
do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y
do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x
a(i,j) = b(i,j) + c(i,j)
mysum = mysum + a(i,j) ! accumulates partial sum per thread
enddo
enddo
! Now add up all partial sums for the whole thread block
! Compute this thread's linear index in the thread block
! We assume 256 threads in the thread block
tindex = threadidx%x + (threadidx%y-1)*blockdim%x
! Store this thread's partial sum in the shared memory block
bsum(tindex) = mysum
call syncthreads()
! Accumulate all the partial sums for this thread block to a single value
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
! Store the partial sum for the thread block
bindex = blockidx%x + (blockidx%y-1)*griddim%x
if( tindex == 1 ) blocksum(bindex) = bsum(1)
end subroutine
! Add up partial sums for all thread blocks to a single cumulative sum
attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb)
real, dimension(:) :: blocksum
real :: dsum
integer, value :: nb
real, shared :: bsum(256)
integer :: tindex,tneighbor,i
! Again, we assume 256 threads in the thread block
! accumulate a partial sum for each thread
tindex = threadidx%x
bsum(tindex) = 0.0
do i = tindex, nb, blockdim%x
bsum(tindex) = bsum(tindex) + blocksum(i)
enddo
call syncthreads()
! This code is copied from the previous kernel
! Accumulate all the partial sums for this thread block to a single value
! Since there is only one thread block, this single value is the final result
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
if( tindex == 1 ) dsum = bsum(1)
end subroutine
subroutine madd_dev(a,b,c,dsum,n1,n2)
real, dimension(:,:), device :: a,b,c
real, device :: dsum
real, dimension(:), allocatable, device :: blocksum
integer :: n1,n2,nb
type(dim3) :: grid, block
integer :: r
! Compute grid/block size; block size must be 256 threads
grid = dim3((n1+31)/32, (n2+7)/8, 1)
block = dim3(32,8,1)
nb = grid%x * grid%y
allocate(blocksum(1:nb))
call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2)
call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb)
r = cudaThreadSynchronize() ! don't deallocate too early
deallocate(blocksum)
end subroutine
!$CUF KERNEL Directives
7
OpenACC Directives
Manage
Data
Movement
Initiate
Parallel
Execution
Optimize
Loop
Mappings
#pragma acc data copyin(a,b) copyout(c)
{
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
}
CPU, GPU, Manycore
Performance portable
Interoperable
Single source
Incremental
8
[Diagram: System Memory and GPU Memory; the device copies of a and b stay resident in GPU Memory for the duration of the data region]
...
#pragma acc data copy(b[0:n][0:m]) create(a[0:n][0:m])
{
for (iter = 1; iter <= p; ++iter){
#pragma acc parallel loop
for (i = 1; i < n-1; ++i){
for (j = 1; j < m-1; ++j){
a[i][j]=w0*b[i][j]+
w1*(b[i-1][j]+b[i+1][j]+
b[i][j-1]+b[i][j+1])+
w2*(b[i-1][j-1]+b[i-1][j+1]+
b[i+1][j-1]+b[i+1][j+1]);
} }
#pragma acc parallel loop
for( i = 1; i < n-1; ++i )
for( j = 1; j < m-1; ++j )
b[i][j] = a[i][j];
}
}
...
OpenACC for GPUs in a Nutshell
9
Multicore CPU
OpenACC is for Multicore, Manycore & GPUs
% pgfortran -ta=multicore -fast -Minfo=acc -c update_tile_halo_kernel.f90
. . .
100, Loop is parallelizable
Generating Multicore code
100, !$acc loop gang
102, Loop is parallelizable
Tesla GPU
% pgfortran -ta=tesla -fast -Minfo=acc -c update_tile_halo_kernel.f90
. . .
100, Loop is parallelizable
102, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
100, !$acc loop gang, vector(4) ! blockidx%y threadidx%y
102, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
98 !$acc parallel
99 !$acc loop independent
100 do k=y_min-depth,y_max+depth
101 !$acc loop independent
102 do j=1,depth
103 density0(x_min-j,k)=left_density0(left_xmax+1-j,k)
104 enddo
105 enddo
106 !$acc end parallel
Performance measured February, 2018. Skylake: Two 20 core Intel Xeon Gold 6148 CPUs @ 2.4GHz w/ 376GB memory, hyperthreading enabled. EPYC: Two 24 core AMD EPYC 7451 CPUs
@ 2.3GHz w/ 256GB memory. Broadwell: Two 20 core Intel Xeon E5-2698 v4 CPUs @ 3.6GHz w/ 256GB memory, hyperthreading enabled. Volta: NVIDIA DGX1 system with two 20 core
Intel Xeon E5-2698 v4 CPUs @ 2.20GHz, 256GB memory, one NVIDIA Tesla V100-SXM2-16GB GPU @ 1.53GHz. SPEC® is a registered trademark of the Standard Performance Evaluation
Corporation (www.spec.org).
SPEC ACCEL 1.2 BENCHMARKS
[Chart: OpenMP 4.5 geometric-mean runtimes in seconds, Intel 2018 vs. PGI 18.1, on 2-socket Skylake (40 cores / 80 threads), 2-socket EPYC (48 cores / 48 threads) and 2-socket Broadwell (40 cores / 80 threads)]
[Chart: OpenACC geometric-mean runtimes in seconds, PGI 18.1, on a 2-socket Broadwell vs. 1x Volta V100: 4.4x speed-up]
11
OPENACC APPLICATIONS
12
GAUSSIAN 16
Using OpenACC allowed us to continue
development of our fundamental
algorithms and software capabilities
simultaneously with the GPU-related
work. In the end, we could use the
same code base for SMP, cluster/
network and GPU parallelism. PGI's
compilers were essential to the success
of our efforts.
Mike Frisch, Ph.D.
President and CEO
Gaussian, Inc.
Gaussian, Inc.
340 Quinnipiac St., Bldg. 40
Wallingford, CT 06492 USA
custserv@gaussian.com
Gaussian is a registered trademark of Gaussian, Inc. All other trademarks are the properties of their respective holders. Specifications subject to change without notice.
Copyright © 2017, Gaussian, Inc. All rights reserved.
Project Contributors: Roberto Gomperts (NVIDIA), Michael Frisch (Gaussian), Brent Leback (NVIDIA/PGI), Gio
%GPUCPU=0-7=0-7  Use GPUs 0-7 with CPUs 0-7 as their controllers.
Detailed information is available on our website.
13
ANSYS FLUENT
We’ve effectively used
OpenACC for heterogeneous
computing in ANSYS Fluent
with impressive performance.
We’re now applying this work
to more of our models and
new platforms.
Sunil Sathe
Lead Software Developer
ANSYS Fluent
Image courtesy: ANSYS
14
VASP
For VASP, OpenACC is the way
forward for GPU acceleration.
Performance is similar and in some
cases better than CUDA C, and
OpenACC dramatically decreases
GPU development and maintenance
efforts. We’re excited to collaborate
with NVIDIA and PGI as an early
adopter of CUDA Unified Memory.
Prof. Georg Kresse
Computational Materials Physics
University of Vienna
15
David Gutzwiller
Lead Software Developer
NUMECA
NUMECA FINE/Open
Porting our unstructured C++ CFD
solver FINE/Open to GPUs using
OpenACC would have been
impossible two or three years ago,
but OpenACC has developed
enough that we’re now getting
some really good results.
16
MPAS-A
Our team has been evaluating
OpenACC as a pathway to
performance portability for the Model
for Prediction Across Scales (MPAS) atmospheric
model. Using this approach on the
MPAS dynamical core, we have
achieved performance on a single
P100 GPU equivalent to 2.7 dual
socketed Intel Xeon nodes on our new
Cheyenne supercomputer.
Richard Loft
Director, Technology Development
NCAR
Image courtesy: NCAR
17
OpenACC made it practical to
develop for GPU-based hardware
while retaining a single source for
almost all the COSMO physics
code.
Dr. Oliver Fuhrer
Senior Scientist
Meteoswiss
COSMO
18
GAMERA FOR GPU
With OpenACC and a compute
node based on NVIDIA's Tesla
P100 GPU, we achieved more
than a 14X speed up over a K
Computer node running our
earthquake disaster simulation
code.
Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo
Hori, Lalith Wijerathne
The University of Tokyo
Map courtesy University of Tokyo
19
QUANTUM ESPRESSO
CUDA Fortran gives us the full
performance potential of the
CUDA programming model and
NVIDIA GPUs. !$CUF KERNELS
directives give us productivity and
source code maintainability. It’s
the best of both worlds.
Filippo Spiga
Head of Research Software Engineering
University of Cambridge
20
OPENACC AND CUDA UNIFIED MEMORY
21
Programming GPU-Accelerated Systems
CUDA Unified Memory for Dynamically Allocated Data
GPU Developer View With
CUDA Unified Memory
Unified Memory
GPU Developer View
System
Memory
GPU Memory
PCIe
22
How CUDA Unified Memory Works on TESLA GPUs
Servicing CPU and GPU Page Faults for Allocatable Data
[Diagram: a managed array is mapped in both CPU and GPU memory; a page fault on either side migrates the page over PCIe or NVLink]
__global__
void setValue(char *ptr, int index, char val)
{
ptr[index] = val;
}
cudaMallocManaged(&array, size);
memset(array, 0, size);
setValue<<<...>>>(array, size/2, 5);
...
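A CUDA Fortran counterpart for reference; a minimal sketch assuming the PGI managed variable attribute (program and variable names are illustrative):

    program managed_demo
      use cudafor
      implicit none
      integer, parameter :: n = 1024
      real, allocatable, managed :: a(:)   ! placed in CUDA Unified Memory
      integer :: i, istat
      allocate(a(n))
      a = 0.0                              ! touched on the CPU first
      !$cuf kernel do(1) <<<*,*>>>
      do i = 1, n
        a(i) = a(i) + 1.0                  ! GPU access triggers page migration
      end do
      istat = cudaDeviceSynchronize()
      print *, a(n)                        ! CPU access migrates pages back
      deallocate(a)
    end program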
23
#pragma acc data copyin(a,b) copyout(c)
{
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
}
PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option
C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
24
PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option
C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
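As a hedged sketch of the pattern described above (routine and variable names are illustrative), allocatable data lands in CUDA Unified Memory, so the parallel loop needs no data clauses when the code is compiled with -ta=tesla:managed:

    ! pgfortran -fast -ta=tesla:managed -Minfo=acc saxpy.f90
    subroutine saxpy(n, a, result)
      integer, intent(in)  :: n
      real,    intent(in)  :: a
      real,    intent(out) :: result
      real, allocatable :: x(:), y(:)
      integer :: i
      allocate(x(n), y(n))       ! allocate is mapped to CUDA Unified Memory
      x = 1.0
      y = 2.0                    ! initialized on the host
      !$acc parallel loop        ! no copy/copyin/copyout clauses required
      do i = 1, n
        y(i) = y(i) + a * x(i)
      end do
      result = y(n)              ! results visible on the host without explicit copies
      deallocate(x, y)
    end subroutine saxpy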
25
Center for Accelerated Application
Readiness (CAAR)
Oak Ridge Leadership Computing Facility
IBM POWER9 CPUs
NVIDIA Volta V100 GPUs
26
GTC: An OpenACC Production Application
The gyrokinetic toroidal
code (GTC) is a massively
parallel, particle-in-cell
production code for
turbulence simulation in
support of the burning
plasma experiment ITER,
the crucial next step in the
quest for fusion energy.
Being ported for runs on the ORNL Summit supercomputer
http://phoenix.ps.uci.edu/gtc_group
27
GTC Performance using OpenACC
P8 : IBM POWER8NVL, 2 sockets, 20 cores, NVLINK
UM : No Data Directives in sources, compiled with -ta=tesla:managed
[Chart: GTC speed-up vs. a 20-core P8 node (OpenPOWER | NVLink | Unified Memory | P100 | V100):
P8+2xP100 UM: 6.1X;  P8+2xP100 with data directives: 5.9X;
P8+4xP100 UM: 12.1X;  P8+4xP100 with data directives: 12X;
x64+4xV100 with data directives: 16.5X]
28
DEEP COPY
29
Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP
Derived Type 1
Members:
3 dynamic
1 derived type 2
Derived Type 2
Members:
21 dynamic
1 derived type 3
1 derived type 4
Derived Type 3
Members:
only static
Derived Type 4
Members:
8 dynamic
4 derived type 5
2 derived type 6
Derived Type 5
Members:
3 dynamic
Derived Type 6
Members:
8 dynamic
• Real-world applications often have complex,
aggregate data structures
• CUDA Unified Memory can automatically
manage Deep Copy, but …
30
Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP
Derived Type 1
Members:
3 dynamic
1 derived type 2
Derived Type 2
Members:
21 dynamic
1 derived type 3
1 derived type 4
Derived Type 3
Members:
only static
Derived Type 4
Members:
8 dynamic
4 derived type 5
2 derived type 6
Derived Type 5
Members:
3 dynamic
Derived Type 6
Members:
8 dynamic
• Real-world applications often have complex,
aggregate data structures
• CUDA Unified Memory can automatically
manage Deep Copy, but …
• CUDA Unified Memory is only for allocatable
data today
31
FORTRAN AUTOMATIC FULL DEEP COPY
Fortran Derived Types
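The slide text above carries no code; as a hedged illustration (type and names are hypothetical), this is the shape of the problem: a derived type whose allocatable members should travel with it. With PGI's Fortran full deep copy support (the deepcopy sub-option of -ta=tesla, as we understand it), a single copy clause on the variable is intended to traverse the allocatable members automatically:

    type grid_t
      integer           :: n
      real, allocatable :: x(:), y(:), z(:)   ! dynamic members
    end type grid_t

    type(grid_t) :: g
    integer      :: i
    allocate(g%x(1000), g%y(1000), g%z(1000))
    g%n = 1000
    !$acc data copy(g)        ! with full deep copy, members move with the parent
    !$acc parallel loop
    do i = 1, g%n
      g%x(i) = g%x(i) + g%y(i)
    end do
    !$acc end data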
32
OPENACC 2.6 MANUAL DEEP COPY
typedef struct points {
  float* x; float* y; float* z;
  int n;
  float coef, direction;
} points;

void sub ( int n, float* y ) {
  points p;
  #pragma acc data create(p)
  {
    p.n = n;
    p.x = (float*) malloc( sizeof(float)*n );
    p.y = (float*) malloc( sizeof(float)*n );
    p.z = (float*) malloc( sizeof(float)*n );
    #pragma acc update device(p.n)
    #pragma acc data copyin(p.x[0:n], p.y[0:n])
    {
      #pragma acc parallel loop
      for ( int i = 0; i < p.n; ++i ) p.x[i] += p.y[i];
      . . .
Supported Today in PGI Compilers
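For Fortran codes like VASP, the same OpenACC 2.6 manual deep copy pattern looks like this; a sketch with hypothetical names, relying on the rule that copying a member while its parent is present attaches the device pointer:

    type points_t
      real, allocatable :: x(:), y(:), z(:)
      integer :: n
    end type points_t

    type(points_t) :: p
    integer        :: i
    allocate(p%x(n), p%y(n), p%z(n))
    p%n = n
    !$acc enter data copyin(p)                     ! shallow copy of the derived type
    !$acc enter data copyin(p%x(1:n), p%y(1:n))    ! members copied in and attached
    !$acc parallel loop present(p)
    do i = 1, p%n
      p%x(i) = p%x(i) + p%y(i)
    end do
    !$acc exit data copyout(p%x(1:n)) delete(p%y)
    !$acc exit data delete(p)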
33
DRAFT OPENACC 3.0 TRUE DEEP COPY
typedef struct points {
  float* x; float* y; float* z;
  int n;
  float coef, direction;
  #pragma acc policy inout(x[0:n],y[0:n])
} points;

void sub ( int n, float* y ) {
  points p;
  p.n = n;
  p.x = (float*) malloc( sizeof(float)*n );
  p.y = (float*) malloc( sizeof(float)*n );
  p.z = (float*) malloc( sizeof(float)*n );
  #pragma acc data copy(p)
  {
    #pragma acc parallel loop
    for ( int i = 0; i < p.n; ++i ) p.x[i] += p.y[i];
    . . .
Still in definition by the OpenACC Committee
34
WHITHER OPENACC?
35
CLOVERLEAF
[Chart: speed-up vs. a single Haswell core, PGI 18.1 OpenACC vs. Intel 2018 OpenMP,
on multicore Haswell, Broadwell and Skylake CPUs and on Kepler, Pascal and Volta V100 GPUs;
multicore speed-ups range from 7.6x to 15x, GPU speed-ups reach 40x, 67x, 109x and 142x]
Systems: Haswell: 2x16 core Haswell server, four K80s, CentOS 7.2 (perf-hsw10), Broadwell: 2x20 core Broadwell server, eight P100s (dgx1-prd-01), Broadwell server, eight V100s (dgx07), Skylake 2x20 core Xeon Gold server (sky-4).
Compilers: Intel 2018.0.128, PGI 18.1
Benchmark: CloverLeaf v1.3 downloaded from http://uk-mac.github.io/CloverLeaf the week of November 7 2016; CloverLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC)
Data compiled by PGI February 2018.
AWE Hydrodynamics mini-App, bm32 data set
http://uk-mac.github.io/CloverLeaf
36
OPENACC DIRECTIVES FOR GPUS
75 !$ACC KERNELS
76 !$ACC LOOP INDEPENDENT
77 DO k=y_min,y_max
78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux,
min_cell_volume,energy_change,recip_volume)
79 DO j=x_min,x_max
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105 ENDDO
106 ENDDO
107 !$ACC END KERNELS
% pgfortran -fast -ta=tesla -Minfo -c PdV_kernel.f90
pdv_kernel:
...
77, Loop is parallelizable
79, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
77, !$acc loop gang, vector(4) ! blockidx%y
! threadidx%y
79, !$acc loop gang, vector(32)! blockidx%x
! threadidx%x
...
http://uk-mac.github.io/CloverLeaf
37
OPENACC DIRECTIVES FOR MULTICORE CPUS
75 !$ACC KERNELS
76 !$ACC LOOP INDEPENDENT
77 DO k=y_min,y_max
78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux,
min_cell_volume,energy_change,recip_volume)
79 DO j=x_min,x_max
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105 ENDDO
106 ENDDO
107 !$ACC END KERNELS
% pgfortran -fast -ta=multicore ... PdV_kernel.f90
pdv_kernel:
...
77, Loop is parallelizable
Generating Multicore code
77, !$acc loop gang
79, Loop is parallelizable
3 loop-carried redundant expressions removed
with 9 operations and 9 arrays
Innermost loop distributed: 2 new loops
Generated vector SIMD code for the loop
Generated 2 prefetch instructions for the loop
Generated 12 prefetch instructions for the loop
...
http://uk-mac.github.io/CloverLeaf
38
FORTRAN 2018 DO CONCURRENT
75
76
77 DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) &
78 LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux, &
min_cell_volume,energy_change,recip_volume)
79
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105
106 ENDDO
107
Fortran 2018 DO CONCURRENT
+ True Parallel Loops
+ Loop-scope shared/private data
− No support for reductions
− No support for atomics
− No support for data management
39
OPENACC FOR EVERYONE
The PGI Community Edition, pgicompilers.com/community
PROGRAMMING MODELS: OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran Compilers and Tools
PLATFORMS: X86, OpenPOWER, NVIDIA GPU
UPDATES: 1-2 times a year | 6-9 times a year | 6-9 times a year
SUPPORT: User Forums | PGI Support | PGI Premier Services
LICENSE: Annual | Perpetual | Volume/Site
FREE (Community Edition)