Doug Miles, PGI Compilers & Tools, NVIDIA
High Performance Computing Advisory Council
February 21, 2018
ACCELERATING HPC APPLICATIONS
ON NVIDIA GPUS WITH OPENACC
2
PGI — THE NVIDIA HPC SDK
Fortran, C & C++ Compilers
Optimizing, SIMD Vectorizing, OpenMP
Accelerated Computing Features
CUDA Fortran, OpenACC Directives
Multi-Platform Solution
X86-64 and OpenPOWER Multicore CPUs
NVIDIA Tesla GPUs
Supported on Linux, macOS, Windows
MPI/OpenMP/OpenACC Tools
Debugger
Performance Profiler
Interoperable with DDT, TotalView
3
Programming GPU-Accelerated Systems
Separate CPU System and GPU Memories
[Diagram: GPU developer view with System Memory and GPU Memory connected by PCIe]
4
Programming GPU-Accelerated Systems
Separate CPU System and GPU Memories
[Diagram: GPU developer view with System Memory and GPU Memory connected by NVLink]
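With physically separate memories, the developer allocates a device copy and moves data across the interconnect explicitly. A minimal CUDA Fortran sketch of that pattern (array names are illustrative; the next slide shows a complete kernel built the same way):

    real, allocatable         :: a(:)      ! host array in System Memory
    real, device, allocatable :: a_dev(:)  ! device array in GPU Memory
    integer :: n
    n = 1000000
    allocate(a(n), a_dev(n))
    a = 1.0
    a_dev = a        ! host-to-device transfer over PCIe or NVLink
    ! ... launch kernels that read and write a_dev ...
    a = a_dev        ! device-to-host transfer of the results
    deallocate(a_dev, a)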
5
CUDA FORTRAN

Tesla Code
attributes(global) subroutine mm_kernel( A, B, C, N, M, L )
  real :: A(N,M), B(M,L), C(N,L), Cij
  integer, value :: N, M, L
  integer :: i, j, kb, k, tx, ty
  real, shared :: Asub(16,16), Bsub(16,16)
  tx = threadidx%x
  ty = threadidx%y
  i = (blockidx%x-1) * 16 + tx
  j = (blockidx%y-1) * 16 + ty
  Cij = 0.0
  do kb = 1, M, 16
    Asub(tx,ty) = A(i,kb+ty-1)
    Bsub(tx,ty) = B(kb+tx-1,j)
    call syncthreads()
    do k = 1, 16
      Cij = Cij + Asub(tx,k) * Bsub(k,ty)
    enddo
    call syncthreads()
  enddo
  C(i,j) = Cij
end subroutine mm_kernel

CPU Code
real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
. . .
allocate (Adev(N,M), Bdev(M,L), Cdev(N,L))
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
call mm_kernel <<<dim3(N/16,L/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L )
C(1:N,1:L) = Cdev
deallocate ( Adev, Bdev, Cdev )
. . .
6
CUDA FORTRAN
module madd_device_module
  use cudafor
contains
  subroutine madd_dev(a,b,c,sum,n1,n2)
    real, dimension(:,:), device :: a,b,c
    real :: sum
    integer :: n1,n2
    integer :: i,j
    type(dim3) :: grid, block
    !$cuf kernel do (2) <<<(*,*),(32,4)>>>
    do j = 1,n2
      do i = 1,n1
        a(i,j) = b(i,j) + c(i,j)
        sum = sum + a(i,j)
      enddo
    enddo
  end subroutine
end module
Equivalent
hand-written
CUDA kernels
module madd_device_module
use cudafor
implicit none
contains
attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2)
real, dimension(:,:) :: a,b,c
real, dimension(:) :: blocksum
integer, value :: n1,n2
integer :: i,j,tindex,tneighbor,bindex
real :: mysum
real, shared :: bsum(256)
! Do this thread's work
mysum = 0.0
do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y
do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x
a(i,j) = b(i,j) + c(i,j)
mysum = mysum + a(i,j) ! accumulates partial sum per thread
enddo
enddo
! Now add up all partial sums for the whole thread block
! Compute this thread's linear index in the thread block
! We assume 256 threads in the thread block
tindex = threadidx%x + (threadidx%y-1)*blockdim%x
! Store this thread's partial sum in the shared memory block
bsum(tindex) = mysum
call syncthreads()
! Accumulate all the partial sums for this thread block to a single value
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
! Store the partial sum for the thread block
bindex = blockidx%x + (blockidx%y-1)*griddim%x
if( tindex == 1 ) blocksum(bindex) = bsum(1)
end subroutine
! Add up partial sums for all thread blocks to a single cumulative sum
attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb)
real, dimension(:) :: blocksum
real :: dsum
integer, value :: nb
real, shared :: bsum(256)
integer :: tindex,tneighbor,i
! Again, we assume 256 threads in the thread block
! accumulate a partial sum for each thread
tindex = threadidx%x
bsum(tindex) = 0.0
do i = tindex, nb, blockdim%x
bsum(tindex) = bsum(tindex) + blocksum(i)
enddo
call syncthreads()
! This code is copied from the previous kernel
! Accumulate all the partial sums for this thread block to a single value
! Since there is only one thread block, this single value is the final result
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
if( tindex == 1 ) dsum = bsum(1)
end subroutine
subroutine madd_dev(a,b,c,dsum,n1,n2)
real, dimension(:,:), device :: a,b,c
real, device :: dsum
real, dimension(:), allocatable, device :: blocksum
integer :: n1,n2,nb
type(dim3) :: grid, block
integer :: r
! Compute grid/block size; block size must be 256 threads
grid = dim3((n1+31)/32, (n2+7)/8, 1)
block = dim3(32,8,1)
nb = grid%x * grid%y
allocate(blocksum(1:nb))
call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2)
call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb)
r = cudaThreadSynchronize() ! don't deallocate too early
deallocate(blocksum)
end subroutine
!$CUF KERNEL Directives
7
OpenACC Directives
Manage
Data
Movement
Initiate
Parallel
Execution
Optimize
Loop
Mappings
#pragma acc data copyin(a,b) copyout(c)
{
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
}
CPU, GPU, Manycore
Performance portable
Interoperable
Single source
Incremental
8
[Diagram: System Memory and GPU Memory; the device copies of a and b stay resident in GPU Memory for the duration of the data region]
...
#pragma acc data copy(b[0:n][0:m]) create(a[0:n][0:m])
{
for (iter = 1; iter <= p; ++iter){
#pragma acc parallel loop
for (i = 1; i < n-1; ++i){
for (j = 1; j < m-1; ++j){
a[i][j]=w0*b[i][j]+
w1*(b[i-1][j]+b[i+1][j]+
b[i][j-1]+b[i][j+1])+
w2*(b[i-1][j-1]+b[i-1][j+1]+
b[i+1][j-1]+b[i+1][j+1]);
} }
#pragma acc parallel loop
for( i = 1; i < n-1; ++i )
for( j = 1; j < m-1; ++j )
b[i][j] = a[i][j];
}
}
...
OpenACC for GPUs in a Nutshell
9
Multicore CPU
OpenACC is for Multicore, Manycore & GPUs
% pgfortran -ta=multicore -fast -Minfo=acc -c update_tile_halo_kernel.f90
. . .
100, Loop is parallelizable
Generating Multicore code
100, !$acc loop gang
102, Loop is parallelizable
Tesla GPU
% pgfortran -ta=tesla -fast -Minfo=acc -c update_tile_halo_kernel.f90
. . .
100, Loop is parallelizable
102, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
100, !$acc loop gang, vector(4) ! blockidx%y threadidx%y
102, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
98 !$acc parallel
99 !$acc loop independent
100 do k=y_min-depth,y_max+depth
101 !$acc loop independent
102 do j=1,depth
103 density0(x_min-j,k)=left_density0(left_xmax+1-j,k)
104 enddo
105 enddo
106 !$acc end parallel
Performance measured February, 2018. Skylake: Two 20 core Intel Xeon Gold 6148 CPUs @ 2.4GHz w/ 376GB memory, hyperthreading enabled. EPYC: Two 24 core AMD EPYC 7451 CPUs
@ 2.3GHz w/ 256GB memory. Broadwell: Two 20 core Intel Xeon E5-2698 v4 CPUs @ 3.6GHz w/ 256GB memory, hyperthreading enabled. Volta: NVIDIA DGX1 system with two 20 core
Intel Xeon E5-2698 v4 CPUs @ 2.20GHz, 256GB memory, one NVIDIA Tesla V100-SXM2-16GB GPU @ 1.53GHz. SPEC® is a registered trademark of the Standard Performance Evaluation
Corporation (www.spec.org).
SPEC ACCEL 1.2 BENCHMARKS
[Chart: OpenMP 4.5 geometric-mean runtimes in seconds, Intel 2018 vs. PGI 18.1, on 2-socket Skylake (40 cores / 80 threads), 2-socket EPYC (48 cores / 48 threads) and 2-socket Broadwell (40 cores / 80 threads)]
[Chart: OpenACC geometric-mean runtimes in seconds, PGI 18.1, on a 2-socket Broadwell vs. 1x Volta V100: 4.4x speed-up]
11
OPENACC APPLICATIONS
12
GAUSSIAN 16
Using OpenACC allowed us to continue
development of our fundamental
algorithms and software capabilities
simultaneously with the GPU-related
work. In the end, we could use the
same code base for SMP, cluster/
network and GPU parallelism. PGI's
compilers were essential to the success
of our efforts.
Mike Frisch, Ph.D.
President and CEO
Gaussian, Inc.
Gaussian, Inc.
340 Quinnipiac St., Bldg. 40
Wallingford, CT 06492 USA
custserv@gaussian.com
Gaussian is a registered trademark of Gaussian, Inc. All other trademarks are the properties of their respective holders. Specifications subject to change without notice.
Copyright © 2017, Gaussian, Inc. All rights reserved.
Project Contributors: Roberto Gomperts (NVIDIA), Michael Frisch (Gaussian), Brent Leback (NVIDIA/PGI), Gio
%GPUCPU=0-7=0-7  Use GPUs 0-7 with CPUs 0-7 as their controllers.
Detailed information is available on our website.
13
ANSYS FLUENT
We’ve effectively used
OpenACC for heterogeneous
computing in ANSYS Fluent
with impressive performance.
We’re now applying this work
to more of our models and
new platforms.
Sunil Sathe
Lead Software Developer
ANSYS Fluent
Image courtesy: ANSYS
14
VASP
For VASP, OpenACC is the way
forward for GPU acceleration.
Performance is similar and in some
cases better than CUDA C, and
OpenACC dramatically decreases
GPU development and maintenance
efforts. We’re excited to collaborate
with NVIDIA and PGI as an early
adopter of CUDA Unified Memory.
Prof. Georg Kresse
Computational Materials Physics
University of Vienna
15
David Gutzwiller
Lead Software Developer
NUMECA
NUMECA FINE/Open
Porting our unstructured C++ CFD
solver FINE/Open to GPUs using
OpenACC would have been
impossible two or three years ago,
but OpenACC has developed
enough that we’re now getting
some really good results.
16
MPAS-A
Our team has been evaluating
OpenACC as a pathway to
performance portability for the Model
for Prediction Across Scales (MPAS) atmospheric
model. Using this approach on the
MPAS dynamical core, we have
achieved performance on a single
P100 GPU equivalent to 2.7 dual
socketed Intel Xeon nodes on our new
Cheyenne supercomputer.
Richard Loft
Director, Technology Development
NCAR
Image courtesy: NCAR
17
OpenACC made it practical to
develop for GPU-based hardware
while retaining a single source for
almost all the COSMO physics
code.
Dr. Oliver Fuhrer
Senior Scientist
Meteoswiss
COSMO
18
GAMERA FOR GPU
With OpenACC and a compute
node based on NVIDIA's Tesla
P100 GPU, we achieved more
than a 14X speed up over a K
Computer node running our
earthquake disaster simulation
code.
Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo
Hori, Lalith Wijerathne
The University of Tokyo
Map courtesy University of Tokyo
19
QUANTUM ESPRESSO
CUDA Fortran gives us the full
performance potential of the
CUDA programming model and
NVIDIA GPUs. !$CUF KERNELS
directives give us productivity and
source code maintainability. It’s
the best of both worlds.
Filippo Spiga
Head of Research Software Engineering
University of Cambridge
20
OPENACC AND CUDA UNIFIED MEMORY
21
Programming GPU-Accelerated Systems
CUDA Unified Memory for Dynamically Allocated Data
GPU Developer View With
CUDA Unified Memory
Unified Memory
GPU Developer View
System
Memory
GPU Memory
PCIe
22
How CUDA Unified Memory Works on TESLA GPUs
Servicing CPU and GPU Page Faults for Allocatable Data
[Diagram: a managed array is mapped in both CPU and GPU memory; a page fault on either side migrates the page over PCIe or NVLink]
__global__
void setValue(char *ptr, int index, char val)
{
ptr[index] = val;
}
cudaMallocManaged(&array, size);
memset(array, 0, size);
setValue<<<...>>>(array, size/2, 5);
...
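A CUDA Fortran counterpart for reference; a minimal sketch assuming the PGI managed variable attribute (program and variable names are illustrative):

    program managed_demo
      use cudafor
      implicit none
      integer, parameter :: n = 1024
      real, allocatable, managed :: a(:)   ! placed in CUDA Unified Memory
      integer :: i, istat
      allocate(a(n))
      a = 0.0                              ! touched on the CPU first
      !$cuf kernel do(1) <<<*,*>>>
      do i = 1, n
        a(i) = a(i) + 1.0                  ! GPU access triggers page migration
      end do
      istat = cudaDeviceSynchronize()
      print *, a(n)                        ! CPU access migrates pages back
      deallocate(a)
    end program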
23
#pragma acc data copyin(a,b) copyout(c)
{
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
}
PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option
C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
24
PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option
C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
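As a hedged sketch of the pattern described above (routine and variable names are illustrative), allocatable data lands in CUDA Unified Memory, so the parallel loop needs no data clauses when the code is compiled with -ta=tesla:managed:

    ! pgfortran -fast -ta=tesla:managed -Minfo=acc saxpy.f90
    subroutine saxpy(n, a, result)
      integer, intent(in)  :: n
      real,    intent(in)  :: a
      real,    intent(out) :: result
      real, allocatable :: x(:), y(:)
      integer :: i
      allocate(x(n), y(n))       ! allocate is mapped to CUDA Unified Memory
      x = 1.0
      y = 2.0                    ! initialized on the host
      !$acc parallel loop        ! no copy/copyin/copyout clauses required
      do i = 1, n
        y(i) = y(i) + a * x(i)
      end do
      result = y(n)              ! results visible on the host without explicit copies
      deallocate(x, y)
    end subroutine saxpy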
25
Center for Accelerated Application
Readiness (CAAR)
Oak Ridge Leadership Computing Facility
IBM POWER9 CPUs
NVIDIA Volta V100 GPUs
26
GTC: An OpenACC Production Application
The gyrokinetic toroidal
code (GTC) is a massively
parallel, particle-in-cell
production code for
turbulence simulation in
support of the burning
plasma experiment ITER,
the crucial next step in the
quest for fusion energy.
Being ported for runs on the ORNL Summit supercomputer
http://phoenix.ps.uci.edu/gtc_group
27
GTC Performance using OpenACC
P8 : IBM POWER8NVL, 2 sockets, 20 cores, NVLINK
UM : No Data Directives in sources, compiled with -ta=tesla:managed
[Chart: GTC speed-up vs. a 20-core P8 node (OpenPOWER | NVLink | Unified Memory | P100 | V100):
P8+2xP100 UM: 6.1X;  P8+2xP100 with data directives: 5.9X;
P8+4xP100 UM: 12.1X;  P8+4xP100 with data directives: 12X;
x64+4xV100 with data directives: 16.5X]
28
DEEP COPY
29
Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP
Derived Type 1
Members:
3 dynamic
1 derived type 2
Derived Type 2
Members:
21 dynamic
1 derived type 3
1 derived type 4
Derived Type 3
Members:
only static
Derived Type 4
Members:
8 dynamic
4 derived type 5
2 derived type 6
Derived Type 5
Members:
3 dynamic
Derived Type 6
Members:
8 dynamic
• Real-world applications often have complex,
aggregate data structures
• CUDA Unified Memory can automatically
manage Deep Copy, but …
30
Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP
Derived Type 1
Members:
3 dynamic
1 derived type 2
Derived Type 2
Members:
21 dynamic
1 derived type 3
1 derived type 4
Derived Type 3
Members:
only static
Derived Type 4
Members:
8 dynamic
4 derived type 5
2 derived type 6
Derived Type 5
Members:
3 dynamic
Derived Type 6
Members:
8 dynamic
• Real-world applications often have complex,
aggregate data structures
• CUDA Unified Memory can automatically
manage Deep Copy, but …
• CUDA Unified Memory is only for allocatable
data today
31
FORTRAN AUTOMATIC FULL DEEP COPY
Fortran Derived Types
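The slide text above carries no code; as a hedged illustration (type and names are hypothetical), this is the shape of the problem: a derived type whose allocatable members should travel with it. With PGI's Fortran full deep copy support (the deepcopy sub-option of -ta=tesla, as we understand it), a single copy clause on the variable is intended to traverse the allocatable members automatically:

    type grid_t
      integer           :: n
      real, allocatable :: x(:), y(:), z(:)   ! dynamic members
    end type grid_t

    type(grid_t) :: g
    integer      :: i
    allocate(g%x(1000), g%y(1000), g%z(1000))
    g%n = 1000
    !$acc data copy(g)        ! with full deep copy, members move with the parent
    !$acc parallel loop
    do i = 1, g%n
      g%x(i) = g%x(i) + g%y(i)
    end do
    !$acc end data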
32
OPENACC 2.6 MANUAL DEEP COPY
typedef struct points {
  float* x; float* y; float* z;
  int n;
  float coef, direction;
} points;

void sub ( int n, float* y ) {
  points p;
  #pragma acc data create(p)
  {
    p.n = n;
    p.x = (float*) malloc( sizeof(float)*n );
    p.y = (float*) malloc( sizeof(float)*n );
    p.z = (float*) malloc( sizeof(float)*n );
    #pragma acc update device(p.n)
    #pragma acc data copyin(p.x[0:n], p.y[0:n])
    {
      #pragma acc parallel loop
      for ( int i = 0; i < p.n; ++i ) p.x[i] += p.y[i];
      . . .
Supported Today in PGI Compilers
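For Fortran codes like VASP, the same OpenACC 2.6 manual deep copy pattern looks like this; a sketch with hypothetical names, relying on the rule that copying a member while its parent is present attaches the device pointer:

    type points_t
      real, allocatable :: x(:), y(:), z(:)
      integer :: n
    end type points_t

    type(points_t) :: p
    integer        :: i
    allocate(p%x(n), p%y(n), p%z(n))
    p%n = n
    !$acc enter data copyin(p)                     ! shallow copy of the derived type
    !$acc enter data copyin(p%x(1:n), p%y(1:n))    ! members copied in and attached
    !$acc parallel loop present(p)
    do i = 1, p%n
      p%x(i) = p%x(i) + p%y(i)
    end do
    !$acc exit data copyout(p%x(1:n)) delete(p%y)
    !$acc exit data delete(p)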
33
DRAFT OPENACC 3.0 TRUE DEEP COPY
typedef struct points {
  float* x; float* y; float* z;
  int n;
  float coef, direction;
  #pragma acc policy inout(x[0:n],y[0:n])
} points;

void sub ( int n, float* y ) {
  points p;
  p.n = n;
  p.x = (float*) malloc( sizeof(float)*n );
  p.y = (float*) malloc( sizeof(float)*n );
  p.z = (float*) malloc( sizeof(float)*n );
  #pragma acc data copy(p)
  {
    #pragma acc parallel loop
    for ( int i = 0; i < p.n; ++i ) p.x[i] += p.y[i];
    . . .
Still in definition by the OpenACC Committee
34
WHITHER OPENACC?
35
CLOVERLEAF
[Chart: speed-up vs. a single Haswell core, PGI 18.1 OpenACC vs. Intel 2018 OpenMP,
on multicore Haswell, Broadwell and Skylake CPUs and on Kepler, Pascal and Volta V100 GPUs;
multicore speed-ups range from 7.6x to 15x, GPU speed-ups reach 40x, 67x, 109x and 142x]
Systems: Haswell: 2x16 core Haswell server, four K80s, CentOS 7.2 (perf-hsw10), Broadwell: 2x20 core Broadwell server, eight P100s (dgx1-prd-01), Broadwell server, eight V100s (dgx07), Skylake 2x20 core Xeon Gold server (sky-4).
Compilers: Intel 2018.0.128, PGI 18.1
Benchmark: CloverLeaf v1.3 downloaded from http://uk-mac.github.io/CloverLeaf the week of November 7 2016; CloverLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC)
Data compiled by PGI February 2018.
AWE Hydrodynamics mini-App, bm32 data set
http://uk-mac.github.io/CloverLeaf
36
OPENACC DIRECTIVES FOR GPUS
75 !$ACC KERNELS
76 !$ACC LOOP INDEPENDENT
77 DO k=y_min,y_max
78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux,
min_cell_volume,energy_change,recip_volume)
79 DO j=x_min,x_max
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105 ENDDO
106 ENDDO
107 !$ACC END KERNELS
% pgfortran -fast -ta=tesla -Minfo -c PdV_kernel.f90
pdv_kernel:
...
77, Loop is parallelizable
79, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
77, !$acc loop gang, vector(4) ! blockidx%y
! threadidx%y
79, !$acc loop gang, vector(32)! blockidx%x
! threadidx%x
...
http://uk-mac.github.io/CloverLeaf
37
OPENACC DIRECTIVES FOR MULTICORE CPUS
75 !$ACC KERNELS
76 !$ACC LOOP INDEPENDENT
77 DO k=y_min,y_max
78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux,
min_cell_volume,energy_change,recip_volume)
79 DO j=x_min,x_max
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105 ENDDO
106 ENDDO
107 !$ACC END KERNELS
% pgfortran -fast -ta=multicore ... PdV_kernel.f90
pdv_kernel:
...
77, Loop is parallelizable
Generating Multicore code
77, !$acc loop gang
79, Loop is parallelizable
3 loop-carried redundant expressions removed
with 9 operations and 9 arrays
Innermost loop distributed: 2 new loops
Generated vector SIMD code for the loop
Generated 2 prefetch instructions for the loop
Generated 12 prefetch instructions for the loop
...
http://uk-mac.github.io/CloverLeaf
38
FORTRAN 2018 DO CONCURRENT
75
76
77 DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) &
78 LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux, &
min_cell_volume,energy_change,recip_volume)
79
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105
106 ENDDO
107
Fortran 2018 DO CONCURRENT
+ True Parallel Loops
+ Loop-scope shared/private data
− No support for reductions
− No support for atomics
− No support for data management
39
OPENACC FOR EVERYONE
The PGI Community Edition, pgicompilers.com/community
PROGRAMMING MODELS: OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran Compilers and Tools
PLATFORMS: X86, OpenPOWER, NVIDIA GPU
UPDATES: 1-2 times a year | 6-9 times a year | 6-9 times a year
SUPPORT: User Forums | PGI Support | PGI Premier Services
LICENSE: Annual | Perpetual | Volume/Site
FREE (Community Edition)