hpc_performance
hpc_performance
Victor Eijkhout
Fall 2023
‘Streaming’ operations
// bandwidth.cxx
vector<double> results(nthreads,0.);
for ( int t=0; t<nthreads; t++) {
auto start_point = t*stream_length;
threads.push_back
( thread( [=,&results] () {
results[t] = memory.sumstream(
how_many_repeats,stream_length,start_point);
} ) );
}
for ( auto &t : threads )
t.join();
Aggregate bandwidth
12K
256K
25M
1,000
bandwidth
500
0
0 20 40 60
cores
6 Eijkhout – Performance – Fall 2023
Bandwidth numbers strictly a posteriori
Cache size effects
// setup
for (int iword=0; iword<cachesize_in_words; iword++)
memory[iword] = (iword+1) % cachesize_in_words
// use:
ptr = 0
for (int iword=0; iword<cachesize_in_words; iword++)
ptr = memory[ptr];
Bandwidth
2,500
2,000
bandwidth
1,500
1,000
frontera
500 ls6
linear
0
$10^4$ $10^5$ $10^6$ $10^7$
dataset size
9 Eijkhout – Performance – Fall 2023
Associativity
skx
5 clx
icx
4
nsec
4 6 8 10 12 14
collisions
// regular.c
// Transposing copy: element (row,col) of A receives element (col,row) of B.
// The inner loop varies the last index of A and the first index of B.
for (int row=0; row<N; row++) {
  for (int col=0; col<N; col++) {
    A[row][col] = B[col][row];
  }
}
// blocked.c
// Transposing copy of B into A, done one blocksize x blocksize tile at a time.
// ii and jj advance in steps of blocksize, so they already are the first
// row/column index of the current tile and must not be scaled again.
for (int ii=0; ii<N; ii+=blocksize)
  for (int jj=0; jj<N; jj+=blocksize)
    // MIN clips the last tile when N is not a multiple of blocksize
    for (int i=ii; i<MIN(N,ii+blocksize); i++)
      for (int j=jj; j<MIN(N,jj+blocksize); j++)
        A[i][j] = B[j][i];