Computing the histogram of an array is an inherently parallel task, so parallel computing can be used to speed it up. This article implements a parallel histogram in CUDA.
The code in this article computes the histogram of an int array of 36 * 192 * 10240 elements; for each element it takes the remainder after division by 64 and counts how the remainders are distributed over the bins [0, 63]. Parallel implementations in the literature fall roughly into two categories: 1. every thread writes into the destination array using atomic operations; 2. each part of the input produces a local histogram, and a reduction then merges the partial results into the final one. This article follows the second approach: each thread keeps its local histogram in a 64-int slice of shared memory, and reductions at block scope and then grid scope produce the final distribution.
In addition, the cooperative_groups feature introduced in CUDA 9 is used to perform grid-level synchronization inside a single kernel.
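For contrast, the sketch below shows roughly what the first (atomic) approach could look like; it is not part of this article's implementation, the kernel name is made up for illustration, and d_dst is assumed to be zeroed before launch.

__global__ void kHistogramAtomicSketch(const int* d_src, int* d_dst, int n)
{
    //grid-stride loop: each thread atomically increments the global counter of its element's bin;
    //d_dst is assumed to be zero-initialized before launch
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = idx; i < n; i += stride)
    {
        atomicAdd(&d_dst[d_src[i] % 64], 1);//all threads contend for only 64 counters
    }
}

With only 64 bins, every increment contends for the same handful of counters, which is exactly what the per-thread private histograms of the second approach avoid.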
Code Implementation
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cooperative_groups.h>
#include <iostream>
#include <sstream>
#define HISTOGRAMCOMMONGRID 128
#define HISTOGRAMCOOPERATIVEGROUPGRID 36
#define HISTOGRAMBLOCK 192
#define HISTOGRAMWORKLOAD 10240
#define HISTOGRAMSOURCEBINS (HISTOGRAMCOOPERATIVEGROUPGRID*HISTOGRAMBLOCK*HISTOGRAMWORKLOAD)
#define GAINSOURCEBINS (8*HISTOGRAMSOURCEBINS)
#define HISTOGRAMDSTBINS 64
//macros for the kernel modeled on the CUDA Samples
#define RS_BLOCK 192
#define RS_GRID 72
#define RS_WORKLOAD 5120
#define RS_SM_WIDTH (32*2)
#define RSSOURCEBINS HISTOGRAMSOURCEBINS
namespace cg = cooperative_groups;
//64-bin histogram: grid-wide synchronization, with the grid-level reduction done in global memory
__global__ void kHistogram(int* d_src, int* d_dst)
{
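//each thread keeps a private 64-bin histogram in shared memory: 64 bins * 192 threads * 4 B = 48 KB per block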
__shared__ int sm[HISTOGRAMDSTBINS * HISTOGRAMBLOCK];
cg::grid_group grid = cg::this_grid();
int* d_base = d_src + blockIdx.x * HISTOGRAMBLOCK * HISTOGRAMWORKLOAD;
for (size_t i = 0; i < HISTOGRAMDSTBINS; i++)
{
sm[threadIdx.x + i * HISTOGRAMBLOCK] = 0;
}
for (size_t i = 0; i < HISTOGRAMWORKLOAD; i++)
{
//int local = d_base[threadIdx.x + blockDim.x * i];
int pos = d_base[threadIdx.x + HISTOGRAMBLOCK * i] % HISTOGRAMDSTBINS;
sm[threadIdx.x * HISTOGRAMDSTBINS + pos] += 1;
//__syncthreads();
}
__syncthreads();
//reduce opt
size_t i = 1;
if (threadIdx.x < HISTOGRAMDSTBINS)
{
for (; i < blockDim.x; i++)
{
sm[threadIdx.x] += sm[threadIdx.x + i * HISTOGRAMDSTBINS];
}
d_dst[blockIdx.x * HISTOGRAMDSTBINS + threadIdx.x] = sm[threadIdx.x];
}
//cg::sync(grid);
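//grid-wide barrier: only valid when the kernel is launched via cudaLaunchCooperativeKernel (see TestHistogram)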
grid.sync();
int tid = threadIdx.x + blockIdx.x * HISTOGRAMBLOCK;
if (tid < HISTOGRAMDSTBINS)
{
for (int i = 1; i < HISTOGRAMCOOPERATIVEGROUPGRID; ++i)
{
d_dst[tid] += d_dst[tid + i * HISTOGRAMDSTBINS];
}
}
}
//64-bin histogram (optimized): grid-wide synchronization, grid-level reduction in global memory
__global__ void kHistogramOpt(int* d_src, int* d_dst)
{
__shared__ int sm[HISTOGRAMDSTBINS * HISTOGRAMBLOCK];
cg::grid_group grid = cg::this_grid();
int* d_base = d_src + blockIdx.x * HISTOGRAMBLOCK * HISTOGRAMWORKLOAD;
#pragma unroll
for (size_t i = 0; i < HISTOGRAMDSTBINS; i++)
{
sm[threadIdx.x + i * HISTOGRAMBLOCK] = 0;
}
#pragma unroll
for (size_t i = 0; i < HISTOGRAMWORKLOAD; i++)
{
//int pos = d_base[threadIdx.x + HISTOGRAMBLOCK * i] % HISTOGRAMDSTBINS;
//optimization: replace % 64 with a bitwise AND (64 is a power of two)
int pos = d_base[threadIdx.x + HISTOGRAMBLOCK * i] & 0x3F;
sm[threadIdx.x * HISTOGRAMDSTBINS + pos] += 1;
}
__syncthreads();
//reduce opt
if (threadIdx.x < HISTOGRAMDSTBINS)
{
//opt register and unroll
int Tmp = 0;
#pragma unroll
for (size_t i = 0; i < blockDim.x; i++)
{
Tmp += sm[threadIdx.x + i * HISTOGRAMDSTBINS];
}
d_dst[blockIdx.x * HISTOGRAMDSTBINS + threadIdx.x] = Tmp;
}
//cg::sync(grid);
grid.sync();
int tid = threadIdx.x + blockIdx.x * HISTOGRAMBLOCK;
if (tid < HISTOGRAMDSTBINS)
{
//opt register and unroll
int tmp = 0;
for (int i = 0; i < HISTOGRAMCOOPERATIVEGROUPGRID; ++i)
{
tmp += d_dst[tid + i * HISTOGRAMDSTBINS];
}
d_dst[tid] = tmp;
}
}
//optimization modeled on the CUDA Samples histogram
__global__ void kHistogram_OptRefSampls(int* d_src, int* d_dst)
{
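//shared memory is used as [3 groups][64 bins][64 short counters]; each thread owns one counter column,
//and the lane*2 + warp-parity offset below spreads the 32 lanes of a warp over 32 distinct banks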
__shared__ unsigned short sm[HISTOGRAMDSTBINS * RS_BLOCK];
int* d_base = d_src + blockIdx.x * HISTOGRAMBLOCK * HISTOGRAMWORKLOAD;
unsigned int* transSM = (unsigned int*)sm;//clear two 16-bit counters per 32-bit store
#pragma unroll
for (size_t i = 0; i < HISTOGRAMDSTBINS/(sizeof(int)/sizeof(short)); i++)
{
transSM[threadIdx.x + i * RS_BLOCK] = 0;
}
//this synchronization could be dropped if every thread initialized only the shared memory it uses itself
__syncthreads();
int t = (threadIdx.x & 0x1F)*2 + ((threadIdx.x >> 5) & 0x1) + ((threadIdx.x >> 6) & 0x3)* RS_SM_WIDTH* HISTOGRAMDSTBINS; //observed in testing: one thread in one of the warps seemed to index past the shared-memory range, and the VS debugger flagged the accesses as out of bounds
unsigned short* threadbase = sm + t;//threadIdx.x * 2 + ((threadIdx.x >> 5) & 0x1) + ((threadIdx.x >> 6) & 0x3)* RS_SM_WIDTH * HISTOGRAMDSTBINS;
#pragma unroll
for (size_t i = 0; i < RS_WORKLOAD; i++)
{
int pos = d_base[threadIdx.x + HISTOGRAMBLOCK * i] & 0x3F;
threadbase[pos * RS_SM_WIDTH] += 1;
}
__syncthreads();
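//merge: thread t sums the 64 counters of row t (one bin of one group) and atomically adds the sum into
//global bin t % 64; the rotated start index staggers the reads across banks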
threadbase = sm + threadIdx.x * RS_SM_WIDTH;
int pos = (threadIdx.x & 0x3F)/* * 2*/;
int LocalSum = 0;
#pragma unroll
for (size_t i = 0; i < RS_SM_WIDTH; i++)
{
LocalSum += threadbase[(pos + i) & 0X3f];
}
atomicAdd(d_dst + (threadIdx.x & 0x3F),LocalSum);
}
__device__ void Reduce64bins(int* iReduceSrc, int iReduceSize, int* iReduceDst)
{
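//two-phase reduction: fold the iReduceSize entries down to 192 partial sums with stride HISTOGRAMBLOCK,
//then fold those 192 into 64 bins; correct because entry j holds a count for bin j % 64 and both strides are multiples of 64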
for (int i = 1; i < (iReduceSize / HISTOGRAMBLOCK); i++)
{
iReduceSrc[threadIdx.x] += iReduceSrc[threadIdx.x + HISTOGRAMBLOCK * i];
}
__syncthreads();
if (threadIdx.x < HISTOGRAMDSTBINS)
{
iReduceSrc[threadIdx.x] += iReduceSrc[threadIdx.x + HISTOGRAMDSTBINS];
iReduceSrc[threadIdx.x] += iReduceSrc[threadIdx.x + HISTOGRAMDSTBINS * 2];
iReduceDst[threadIdx.x] = iReduceSrc[threadIdx.x];
}
}
//64-bin histogram: grid-wide synchronization, with the grid-level reduction done in shared memory
__global__ void kHistogramReduce(int* d_src, int* d_dst)
{
__shared__ int sm[HISTOGRAMDSTBINS * HISTOGRAMBLOCK];
cg::grid_group grid = cg::this_grid();
int* d_base = d_src + blockIdx.x * HISTOGRAMBLOCK * HISTOGRAMWORKLOAD;
for (size_t i = 0; i < HISTOGRAMDSTBINS; i++)
{
sm[threadIdx.x + i * HISTOGRAMBLOCK] = 0;
}
for (size_t i = 0; i < HISTOGRAMWORKLOAD; i++)
{
//int local = d_base[threadIdx.x + blockDim.x * i];
int pos = d_base[threadIdx.x + HISTOGRAMBLOCK * i] % HISTOGRAMDSTBINS;
sm[threadIdx.x * HISTOGRAMDSTBINS + pos] += 1;
//__syncthreads();
}
__syncthreads();
//reduce opt
Reduce64bins(sm, HISTOGRAMDSTBINS * HISTOGRAMBLOCK, d_dst + HISTOGRAMDSTBINS * blockIdx.x);
//cg::sync(grid);
grid.sync();
if (blockIdx.x == 1)
{
for (size_t i = 0; i < (HISTOGRAMDSTBINS * HISTOGRAMCOOPERATIVEGROUPGRID / HISTOGRAMBLOCK); i++)
{
sm[threadIdx.x + i * HISTOGRAMBLOCK] = d_dst[threadIdx.x + i * HISTOGRAMBLOCK];
}
Reduce64bins(sm, HISTOGRAMDSTBINS * HISTOGRAMCOOPERATIVEGROUPGRID, d_dst);
}
}
void InitHistogramSourceBins(int* h_src, int size)
{
for (size_t i = 0; i < size; i++)
{
h_src[i] = i;
}
}
bool VerifyHistogram(int* h_dst, int size)
{
int dstcount = size / HISTOGRAMDSTBINS;
for (size_t i = 0; i < HISTOGRAMDSTBINS; i++)
{
if (h_dst[i] != dstcount)
{
printf("not equal index %d value %d \n", i, h_dst[i]);
return false;
}
}
printf("equal \n");
return true;
}
//64-bin histogram: build per-block partial histograms from the raw data
__global__ void kHistogramLocal(int* d_src, int* d_dst)
{
__shared__ int sm[HISTOGRAMDSTBINS * HISTOGRAMBLOCK];
int* d_base = d_src + blockIdx.x * HISTOGRAMBLOCK * HISTOGRAMWORKLOAD;
for (size_t i = 0; i < HISTOGRAMDSTBINS; i++)
{
sm[threadIdx.x + i * HISTOGRAMBLOCK] = 0;
}
for (size_t i = 0; i < HISTOGRAMWORKLOAD; i++)
{
//int local = d_base[threadIdx.x + blockDim.x * i];
int pos = d_base[threadIdx.x + HISTOGRAMBLOCK * i] % HISTOGRAMDSTBINS;
sm[threadIdx.x * HISTOGRAMDSTBINS + pos] += 1;
//__syncthreads();
}
__syncthreads();
if (threadIdx.x < HISTOGRAMDSTBINS)
{
for (size_t i = 1; i < blockDim.x; i++)
{
sm[threadIdx.x] += sm[threadIdx.x + i * HISTOGRAMDSTBINS];
}
d_dst[blockIdx.x * HISTOGRAMDSTBINS + threadIdx.x] = sm[threadIdx.x];
}
}
//64-bin histogram: reduce the per-block partial results
__global__ void kReduceGrid(int* d_src, int* d_dst, int iReduceSize)
{
int iworkload = 1 + (iReduceSize - 1) / (blockDim.x * gridDim.x);//ceiling division: elements handled per thread slot
int blockworkstartpos = blockIdx.x * blockDim.x * iworkload;
extern __shared__ int sm[];
sm[threadIdx.x] = 0;
if ((blockworkstartpos + blockDim.x * iworkload) > iReduceSize)
{
iworkload = (iReduceSize - blockworkstartpos) / blockDim.x;
}
for (size_t i = 0; i < iworkload; i++)
{
sm[threadIdx.x] += d_src[blockworkstartpos + i * blockDim.x + threadIdx.x];
}
__syncthreads();
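//tree-reduce the 1024 partial sums down to 64; every stride is a multiple of 64, so sm[t] only ever
//accumulates counts belonging to bin t % 64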
for (size_t i = (blockDim.x / 2); i >= HISTOGRAMDSTBINS; i /= 2)
{
if (threadIdx.x < i)
{
sm[threadIdx.x] += sm[threadIdx.x + i];
}
__syncthreads();
}
if (threadIdx.x < HISTOGRAMDSTBINS)
{
d_dst[threadIdx.x] = sm[threadIdx.x];
}
}
void TestHistogram(int iFuncIdx)
{
int* h_src = (int*)malloc(GAINSOURCEBINS * sizeof(int));
int* h_dst = (int*)malloc(HISTOGRAMDSTBINS * HISTOGRAMCOOPERATIVEGROUPGRID * sizeof(int));
memset(h_dst, 0, HISTOGRAMDSTBINS * HISTOGRAMCOOPERATIVEGROUPGRID * sizeof(int));
InitHistogramSourceBins(h_src, GAINSOURCEBINS);
int* d_src;
int* d_dst;
cudaMalloc(&d_src, GAINSOURCEBINS * sizeof(int));
cudaMalloc(&d_dst, HISTOGRAMDSTBINS * HISTOGRAMCOOPERATIVEGROUPGRID * sizeof(int));
cudaMemcpy(d_src, h_src, GAINSOURCEBINS * sizeof(int), cudaMemcpyHostToDevice);
void* param[] = { (void*)&d_src,(void*)&d_dst };
dim3 grid(HISTOGRAMCOOPERATIVEGROUPGRID);
dim3 block(HISTOGRAMBLOCK);
cudaError_t rtn = cudaSuccess;
switch (iFuncIdx)
{
case 0:
std::cout << "call kHistogram \n";
cudaLaunchCooperativeKernel((void*)kHistogram, grid, block, param);
cudaDeviceSynchronize();
rtn = cudaGetLastError();
cudaMemcpy(h_dst, d_dst, HISTOGRAMDSTBINS * HISTOGRAMCOOPERATIVEGROUPGRID * sizeof(int), cudaMemcpyDeviceToHost);
VerifyHistogram(h_dst, HISTOGRAMSOURCEBINS);
break;
case 3:
std::cout << "call kHistogramOpt \n";
cudaLaunchCooperativeKernel((void*)kHistogramOpt, grid, block, param);
cudaDeviceSynchronize();
rtn = cudaGetLastError();
cudaMemcpy(h_dst, d_dst, HISTOGRAMDSTBINS * HISTOGRAMCOOPERATIVEGROUPGRID * sizeof(int), cudaMemcpyDeviceToHost);
VerifyHistogram(h_dst, HISTOGRAMSOURCEBINS);
break;
case 4:
{
std::cout << "call kHistogram_OptRefSampls \n";
kHistogram_OptRefSampls<<<RS_GRID, RS_BLOCK>>>(d_src, d_dst);
cudaDeviceSynchronize();
rtn = cudaGetLastError();
cudaMemcpy(h_dst, d_dst, HISTOGRAMDSTBINS * sizeof(int), cudaMemcpyDeviceToHost);
VerifyHistogram(h_dst, HISTOGRAMSOURCEBINS);
}
break;
case 1:
std::cout << "call kHistogramReduce \n";
cudaLaunchCooperativeKernel((void*)kHistogramReduce, grid, block, param);
cudaDeviceSynchronize();
rtn = cudaGetLastError();
cudaMemcpy(h_dst, d_dst, HISTOGRAMDSTBINS * HISTOGRAMCOOPERATIVEGROUPGRID * sizeof(int), cudaMemcpyDeviceToHost);
VerifyHistogram(h_dst, HISTOGRAMSOURCEBINS);
break;
case 2:
std::cout << "call func 3 \n";
grid.x *= 8;
kHistogramLocal<<<grid, block>>>(d_src, d_dst);
kReduceGrid<<<1, 1024, 4 * 1024>>>(d_dst, d_dst, grid.x * HISTOGRAMDSTBINS);
cudaDeviceSynchronize();
rtn = cudaGetLastError();
cudaMemcpy(h_dst, d_dst, HISTOGRAMDSTBINS * sizeof(int), cudaMemcpyDeviceToHost);
VerifyHistogram(h_dst, GAINSOURCEBINS);
break;
default:
break;
}
std::cout << "rtn is " << cudaGetErrorString(rtn);
cudaFree(d_src);
cudaFree(d_dst);
free(h_src);
free(h_dst);
}
int main(int argc, char** argv)
{
if (argc < 2)
{
std::cout << "usage: <program> <kernel index 0-4>\n";
return -1;
}
std::string sInput = argv[1];
std::stringstream ss;
ss << sInput;
int iFuncIdx = 0;
ss >> iFuncIdx;
TestHistogram(iFuncIdx);
return 0;
}
Analysis
Two cooperative-group versions are implemented above: kHistogram performs the block-level reduction with only 64 threads, while kHistogramReduce uses a full block of 192 threads for it.
Timed on an RTX 2070, the two versions come out almost identical. Profiling shows that both leave most of the GPU's compute resources idle. The reason is as follows: to synchronize the whole grid inside a single kernel, the kernels are launched with cudaLaunchCooperativeKernel, which checks at launch time whether the requested grid can be simultaneously resident on the device; if the kernel needs more physical resources than the GPU can hold, the launch fails. The implementations here use so much shared memory that only a handful of threads run on each SM, so parallelism is low: the RTX 2070 has 36 SMs with at most 48 KB of shared memory per SM, and each thread needs 64 * 4 bytes of shared memory, so at most 48 KB / (64 * 4 B) = 192 threads can be resident per SM.
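This limit can also be checked programmatically before launching by querying the occupancy API. Below is a minimal sketch (the helper name QueryCooperativeLimit is illustrative) that reuses kHistogram and HISTOGRAMBLOCK from the listing above:

void QueryCooperativeLimit()
{
    int device = 0;
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    //whether the device supports cudaLaunchCooperativeKernel at all
    int supportsCoop = 0;
    cudaDeviceGetAttribute(&supportsCoop, cudaDevAttrCooperativeLaunch, device);
    //how many blocks of kHistogram fit on one SM given its shared-memory and register usage
    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kHistogram, HISTOGRAMBLOCK, 0);
    printf("cooperative launch support: %d\n", supportsCoop);
    printf("max cooperative grid: %d blocks/SM x %d SMs = %d blocks\n",
        blocksPerSM, prop.multiProcessorCount, blocksPerSM * prop.multiProcessorCount);
}

For kHistogram, with 48 KB of static shared memory per block, this should report one block per SM, i.e. a maximum cooperative grid of 36 blocks on the RTX 2070, which matches the grid size used above.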
Summary
Because the implementations here use so much shared memory, cudaLaunchCooperativeKernel cannot launch enough blocks, and resource utilization stays low. One way around this is to move the grid-level reduction into a separate kernel, as the kHistogramLocal / kReduceGrid pair above does. More generally, cooperative_groups comes with launch restrictions, so a kernel that consumes too many physical resources ends up with little parallelism; weigh this trade-off when deciding whether to use it.