20240310-CUDA核函数--神经网络推理加速操作

最新推荐文章于 2025-06-04 23:33:34 发布

～秋枫吹散

最新推荐文章于 2025-06-04 23:33:34 发布

阅读量677

点赞数 3

CC 4.0 BY-SA版权

分类专栏： C++ CUDA 文章标签： c++ 人工智能

本文链接：https://blog.csdn.net/LP5956/article/details/136590683

C++ 同时被 2 个专栏收录

7 篇文章

订阅专栏

CUDA

1 篇文章

订阅专栏

grid、block、thread的关系及thread索引的计算
 ChatGPT CUDA 之间的东西

CUDA核编程

1、网格（Grid）、线程块（Block）和线程（Thread）

开普勒架构：每个线程块（Block）最多1024个线程，网格（Grid）在x维度上最多2^31-1个线程块，合计约两万亿个线程
每个Block中的线程可以使用共享内存进行通信和同步
Block之间的线程通信通常需要使用更高级的同步机制，如CUDA提供的原子操作或者互斥锁

2、内存组织

线程的Block

3、核函数的书写

在这里插入图片描述

3.1 图像的核函数书写：　一般是每一个像素分配一个线程

3.2 有关核函数定义的时候

blockDim：每个维度一般取32的倍数（与warp大小对齐），但不能太大：每个Block的线程总数（blockDim.x * blockDim.y * blockDim.z）不能超过1024。我之前设置成256*256（共65536个线程），超出了这个上限，核函数无法启动，所以计算结果出现了问题。

// Assign one thread to each pixel.
// 32 * 32 = 1024 threads per block.
dim3 blockDim(32, 32);
// Round the block count up in each dimension so the grid covers the whole
// image even when width/height are not multiples of the block size; the
// kernel must therefore bounds-check the threads that fall past the edge.
dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);

3.3 核函数的定义和实现

求线程全局索引
求输入数组数据索引和线程全局索引之间的关系
根据索引进行数据的变换操作

Example 1: BGRBGRBGRBGR --> BBBBGGGGRRRR

下面以我个人对图像做的通道转换操作为例：

针对opencv 中图像.data 数组进行通道分离操作：

img.data是一个一维数组，排列方式是BGRBGRBGRBGR（img已转换为BGR图像）：按行依次存放图像中每一个像素点的三个通道。现在需要利用核函数实现B、G、R三个通道的分离操作。

__global__ void imageConversion(float *inputImage, float *outputImage, int height, int width, int channels)
{
    /**
     * Converts an interleaved image (BGRBGRBGR..., as stored in mat.data)
     * into planar layout (BBB...GGG...RRR...).
     *
     * Launch layout: 2D grid of 2D blocks, one thread per pixel; threads
     * that fall outside the image do nothing.
     *
     * @param inputImage   interleaved pixels, height * width * channels floats
     * @param outputImage  planar result, one height * width plane per copied channel
     * @param height       image height in pixels
     * @param width        image width in pixels
     * @param channels     number of interleaved values per input pixel
     *
     * Values are copied unchanged (the original note states mat.data is in
     * the 0-1 range). At most the first three channels are copied: extra
     * channels (e.g. alpha) are skipped, and when fewer than three channels
     * are present only the existing ones are read and written, so neither
     * buffer is accessed out of bounds.
     */
    // Global 2D coordinates of the pixel handled by this thread.
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    const int idy = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so edge threads may be outside the image.
    if (idx >= width || idy >= height)
    {
        return;
    }

    // Use size_t so the flattened offsets cannot overflow int on large images.
    const size_t planeSize = static_cast<size_t>(height) * width;
    const size_t pixel = static_cast<size_t>(idy) * width + idx;
    const size_t index = pixel * channels; // first value of this pixel in the interleaved input

    // Never touch more channels than the input pixel actually has.
    const int planes = channels < 3 ? channels : 3;
    for (int c = 0; c < planes; ++c)
    {
        // c == 0: Blue, c == 1: Green, c == 2: Red
        outputImage[c * planeSize + pixel] = inputImage[index + c];
    }
}

Example 2: BBBBBGGGGGRRRRR–>RGBRGBRGBRGBRGB

__global__ void rearrangeChannels(const float *inputData, float *outputData, int height, int width) {
    // Converts planar data (BBBB GGGG RRRR) into interleaved RGBRGBRGB...,
    // multiplying every value by 255 on the way.
    // Expects a 2D launch with one thread per pixel.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Threads of the rounded-up grid that lie outside the image have no work.
    if (col >= width || row >= height) {
        return;
    }

    const int planeSize = height * width;  // number of values in one channel plane
    const int pixel = row * width + col;   // offset of this pixel inside a plane

    // Three consecutive output slots belong to this pixel.
    float *dst = outputData + pixel * 3;
    dst[0] = inputData[pixel + 2 * planeSize] * 255;  // Red   (third plane)
    dst[1] = inputData[pixel + planeSize] * 255;      // Green (second plane)
    dst[2] = inputData[pixel] * 255;                  // Blue  (first plane)
}

4、核函数的测试：

针对于核函数操作

1、将数据从cpu内存copy 到GPU显存操作；
2、在GPU上利用核函数进行计算求值操作；
3、将数据从GPU显存中copy 到CPU内存中。

    // Host-side test of the two layout kernels:
    //   1. copy the image from CPU memory to GPU memory,
    //   2. run the kernels (and the network inference) on the GPU,
    //   3. copy the result from GPU memory back to CPU memory.

    // Turn any failing CUDA runtime call into an OpenCV exception right away;
    // an ignored error otherwise makes every later call fail mysteriously.
    const auto checkCuda = [](cudaError_t status) {
        if (status != cudaSuccess)
        {
            CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(status));
        }
    };

    cv::Mat inputImage = cv::imread("/home/wxcwxc/wxcpython/rt4ksr/mdbn/000.png", cv::IMREAD_UNCHANGED);
    if (inputImage.empty())
    {
        CV_Error(cv::Error::StsError, "failed to read the input image");
    }
    // Reduce to 3 channels first, then convert to floats normalized by 1/255.
    cv::cvtColor(inputImage, inputImage, cv::COLOR_RGBA2BGR);
    inputImage.convertTo(inputImage, CV_32FC3, 1.0 / 255.);
    // The raw .data pointer is uploaded as one flat array below.
    CV_Assert(inputImage.isContinuous());

    // Image geometry; the output is twice the input size in each dimension.
    const int batchSize = 1;
    int height = inputImage.rows;
    int width = inputImage.cols;
    int channels = inputImage.channels();
    const int outputHeight = height * 2;
    const int outputWidth = width * 2;

    // Byte sizes are computed in size_t so large images cannot overflow int.
    const size_t imageBytes = static_cast<size_t>(channels) * height * width * sizeof(float);
    const size_t resultBytes = static_cast<size_t>(channels) * outputHeight * outputWidth * sizeof(float);
    const size_t inputSize = static_cast<size_t>(batchSize) * imageBytes;
    const size_t outputSize = static_cast<size_t>(batchSize) * resultBytes;

    float *gpuInData = nullptr;     // planar network input
    float *gputOutData = nullptr;   // planar network output
    // Interleaved buffer: holds the uploaded input first and is reused for the
    // interleaved result, so it is sized for the (larger) output image.
    float *d_inputImage = nullptr;

    checkCuda(cudaMalloc(&d_inputImage, resultBytes));
    checkCuda(cudaMalloc(&gpuInData, inputSize));
    checkCuda(cudaMalloc(&gputOutData, outputSize));

    checkCuda(cudaMemcpy(d_inputImage, inputImage.data, imageBytes, cudaMemcpyHostToDevice));

    // 1. Pre-processing: interleaved -> planar, one thread per input pixel.
    dim3 blockDim(32, 32);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
    imageConversion<<<gridDim, blockDim>>>(d_inputImage, gpuInData, height, width, channels);
    checkCuda(cudaGetLastError()); // launches report configuration errors only through this call

   /*
    2. Inference
    */
     // The author's GPU-accelerated network inference goes here; it is expected
     // to read gpuInData and fill gputOutData.
    /*
        3. Post-processing of the inference result
    */

    // Planar -> interleaved (scaled by 255), one thread per output pixel.
    dim3 blockResDim(32, 32);
    dim3 gridDimRes((outputWidth + blockResDim.x - 1) / blockResDim.x, (outputHeight + blockResDim.y - 1) / blockResDim.y);

    rearrangeChannels <<<gridDimRes, blockResDim>>>(gputOutData, d_inputImage, outputHeight, outputWidth);
    checkCuda(cudaGetLastError());

    // This blocking copy also surfaces any error raised while the kernels ran.
    cv::Mat reconstructedImage(outputHeight, outputWidth, CV_32FC3);
    checkCuda(cudaMemcpy(reconstructedImage.data, d_inputImage, resultBytes, cudaMemcpyDeviceToHost));

    // Release the device buffers now that the result is back on the host.
    checkCuda(cudaFree(d_inputImage));
    checkCuda(cudaFree(gpuInData));
    checkCuda(cudaFree(gputOutData));

    std::string reconstructedImageImagePath = "resres.png";
    if (!cv::imwrite(reconstructedImageImagePath, reconstructedImage))
    {
        CV_Error(cv::Error::StsError, "failed to write the reconstructed image");
    }