1. Knowledge Overview
CPU memory and GPU device memory are two separate address spaces. For a variable in one space to interact with a variable in the other (for example, through assignment), the data must be moved with dedicated transfer functions such as cudaMemcpy. A typical program performs many such transfers, so the transfer overhead is far from negligible. To simplify this, CUDA introduced the "Unified Memory" mechanism, which presents a single managed address space to both host and device and removes the explicit copy calls.
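For contrast, here is a minimal sketch of the traditional explicit-transfer pattern (the buffer name h_x/d_x and the size N are illustrative, not from the example below):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(void)
{
    const int N = 1024;
    const size_t bytes = N * sizeof(float);

    float *h_x = (float *)malloc(bytes);   // host buffer in CPU memory
    float *d_x;
    cudaMalloc(&d_x, bytes);               // separate device buffer in GPU memory

    // Every round trip requires an explicit copy in each direction.
    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
    // ... launch kernels that work on d_x ...
    cudaMemcpy(h_x, d_x, bytes, cudaMemcpyDeviceToHost);

    cudaFree(d_x);
    free(h_x);
    return 0;
}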
2. Experiment Example
Declaring unified memory

Either statically, with the keyword __managed__,

or dynamically, with the runtime call

cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal);
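A minimal sketch showing both declaration forms (the variable names, the kernel reduce_sum, and the size N = 4096 are illustrative):

#include <cstdio>

__managed__ int m_sum;   // statically declared managed variable, visible to host and device

__global__ void reduce_sum(const int *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) atomicAdd(&m_sum, data[i]);
}

int main(void)
{
    const int N = 4096;
    int *data;
    cudaMallocManaged(&data, N * sizeof(int));   // dynamically allocated managed memory
    for (int i = 0; i < N; ++i) data[i] = 1;     // host writes directly, no cudaMemcpy

    m_sum = 0;
    reduce_sum<<<(N + 255) / 256, 256>>>(data, N);
    cudaDeviceSynchronize();                     // synchronize before the host reads managed data
    printf("sum = %d\n", m_sum);                 // expect 4096

    cudaFree(data);
    return 0;
}

Note the cudaDeviceSynchronize() before the host touches managed data: the kernel launch is asynchronous, and reading m_sum earlier would race with the GPU.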
Comparison
// Without unified memory
#include <stdio.h>
#include <math.h>
#include "error.cuh"
#define BLOCK_SIZE 16
// Tiled matrix multiplication using shared memory.
// Note: as written, the kernel effectively assumes square n x n matrices
// (m == n == k), which is how the example invokes it.
__global__ void gpu_matrix_mult_shared(int *d_a, int *d_b, int *d_result, int m, int n, int k)
{
    __shared__ int tile_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ int tile_b[BLOCK_SIZE][BLOCK_SIZE];

    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int tmp = 0;
    int idx;

    // Walk across the tiles covering one row of d_a and one column of d_b.
    for (int sub = 0; sub < gridDim.x; ++sub)
    {
        // Stage one tile of each input matrix into shared memory,
        // padding out-of-range elements with 0.
        idx = row * n + sub * BLOCK_SIZE + threadIdx.x;
        tile_a[threadIdx.y][threadIdx.x] = (row < n && (sub * BLOCK_SIZE + threadIdx.x) < n) ? d_a[idx] : 0;
        idx = (sub * BLOCK_SIZE + threadIdx.y) * n + col;
        tile_b[threadIdx.y][threadIdx.x] = (col < n && (sub * BLOCK_SIZE + threadIdx.y) < n) ? d_b[idx] : 0;
        __syncthreads();   // tiles must be fully loaded before use

        // Multiply-accumulate within the tile (loop variable renamed to t;
        // the original "k" shadowed the kernel parameter k).
        for (int t = 0; t < BLOCK_SIZE; ++t)
        {
            tmp += tile_a[threadIdx.y][t] * tile_b[t][threadIdx.x];
        }
        __syncthreads();   // finish reading the tiles before they are overwritten
    }

    if (row < n && col < n)
    {
        d_result[row * n + col] = tmp;
    }
}

// CPU reference implementation: C(m x k) = A(m x n) * B(n x k).
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k)
{
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {