在之前的两篇博客中我已经介绍了
VMware虚拟机:Jetson Orin NX 16G系统烧录教程_jetson orin nx烧录-CSDN博客
Jetson orin NX 16GB:opencv4.8.0使用CUDA加速_jetson nx opencv with cuda-CSDN博客
现在这篇博客主要是介绍使用Jetson orin NX 16GB在VScode中编写一个CUDA加速项目,当然使用的环境如下:
1、新建项目文件
在这里我先打开终端创建了一个cuda_project文件夹并打开这个文件夹
mkdir cuda_project
cd cuda_project
再在cuda_project文件夹下创建项目所需要的文件
mkdir assets build include src
touch CMakeLists.txt
touch include/cuda_utils.h
touch src/cuda_kernels.cu src/main.cpp
再打开VScode就可以看到下面的项目目录:
2、配置VScode环境
接下来就是配置VScode中的环境了
先在VScode的应用商店中下载以下扩展:
然后再按Ctrl+Shift+P,输入C/C++:edit configurations(JSON)
点击后就会弹出一个c_cpp_properties.json文件(主要是指定项目代码的库和环境路径以及版本)
我们需要对这个进行修改(注意版本差异和路径差异):
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/include",
"/usr/local/cuda/include",
"/usr/local/include/opencv4",
"${workspaceFolder}/**"
],
"defines": [],
"compilerPath": "/usr/bin/g++",
"cStandard": "c11",
"cppStandard": "c++14",
//"intellisenseMode": "linux-gcc-arm64", // 注意架构是arm64
"configurationProvider": "ms-vscode.cmake-tools"
}
],
"version": 4
}
然后再按Ctrl+Shift+P,输入CMake:Select a kit,点击
并选择如下编译器(每个人的版本不一样,选的会有差异)
之后再配置CMakeLists.txt文件(注意路径和版本差异)
# cmake_minimum_required must be the very first command so the policy
# version is established before any other command runs.
cmake_minimum_required(VERSION 3.18)

# Manually pin the CUDA toolkit paths (adjust for your installation).
# CMAKE_CUDA_COMPILER must be set before project() enables the CUDA language.
set(CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda")
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")

project(cuda_test VERSION 1.0 LANGUAGES C CXX CUDA)

# Opt in to the new CUDA-architecture policy to avoid the CMP0104 warning
if(POLICY CMP0104)
  cmake_policy(SET CMP0104 NEW)
endif()

#-------------------------------------------------------------------------------
# 1. Language standards and compile options
#-------------------------------------------------------------------------------
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# Target GPU architecture: SM 8.7 (Jetson Orin NX)
set(CMAKE_CUDA_ARCHITECTURES "87")

#-------------------------------------------------------------------------------
# 2. Dependencies — modern CMake prefers find_package(CUDAToolkit)
#    over the deprecated find_package(CUDA)
#-------------------------------------------------------------------------------
find_package(OpenCV 4.8 REQUIRED COMPONENTS core highgui imgproc)
find_package(CUDAToolkit REQUIRED)

#-------------------------------------------------------------------------------
# 3. Print what was found, for easier debugging
#-------------------------------------------------------------------------------
message(STATUS "Found OpenCV: ${OpenCV_VERSION} at ${OpenCV_DIR}")
message(STATUS " Includes: ${OpenCV_INCLUDE_DIRS}")
message(STATUS " Libraries: ${OpenCV_LIBS}")
message(STATUS "Found CUDA Toolkit: ${CUDAToolkit_VERSION}")

#-------------------------------------------------------------------------------
# 4. Executable target
#-------------------------------------------------------------------------------
add_executable(${PROJECT_NAME}
  src/main.cpp
  src/cuda_kernels.cu
)

#-------------------------------------------------------------------------------
# 5. Include directories and link libraries
#-------------------------------------------------------------------------------
target_include_directories(${PROJECT_NAME} PRIVATE
  ${CMAKE_CURRENT_SOURCE_DIR}/include
  ${OpenCV_INCLUDE_DIRS}
)
target_link_libraries(${PROJECT_NAME} PRIVATE
  ${OpenCV_LIBS}
  CUDA::cudart  # imported target provided by find_package(CUDAToolkit)
)

#-------------------------------------------------------------------------------
# 6. CUDA-specific target properties
#-------------------------------------------------------------------------------
set_target_properties(${PROJECT_NAME} PROPERTIES
  CUDA_ARCHITECTURES "87"            # ensure the architecture on the target too
  CUDA_SEPARABLE_COMPILATION ON
)
3、测试代码
首先是主函数main.cpp,主要作用是CPU计算时间和输出结果以及链接.cu文件
#include <iostream>
#include <chrono>
#include <cstdlib>
#include <cmath>
#include "cuda_utils.h"
#include <cstdio>
// Forward declarations of the GPU benchmark wrappers implemented in
// src/cuda_kernels.cu. Each returns the measured time in milliseconds:
// total wall-clock time when includeTransfer is true, kernel-only time otherwise.
extern float vectorAddGPU(const float* a, const float* b, float* c, int n, bool includeTransfer);
extern float vectorDotGPU(const float* a, const float* b, int n, bool includeTransfer);
extern float matrixMulGPU(float* A, float* B, float* C, int N, bool includeTransfer);
// CPU reference: element-wise vector addition, c[i] = a[i] + b[i].
// Prints and returns the wall-clock time of the loop in milliseconds.
float vectorAddCPU(const float* a, const float* b, float* c, int n) {
    const auto t0 = std::chrono::high_resolution_clock::now();
    for (int idx = 0; idx < n; ++idx)
        c[idx] = a[idx] + b[idx];
    const auto t1 = std::chrono::high_resolution_clock::now();
    const float elapsed_ms =
        std::chrono::duration<float, std::milli>(t1 - t0).count();
    std::cout << " CPU执行时间: " << elapsed_ms << " ms" << std::endl;
    return elapsed_ms;
}
// CPU reference: artificially compute-heavy "dot product" of two float
// vectors. The extra sinf/cosf/sqrtf terms raise arithmetic intensity so the
// CPU-vs-GPU comparison is meaningful; the numeric result itself is discarded.
// Prints and returns the wall-clock time of the loop in milliseconds.
float vectorDotCPU(const float* a, const float* b, int n) {
    auto start = std::chrono::high_resolution_clock::now();
    float result = 0.0f;
    for (int i = 0; i < n; i++) {
        result += a[i] * b[i];
        result += sinf(a[i]) * cosf(b[i]);
        // fabsf keeps the computation in float, matching sinf/cosf/sqrtf
        result += sqrtf(fabsf(a[i] - b[i]));
    }
    // Publish the accumulator through a volatile sink so the optimizer cannot
    // dead-code-eliminate the loop (the value is otherwise unused), which
    // would make the measured time meaningless at -O2.
    volatile float sink = result;
    (void)sink;
    auto end = std::chrono::high_resolution_clock::now();
    float duration = std::chrono::duration<float, std::milli>(end - start).count();
    std::cout << " CPU执行时间: " << duration << " ms" << std::endl;
    return duration;
}
// CPU reference: naive O(N^3) multiply of row-major N x N matrices, with an
// extra 0.1*sin*cos term per multiply-add to raise arithmetic intensity.
// C is overwritten. Prints and returns the elapsed time in milliseconds.
float matrixMulCPU(float* A, float* B, float* C, int N) {
    const auto t0 = std::chrono::high_resolution_clock::now();
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            float acc = 0.0f;
            for (int k = 0; k < N; ++k) {
                const float lhs = A[row * N + k];
                const float rhs = B[k * N + col];
                acc += lhs * rhs;
                acc += 0.1f * sinf(lhs) * cosf(rhs);
            }
            C[row * N + col] = acc;
        }
    }
    const auto t1 = std::chrono::high_resolution_clock::now();
    const float elapsed_ms =
        std::chrono::duration<float, std::milli>(t1 - t0).count();
    std::cout << " CPU执行时间: " << elapsed_ms << " ms" << std::endl;
    return elapsed_ms;
}
// Compares two float arrays element-by-element.
// Returns true when every pair differs by at most `tolerance`; otherwise
// prints the first mismatching index and returns false.
bool verifyResults(const float* a, const float* b, int n, float tolerance = 1e-5f) {
    for (int idx = 0; idx < n; ++idx) {
        const float diff = std::abs(a[idx] - b[idx]);
        if (!(diff > tolerance))
            continue;
        std::cout << "验证失败 at index " << idx << ": "
                  << a[idx] << " != " << b[idx]
                  << " (差值: " << std::abs(a[idx] - b[idx]) << ")" << std::endl;
        return false;
    }
    return true;
}
// Prints the CPU/GPU speedup ratio for `description`, followed by a marker
// saying which side won (ratio > 1 means the GPU was faster).
void printSpeedup(float cpu_time, float gpu_time, const std::string& description) {
    const float ratio = cpu_time / gpu_time;
    std::cout << " " << description << "加速比: " << ratio << "x (CPU/GPU)";
    const char* verdict = (ratio > 1) ? " ✓ GPU更快"
                        : (ratio < 1) ? " ✗ CPU更快"
                                      : " - 性能相同";
    std::cout << verdict << std::endl;
}
// Runs three CPU-vs-GPU benchmarks of increasing arithmetic intensity
// (vector add, vector dot product, matrix multiply), printing timings,
// speedups, and — where a reference result is available — a correctness check.
void runPerformanceTest() {
    std::cout << "=== GPU vs CPU 性能对比测试 ===" << std::endl;
    std::cout << "设备: NVIDIA Jetson Orin NX" << std::endl;
    std::cout << "CUDA: 12.6.68" << std::endl << std::endl;
    // Test 1: vector addition (low arithmetic intensity)
    std::cout << "1. 测试向量加法 (计算强度低):" << std::endl;
    const int vectorSize = 1000000;
    float *h_a = new float[vectorSize];
    float *h_b = new float[vectorSize];
    float *h_c_cpu = new float[vectorSize];
    float *h_c_gpu = new float[vectorSize];
    // Fill inputs with pseudo-random values in [0, 1] (rand() is unseeded,
    // so runs are reproducible)
    for (int i = 0; i < vectorSize; i++) {
        h_a[i] = static_cast<float>(rand()) / RAND_MAX;
        h_b[i] = static_cast<float>(rand()) / RAND_MAX;
    }
    std::cout << "向量大小: " << vectorSize << " 个元素" << std::endl;
    std::cout << "CPU版本:" << std::endl;
    float cpu_time = vectorAddCPU(h_a, h_b, h_c_cpu, vectorSize);
    std::cout << "GPU版本 (含数据传输):" << std::endl;
    float gpu_total_time = vectorAddGPU(h_a, h_b, h_c_gpu, vectorSize, true);
    std::cout << "GPU版本 (仅计算):" << std::endl;
    // NOTE: with includeTransfer=false nothing is copied back into h_c_gpu,
    // so the verification below checks the result of the first GPU run.
    float gpu_compute_time = vectorAddGPU(h_a, h_b, h_c_gpu, vectorSize, false);
    printSpeedup(cpu_time, gpu_total_time, "总时间");
    printSpeedup(cpu_time, gpu_compute_time, "纯计算");
    std::cout << "验证结果: ";
    if (verifyResults(h_c_cpu, h_c_gpu, vectorSize)) {
        std::cout << "✓ 成功" << std::endl;
    } else {
        std::cout << "✗ 失败" << std::endl;
    }
    delete[] h_a;
    delete[] h_b;
    delete[] h_c_cpu;
    delete[] h_c_gpu;
    // Test 2: vector dot product (medium arithmetic intensity).
    // No correctness check here: the CPU version discards its scalar result.
    std::cout << "\n2. 测试向量点积 (计算强度中等):" << std::endl;
    h_a = new float[vectorSize];
    h_b = new float[vectorSize];
    for (int i = 0; i < vectorSize; i++) {
        h_a[i] = static_cast<float>(rand()) / RAND_MAX;
        h_b[i] = static_cast<float>(rand()) / RAND_MAX;
    }
    std::cout << "向量大小: " << vectorSize << " 个元素" << std::endl;
    std::cout << "CPU版本:" << std::endl;
    cpu_time = vectorDotCPU(h_a, h_b, vectorSize);
    std::cout << "GPU版本 (含数据传输):" << std::endl;
    gpu_total_time = vectorDotGPU(h_a, h_b, vectorSize, true);
    std::cout << "GPU版本 (仅计算):" << std::endl;
    gpu_compute_time = vectorDotGPU(h_a, h_b, vectorSize, false);
    printSpeedup(cpu_time, gpu_total_time, "总时间");
    printSpeedup(cpu_time, gpu_compute_time, "纯计算");
    delete[] h_a;
    delete[] h_b;
    // Test 3: matrix multiplication (high arithmetic intensity)
    std::cout << "\n3. 测试矩阵乘法 (计算强度高):" << std::endl;
    const int matrixSize = 512;
    float *h_A = new float[matrixSize * matrixSize];
    float *h_B = new float[matrixSize * matrixSize];
    float *h_C_cpu = new float[matrixSize * matrixSize];
    float *h_C_gpu = new float[matrixSize * matrixSize];
    for (int i = 0; i < matrixSize * matrixSize; i++) {
        h_A[i] = static_cast<float>(rand()) / RAND_MAX;
        h_B[i] = static_cast<float>(rand()) / RAND_MAX;
    }
    std::cout << "矩阵大小: " << matrixSize << "x" << matrixSize << std::endl;
    std::cout << "CPU版本:" << std::endl;
    cpu_time = matrixMulCPU(h_A, h_B, h_C_cpu, matrixSize);
    std::cout << "GPU版本 (含数据传输):" << std::endl;
    gpu_total_time = matrixMulGPU(h_A, h_B, h_C_gpu, matrixSize, true);
    std::cout << "GPU版本 (仅计算):" << std::endl;
    gpu_compute_time = matrixMulGPU(h_A, h_B, h_C_gpu, matrixSize, false);
    printSpeedup(cpu_time, gpu_total_time, "总时间");
    printSpeedup(cpu_time, gpu_compute_time, "纯计算");
    std::cout << "验证结果: ";
    // Looser tolerance (1e-3): float accumulation order differs between
    // the CPU loop and the GPU kernel
    if (verifyResults(h_C_cpu, h_C_gpu, matrixSize * matrixSize, 1e-3f)) {
        std::cout << "✓ 成功" << std::endl;
    } else {
        std::cout << "✗ 失败" << std::endl;
    }
    delete[] h_A;
    delete[] h_B;
    delete[] h_C_cpu;
    delete[] h_C_gpu;
    std::cout << "\n=== 测试完成 ===" << std::endl;
    std::cout << "总结:" << std::endl;
    std::cout << "- 计算强度低的任务 (如向量加法): GPU可能没有优势甚至更慢" << std::endl;
    std::cout << "- 计算强度中等的任务 (如向量点积): GPU开始显示优势" << std::endl;
    std::cout << "- 计算强度高的任务 (如矩阵乘法): GPU优势明显" << std::endl;
    std::cout << "- 数据传输开销是影响GPU性能的重要因素" << std::endl;
}
// Entry point: runs the CPU-vs-GPU benchmark suite and exits.
int main() {
    runPerformanceTest();
    return 0;
}
之后是cuda_kernels.cu文件,主要是调用GPU加速计算过程,这里每种条件只用了一个核函数。
#include "cuda_utils.h"
#include <cmath>
#include <chrono>
// Simple operation: element-wise vector addition, c[i] = a[i] + b[i].
// Launch with a 1-D grid; threads whose global index falls past n do
// nothing, so n need not be a multiple of the block size.
__global__ void vectorAddKernel(const float* a, const float* b, float* c, int n) {
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;
    if (gid >= n)
        return;
    c[gid] = a[gid] + b[gid];
}
// Medium-intensity operation: per-block partial "dot product" with extra
// transcendental terms (matching vectorDotCPU), reduced in shared memory and
// accumulated into *result with one atomicAdd per block.
// Preconditions:
//  - launch with dynamic shared memory of blockDim.x * sizeof(float)
//  - blockDim.x must be a power of two (the tree reduction halves it each step)
//  - *result must be zeroed before launch
__global__ void vectorDotKernel(const float* a, const float* b, float* result, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    extern __shared__ float sdata[];
    float sum = 0.0f;
    if (i < n) {
        sum += a[i] * b[i];
        sum += sinf(a[i]) * cosf(b[i]);
        sum += sqrtf(fabsf(a[i] - b[i]));  // fabsf: keep the math in float
    }
    sdata[threadIdx.x] = sum;  // out-of-range threads contribute 0
    __syncthreads();
    // Tree reduction in shared memory; the barrier stays outside the
    // divergent branch so every thread in the block reaches it.
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) {
            sdata[threadIdx.x] += sdata[threadIdx.x + s];
        }
        __syncthreads();
    }
    // Thread 0 publishes the block's partial sum
    if (threadIdx.x == 0) {
        atomicAdd(result, sdata[0]);
    }
}
// High-intensity operation: naive multiply of row-major N x N matrices with
// an extra 0.1*sin*cos term per multiply-add. One thread computes one output
// element; launch with a 2-D grid/block covering N x N (extra threads exit).
__global__ void matrixMulKernel(float* A, float* B, float* C, int N) {
    const int r = blockDim.y * blockIdx.y + threadIdx.y;
    const int c = blockDim.x * blockIdx.x + threadIdx.x;
    if (r >= N || c >= N)
        return;
    float acc = 0.0f;
    for (int k = 0; k < N; ++k) {
        const float av = A[r * N + k];
        const float bv = B[k * N + c];
        acc += av * bv;
        acc += 0.1f * sinf(av) * cosf(bv);
    }
    C[r * N + c] = acc;
}
// GPU vector-addition benchmark wrapper: c = a + b for n floats.
// includeTransfer=true:  copies inputs up and the result back, returns total
//                        wall-clock time in ms (kernel time measured by events).
// includeTransfer=false: timing-only mode — inputs are NOT copied, so the
//                        kernel runs on uninitialized device memory and c is
//                        NOT copied back; returns kernel-only time in ms.
float vectorAddGPU(const float* a, const float* b, float* c, int n, bool includeTransfer) {
    float *d_a, *d_b, *d_c;
    CHECK_CUDA_ERROR(cudaMalloc(&d_a, n * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMalloc(&d_b, n * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMalloc(&d_c, n * sizeof(float)));
    auto total_start = std::chrono::high_resolution_clock::now();
    if (includeTransfer) {
        CHECK_CUDA_ERROR(cudaMemcpy(d_a, a, n * sizeof(float), cudaMemcpyHostToDevice));
        CHECK_CUDA_ERROR(cudaMemcpy(d_b, b, n * sizeof(float), cudaMemcpyHostToDevice));
    }
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;  // ceil-div covers the tail
    cudaEvent_t start, stop;
    CHECK_CUDA_ERROR(cudaEventCreate(&start));
    CHECK_CUDA_ERROR(cudaEventCreate(&stop));
    CHECK_CUDA_ERROR(cudaEventRecord(start));
    vectorAddKernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    CHECK_LAST_CUDA_ERROR();  // launches return no status; surface config errors here
    CHECK_CUDA_ERROR(cudaEventRecord(stop));
    CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
    float compute_time = 0;
    CHECK_CUDA_ERROR(cudaEventElapsedTime(&compute_time, start, stop));
    if (includeTransfer) {
        CHECK_CUDA_ERROR(cudaMemcpy(c, d_c, n * sizeof(float), cudaMemcpyDeviceToHost));
    }
    auto total_end = std::chrono::high_resolution_clock::now();
    float total_time = std::chrono::duration<float, std::milli>(total_end - total_start).count();
    // Report timings
    if (includeTransfer) {
        printf(" GPU计算时间: %.3f ms, 总时间(含传输): %.3f ms, 传输时间: %.3f ms\n",
               compute_time, total_time, total_time - compute_time);
    }
    else {
        printf(" GPU计算时间: %.3f ms\n", compute_time);
    }
    CHECK_CUDA_ERROR(cudaEventDestroy(start));
    CHECK_CUDA_ERROR(cudaEventDestroy(stop));
    CHECK_CUDA_ERROR(cudaFree(d_a));
    CHECK_CUDA_ERROR(cudaFree(d_b));
    CHECK_CUDA_ERROR(cudaFree(d_c));
    return includeTransfer ? total_time : compute_time;
}
// GPU dot-product benchmark wrapper (with extra transcendental terms,
// matching vectorDotCPU). Each block reduces in shared memory and atomically
// accumulates into d_result.
// includeTransfer=true:  copies inputs up and the scalar result back; returns
//                        total wall-clock time in ms.
// includeTransfer=false: timing-only mode — the kernel runs on uninitialized
//                        device buffers and nothing is copied back; returns
//                        kernel-only time in ms.
float vectorDotGPU(const float* a, const float* b, int n, bool includeTransfer) {
    float *d_a, *d_b, *d_result;
    float h_result = 0.0f;
    CHECK_CUDA_ERROR(cudaMalloc(&d_a, n * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMalloc(&d_b, n * sizeof(float)));
    CHECK_CUDA_ERROR(cudaMalloc(&d_result, sizeof(float)));
    auto total_start = std::chrono::high_resolution_clock::now();
    CHECK_CUDA_ERROR(cudaMemset(d_result, 0, sizeof(float)));  // accumulator must start at 0
    if (includeTransfer) {
        CHECK_CUDA_ERROR(cudaMemcpy(d_a, a, n * sizeof(float), cudaMemcpyHostToDevice));
        CHECK_CUDA_ERROR(cudaMemcpy(d_b, b, n * sizeof(float), cudaMemcpyHostToDevice));
    }
    int blockSize = 256;  // power of two — required by the kernel's tree reduction
    int gridSize = (n + blockSize - 1) / blockSize;
    cudaEvent_t start, stop;
    CHECK_CUDA_ERROR(cudaEventCreate(&start));
    CHECK_CUDA_ERROR(cudaEventCreate(&stop));
    CHECK_CUDA_ERROR(cudaEventRecord(start));
    // Third launch argument: dynamic shared memory for the per-block reduction
    vectorDotKernel<<<gridSize, blockSize, blockSize * sizeof(float)>>>(d_a, d_b, d_result, n);
    CHECK_LAST_CUDA_ERROR();  // launches return no status; surface config errors here
    CHECK_CUDA_ERROR(cudaEventRecord(stop));
    CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
    float compute_time = 0;
    CHECK_CUDA_ERROR(cudaEventElapsedTime(&compute_time, start, stop));
    if (includeTransfer) {
        CHECK_CUDA_ERROR(cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost));
    }
    (void)h_result;  // fetched for completeness; the benchmark reports only time
    auto total_end = std::chrono::high_resolution_clock::now();
    float total_time = std::chrono::duration<float, std::milli>(total_end - total_start).count();
    // Report timings
    if (includeTransfer) {
        printf(" GPU计算时间: %.3f ms, 总时间(含传输): %.3f ms, 传输时间: %.3f ms\n",
               compute_time, total_time, total_time - compute_time);
    } else {
        printf(" GPU计算时间: %.3f ms\n", compute_time);
    }
    CHECK_CUDA_ERROR(cudaEventDestroy(start));
    CHECK_CUDA_ERROR(cudaEventDestroy(stop));
    CHECK_CUDA_ERROR(cudaFree(d_a));
    CHECK_CUDA_ERROR(cudaFree(d_b));
    CHECK_CUDA_ERROR(cudaFree(d_result));
    return includeTransfer ? total_time : compute_time;
}
// GPU matrix-multiply benchmark wrapper for row-major N x N matrices
// (naive kernel with extra transcendental terms, matching matrixMulCPU).
// includeTransfer=true:  copies A and B up and C back; returns total
//                        wall-clock time in ms.
// includeTransfer=false: timing-only mode — the kernel runs on uninitialized
//                        device buffers and C is NOT copied back; returns
//                        kernel-only time in ms.
float matrixMulGPU(float* A, float* B, float* C, int N, bool includeTransfer) {
    float *d_A, *d_B, *d_C;
    size_t size = N * N * sizeof(float);
    CHECK_CUDA_ERROR(cudaMalloc(&d_A, size));
    CHECK_CUDA_ERROR(cudaMalloc(&d_B, size));
    CHECK_CUDA_ERROR(cudaMalloc(&d_C, size));
    auto total_start = std::chrono::high_resolution_clock::now();
    if (includeTransfer) {
        CHECK_CUDA_ERROR(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
        CHECK_CUDA_ERROR(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));
    }
    // 16x16 = 256 threads per block; ceil-div grid covers N in both dimensions
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    cudaEvent_t start, stop;
    CHECK_CUDA_ERROR(cudaEventCreate(&start));
    CHECK_CUDA_ERROR(cudaEventCreate(&stop));
    CHECK_CUDA_ERROR(cudaEventRecord(start));
    matrixMulKernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    CHECK_LAST_CUDA_ERROR();  // launches return no status; surface config errors here
    CHECK_CUDA_ERROR(cudaEventRecord(stop));
    CHECK_CUDA_ERROR(cudaEventSynchronize(stop));
    float compute_time = 0;
    CHECK_CUDA_ERROR(cudaEventElapsedTime(&compute_time, start, stop));
    if (includeTransfer) {
        CHECK_CUDA_ERROR(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));
    }
    auto total_end = std::chrono::high_resolution_clock::now();
    float total_time = std::chrono::duration<float, std::milli>(total_end - total_start).count();
    // Report timings
    if (includeTransfer) {
        printf(" GPU计算时间: %.3f ms, 总时间(含传输): %.3f ms, 传输时间: %.3f ms\n",
               compute_time, total_time, total_time - compute_time);
    } else {
        printf(" GPU计算时间: %.3f ms\n", compute_time);
    }
    CHECK_CUDA_ERROR(cudaEventDestroy(start));
    CHECK_CUDA_ERROR(cudaEventDestroy(stop));
    CHECK_CUDA_ERROR(cudaFree(d_A));
    CHECK_CUDA_ERROR(cudaFree(d_B));
    CHECK_CUDA_ERROR(cudaFree(d_C));
    return includeTransfer ? total_time : compute_time;
}
最后是头文件cuda_utils.h,主要作用是检查核函数是否执行错误和CUDA错误检查宏。
#ifndef CUDA_UTILS_H
#define CUDA_UTILS_H
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// CUDA error-checking macro: evaluates a runtime API call once; on failure
// prints the call site (file:line) and the error string to stderr, then
// terminates the process.
#define CHECK_CUDA_ERROR(call) \
do { \
cudaError_t err = (call); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at %s:%d - %s\n", \
__FILE__, __LINE__, cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Kernel-launch error check: kernel launches return no status, so call this
// right after a <<<...>>> launch; cudaGetLastError() reads and clears the
// sticky error set by a failed launch.
#define CHECK_LAST_CUDA_ERROR() \
do { \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA kernel error: %s\n", cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while (0)
#endif // CUDA_UTILS_H
4、编译与运行
保存上述代码后,其目录如下
然后就是生成和编译代码:
其结果如下图。