目录
一、前言
YOLOv5-Lite项目:https://github.com/ppogg/YOLOv5-Lite
使用YOLOv5-Lite在树莓派4b上部署车辆检测模型(一)——UA-DETRAC车辆检测数据集的处理-CSDN博客
使用YOLOv5-Lite在树莓派4b上部署车辆检测模型(二)——使用数据集训练模型-CSDN博客
在前面的两篇文章中,我们处理好了UA-DETRAC数据集,并使用这个数据集训练了v5Lite-e模型。现在我们有了训练出的pt格式的权重文件,暂且称为v5Lite-e.pt,接下来需要将这个权重文件转换格式,然后在树莓派4B上部署。
二、树莓派环境配置
系统镜像推荐使用官方下载器Raspberry Pi Imager,下载网址:https://www.raspberrypi.com/software/ 系统选择raspbian,不同大版本的raspbian环境有微妙的区别,建议选择64位的bullseye版本系统。官方镜像源网站:
https://www.raspberrypi.com/software/operating-systems/#raspberry-pi-os-legacy-64-bit 烧录好镜像后,在树莓派上执行下面的命令安装依赖:
sudo apt-get install libprotobuf-dev protobuf-compiler
sudo apt-get install cmake
sudo apt-get install libopencv-dev
如果要使用摄像头,执行sudo raspi-config打开配置面板,激活摄像头。
三、MNN框架编译及模型转换
MNN框架选择2.7.1版本,执行下面命令进行编译:
mkdir build
cd build
cmake .. -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TOOL=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON -DMNN_SUPPORT_BF16=ON -DMNN_ARM82=ON -DMNN_BUILD_OPENCV=ON -DMNN_USE_OPENCV=ON
make -j
cmake的参数介绍见MNN项目wiki:https://github.com/alibaba/MNN/wiki/cmake#%E7%BC%96%E8%AF%91%E5%AE%8F%E4%BB%8B%E7%BB%8D
编译过程可能会报错:
/home/pi/MNN-2.7.1/source/backend/cpu/arm/arm64/bf16/ARMV86_MNNPackedMatMulRemain_BF16.S:158: Fatal error: macros nested too deeply
这是因为源码中的汇编宏嵌套过深,导致编译时定义的宏无法展开,需要手动将报错文件中的FMAX和FMIN这两个宏展开。
将ARMV86_MNNPackedMatMul_BF16.S、ARMV86_MNNPackedMatMulRemain_BF16.S这两个文件的内容替换为下面给出的即可解决报错。
ARMV86_MNNPackedMatMul_BF16.S
//
// ARMV86_MNNPackedMatMul_BF16.S
// MNN
//
// Created by MNN on 2022/10/09.
// Copyright © 2018-2021 Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
// Zero four 128-bit NEON registers: every 32-bit lane of each
// destination is cleared. Used to initialize fp32 accumulators
// when no bias is supplied.
.macro SET_ZERO dst0, dst1, dst2, dst3
    movi \dst0\().4s, #0
    movi \dst1\().4s, #0
    movi \dst2\().4s, #0
    movi \dst3\().4s, #0
.endm
// In-place convert four vectors of fp32 to bf16 by narrowing each
// 32-bit lane to its top 16 bits (shrn #16). This is truncation,
// not round-to-nearest; the four bf16 results land in the low .4h
// half of each register.
.macro Float32ToBf16 a, b, c, d
    shrn \a\().4h, \a\().4s, #16
    shrn \b\().4h, \b\().4s, #16
    shrn \c\().4h, \c\().4s, #16
    shrn \d\().4h, \d\().4s, #16
.endm
// Broadcast-copy one source vector into four destination registers
// (full 128-bit copies via the .16b view). Used to seed accumulator
// registers with a pre-loaded bias value.
.macro SET_BIAS src, dst0, dst1, dst2, dst3
    mov \dst0\().16b, \src\().16b
    mov \dst1\().16b, \src\().16b
    mov \dst2\().16b, \src\().16b
    mov \dst3\().16b, \src\().16b
.endm
// 12 * 8 * 4 MatMul
asm_function ARMV86_MNNPackedMatMul_BF16
//void ARMV86_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias);
// x0: C, x1:A, x2:B, x3:parameter, x4: postParameters, x5:bias
stp d14, d15, [sp, #-64]!
stp d12, d13, [sp, #16]
stp d10, d11, [sp, #32]
stp d8, d9, [sp, #48]
//ldr x8, [x3, #0] // deprecated
ldr x9, [x3, #8] // l
ldr x10, [x3, #16] // h
mov x11, #64 // B_stride = LP * HP = 4 * 8 * sizeof(int16_t)
ldr x13, [x3, #24] // cStride
ldr x7, [x3, #40] // bExtraStride
add x10, x10, #3
lsr x10, x10, #2
add x9, x9, #3
lsr x9, x9, #2
cbz x4, Start
ld1 {v5.4s}, [x4]
mov w17, v5.s[2] // min value
mov w18, v5.s[3] // max value
Start:
cmp x10, #2
blt LH4
LH8:
sub x14, x13, #96 // cStride - 96
LoopH:
mov x15, x1
mov x12, x9
cbz x5, NoBiasH8
ld1 {v0.4h, v1.4h}, [x5], #16 // 8 * sizeof(int16_t)
shll v0.4s, v0.4h, #16
shll v1.4s, v1.4h, #16
mov v2.16b, v0.16b
mov v3.16b, v1.16b
uzp1 v18.2d, v0.2d, v2.2d // bias_0, bias_1, bias_0, bias_1
uzp2 v19.2d, v0.2d, v2.2d // bias_2, bias_3, bias_2, bias_3
uzp1 v30.2d, v1.2d, v3.2d // bias_0, bias_1, bias_0, bias_1
uzp2 v31.2d, v1.2d, v3.2d // bias_2, bias_3, bias_2, bias_3
SET_BIAS v18, v8, v10, v12, v14
mov v16.16b, v18.16b
SET_BIAS v19, v9, v11, v13, v15
mov v17.16b, v19.16b
SET_BIAS v30, v20, v22, v24, v26
mov v28.16b, v30.16b
SET_BIAS v31, v21, v23, v25, v27
mov v29.16b, v31.16b
b LoopL
NoBiasH8:
SET_ZERO v8, v9, v10, v11
SET_ZERO v12, v13, v14, v15
SET_ZERO v16, v17, v18, v19
SET_ZERO v20, v21, v22, v23
SET_ZERO v24, v25, v26, v27
SET_ZERO v28, v29, v30, v31
LoopL:
// A [12, 4, bf16] : rn = 6 : v2 - v7
// B [ 8, 4, bf16] : rn = 2 : v0 - v1
// C [12, 8, fp32] : rn = 24 : v8 - v31
ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x15], #64 // A: 8 * 4 * sizeof(int16_t)
ld1 {v6.8h, v7.8h}, [x15], #32 // A: 4 * 4 * sizeof(int16_t)
ld1 {v0.8h, v1.8h}, [x2], #32 // B: 4 * 4 * sizeof(int16_t)
.inst 0x6e40ec48 // bfmmla v8.4s, v2.8h, v0.8h
.inst 0x6e41ec49 // bfmmla v9.4s, v2.8h, v1.8h
.inst 0x6e40ec6a // bfmmla v10.4s, v3.8h, v0.8h
.inst 0x6e41ec6b // bfmmla v11.4s, v3.8h, v1.8h
.inst 0x6e40ec8c // bfmmla v12.4s, v4.8h, v0.8h
.inst 0x6e41ec8d // bfmmla v13.4s, v4.8h, v1.8h
.inst 0x6e40ecae // bfmmla v14.4s, v5.8h, v0.8h
.inst 0x6e41ecaf // bfmmla v15.4s, v5.8h, v1.8h
.inst 0x6e40ecd0 // bfmmla v16.4s, v6.8h, v0.8h
.inst 0x6e41ecd1 // bfmmla v17.4s, v6.8h, v1.8h
.inst 0x6e40ecf2 // bfmmla v18.4s, v7.8h, v0.8h
.inst 0x6e41ecf3 // bfmmla v19.4s, v7.8h, v1.8h
ld1 {v0.8h, v1.8h}, [x2], #32 // B: 4 * 4 * sizeof(int16_t)
.inst 0x6e40ec54 // bfmmla v20.4s, v2.8h, v0.8h
.inst 0x6e41ec55 // bfmmla v21.4s, v2.8h, v1.8h
.inst 0x6e40ec76 // bfmmla v22.4s, v3.8h, v0.8h
.inst 0x6e41ec77 // bfmmla v23.4s, v3.8h, v1.8h
.inst 0x6e40ec98 // bfmmla v24.4s, v4.8h, v0.8h
.inst 0x6e41ec99 // bfmmla v25.4s, v4.8h, v1.8h
.inst 0x6e40ecba // bfmmla v26.4s, v5.8h, v0.8h
.inst 0x6e41ecbb // bfmmla v27.4s, v5.8h, v1.8h
.inst 0x6e40ecdc // bfmmla v28.4s, v6.8h, v0.8h
.i