BM1688 model performance benchmarking tool
Performance can be profiled under the following mode combinations:
- 1C1T1C: 1-core bmodel, 1 thread, running alone on core 0
- 1C2T1C: 1-core bmodel, 2 threads, both running on core 0
- 1C2T2C: 1-core bmodel, 2 threads, one on core 0 and one on core 1
- 1C4T2C: 1-core bmodel, 4 threads, split across core 0 and core 1, two threads per core
- 2C1T2C: 2-core bmodel, 1 thread, running alone on cores 0 and 1
- 2C2T2C: 2-core bmodel, 2 threads, both running on cores 0 and 1
The 1C2T2C and 1C4T2C modes saturate the chip and yield the model's maximum throughput.
Usage
Copy the binary onto a BM1688 box and run it there; the runtime dependencies should already be present on the device.
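If you do need to rebuild it on the device, a compile command along these lines should work (a sketch only: the libsophon install path and flags below are assumptions, and concurrentqueue.h is the header-only moodycamel ConcurrentQueue, which just needs to be on the include path):
g++ -std=c++11 -O2 bm_run_model2.cpp -o bm_run_model2 \
    -I/opt/sophon/libsophon-current/include \
    -L/opt/sophon/libsophon-current/lib \
    -lbmrt -lbmlib -lpthread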
# num_samples is the number of samples used for the test, at most 1000
# mode selects one of the six modes above: 0=1C1T1C, 1=1C2T1C, 2=1C2T2C, 3=1C4T2C, 4=2C1T2C, 5=2C2T2C
./bm_run_model2 model_path mode num_samples
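For example, to stress a 1-core bmodel with two threads pinned to two separate cores (mode 2) over 1000 samples (the model filename here is just a placeholder):
./bm_run_model2 yolov5s_int8_1core.bmodel 2 1000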
Code
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <thread>
#include <vector>
#include "concurrentqueue.h"
#include "bmruntime.h"

using namespace moodycamel;
typedef struct {
  uint64_t count{0};
  float preprocess_time{0};  // us
  float inference_time{0};   // us
  float get_output_time{0};  // us
  float total_time() const { return preprocess_time + inference_time + get_output_time; }
} counter_t;
void infer(bm_handle_t bm_handle, std::string bmodel, int* core_list, int core_num, int thread_id,
           ConcurrentQueue<int>& queue, counter_t& counter) {
  // each worker thread creates its own runtime instance on the shared device handle
  void *p_bmrt = bmrt_create(bm_handle);
  if (NULL == p_bmrt) {
    printf("bmrt_create failed\n");
    return;
  }
  bool success = bmrt_load_bmodel(p_bmrt, bmodel.c_str());
  if (!success) {
    printf("bmrt_load_bmodel failed\n");
    return;
  }
  const char **net_names = NULL;
  int net_num = bmrt_get_network_number(p_bmrt);
  if (net_num > 1) {
    printf("net_num > 1\n");
    return;
  }
  bmrt_get_network_names(p_bmrt, &net_names);
  const char* net_name = net_names[0];
  auto net_info = bmrt_get_network_info(p_bmrt, net_name);
  if (!net_info) {
    printf("bmrt_get_network_info failed\n");
    return;
  }
  bmrt_print_network_info(net_info);
  if (net_info->core_num != core_num) {
    printf("core_num[%d] != net_info->core_num[%d]\n", core_num, net_info->core_num);
    return;
  }
  const int input_num = net_info->input_num;
  const int output_num = net_info->output_num;
  const int stage_num = net_info->stage_num;
  assert(stage_num == 1);
  bm_tensor_t input_tensors[input_num];
  for (int i = 0; i < input_num; ++i) {
    bm_status_t status = bm_malloc_device_byte(bm_handle, &input_tensors[i].device_mem, net_info->max_input_bytes[i]);
    assert(BM_SUCCESS == status);
    input_tensors[i].st_mode = BM_STORE_1N;
    input_tensors[i].dtype = net_info->input_dtypes[i];
    input_tensors[i].shape.num_dims = net_info->stages[0].input_shapes[i].num_dims;
    for (int d = 0; d < net_info->stages[0].input_shapes[i].num_dims; ++d)
      input_tensors[i].shape.dims[d] = net_info->stages[0].input_shapes[i].dims[d];
  }
  void* outputs[128]{};
  bm_tensor_t output_tensors[output_num];
  for (int i = 0; i < output_num; ++i) {
    bm_status_t status = bm_malloc_device_byte(bm_handle, &output_tensors[i].device_mem, net_info->max_output_bytes[i]);
    assert(BM_SUCCESS == status);
    output_tensors[i].st_mode = BM_STORE_1N;
    output_tensors[i].dtype = net_info->output_dtypes[i];
    output_tensors[i].shape.num_dims = net_info->stages[0].output_shapes[i].num_dims;
    for (int d = 0; d < net_info->stages[0].output_shapes[i].num_dims; ++d)
      output_tensors[i].shape.dims[d] = net_info->stages[0].output_shapes[i].dims[d];
    // host-side (system memory) buffer for copying results back
    outputs[i] = malloc(net_info->max_output_bytes[i]);
  }
  bool user_mem = true;
  bool user_stmode = false;
  int flag;
  while (1) {
    if (!queue.try_dequeue(flag))
      break;
    auto t0 = std::chrono::high_resolution_clock::now();
    success = bmrt_launch_tensor_multi_cores(p_bmrt, net_name, input_tensors, input_num,
                                             output_tensors, output_num, user_mem, user_stmode,
                                             core_list, core_num);
    // sync: wait for the inference to finish on every core in core_list
    for (int c = 0; c < core_num; ++c)
      bm_thread_sync_from_core(bm_handle, core_list[c]);
    std::chrono::duration<float, std::micro> tp0 = std::chrono::high_resolution_clock::now() - t0;
    counter.inference_time += tp0.count();
    printf("[Thread%d] inference time: %.3fms\n", thread_id, tp0.count() / 1000.0f);
    counter.count++;
  }
  /**************************************************************/
  // all output info (data type, shape, device_mem) is stored in output_tensors;
  // data can be copied back to system memory like this.
  for (uint64_t k = 0; k < counter.count; ++k) {
    auto t0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < output_num; ++i) {
      bm_memcpy_d2s_partial(bm_handle, outputs[i], output_tensors[i].device_mem, bmrt_tensor_bytesize(&output_tensors[i]));
    }
    std::chrono::duration<float, std::micro> tp0 = std::chrono::high_resolution_clock::now() - t0;
    counter.get_output_time += tp0.count();
  }
  printf("[Thread%d] samples: %lu ave_inference_time: %.3fms get_output_time: %.3fms\n",
         thread_id, (unsigned long)counter.count, counter.inference_time / 1000.0f / counter.count,
         counter.get_output_time / 1000.0f / counter.count);
  free(net_names);
  // finally, free device memory and destroy the runtime instance
  for (int i = 0; i < input_num; ++i) {
    bm_free_device(bm_handle, input_tensors[i].device_mem);
  }
  for (int i = 0; i < output_num; ++i) {
    bm_free_device(bm_handle, output_tensors[i].device_mem);
    free(outputs[i]);
  }
  bmrt_destroy(p_bmrt);
}
int main(int argc, char *argv[]) {
  if (argc != 4) {
    printf("input param num(%d) must be == 4,"
           "\n\t1 - model file path, 2 - mode, 3 - num_samples"
           "\n\t\t mode - 0, 1C1T1C"
           "\n\t\t mode - 1, 1C2T1C"
           "\n\t\t mode - 2, 1C2T2C"
           "\n\t\t mode - 3, 1C4T2C"
           "\n\t\t mode - 4, 2C1T2C"
           "\n\t\t mode - 5, 2C2T2C\n", argc);
    return -1;
  }
  std::string bmodel = argv[1];
  int mode = atoi(argv[2]);
  int num_samples = atoi(argv[3]);
  int dev_id = 0;  // on SoC there is only device 0
  std::vector<int> core_list0;
  std::vector<int> core_list1;
  std::vector<counter_t> counters;
  std::vector<std::thread> vt;
  ConcurrentQueue<int> queue(num_samples);
  for (int n = 0; n < num_samples; ++n) {
    if (!queue.try_enqueue(n)) {
      printf("Failed to enqueue\n");
      return -1;
    }
  }
  bm_handle_t bm_handle;
  bm_status_t status = bm_dev_request(&bm_handle, dev_id);
  if (BM_SUCCESS != status) {
    printf("bm_dev_request failed\n");
    return -1;
  }
  printf("set device id: %d\n", dev_id);
  core_list0.clear();
  core_list1.clear();
  vt.clear();
  if (mode == 0) {         // 1C1T1C
    counters.resize(1);
    core_list0 = {0};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 0, std::ref(queue), std::ref(counters[0]));
  } else if (mode == 1) {  // 1C2T1C
    counters.resize(2);
    core_list0 = {0};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 0, std::ref(queue), std::ref(counters[0]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 1, std::ref(queue), std::ref(counters[1]));
  } else if (mode == 2) {  // 1C2T2C
    counters.resize(2);
    core_list0 = {0};
    core_list1 = {1};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 0, std::ref(queue), std::ref(counters[0]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list1.data(), 1, 1, std::ref(queue), std::ref(counters[1]));
  } else if (mode == 3) {  // 1C4T2C
    counters.resize(4);
    core_list0 = {0};
    core_list1 = {1};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 0, std::ref(queue), std::ref(counters[0]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list1.data(), 1, 1, std::ref(queue), std::ref(counters[1]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 2, std::ref(queue), std::ref(counters[2]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list1.data(), 1, 3, std::ref(queue), std::ref(counters[3]));
  } else if (mode == 4) {  // 2C1T2C
    counters.resize(1);
    core_list0 = {0, 1};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 2, 0, std::ref(queue), std::ref(counters[0]));
  } else if (mode == 5) {  // 2C2T2C
    counters.resize(2);
    core_list0 = {0, 1};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 2, 0, std::ref(queue), std::ref(counters[0]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 2, 1, std::ref(queue), std::ref(counters[1]));
  } else {
    printf("unsupported mode: %d\n", mode);
    return -1;
  }
  for (auto &t : vt)
    t.join();
  // find the slowest thread; its timings bound the overall throughput
  float max_span = -1;
  size_t max_idx = 0;
  for (size_t n = 0; n < counters.size(); ++n) {
    if (counters[n].total_time() > max_span) {
      max_span = counters[n].total_time();
      max_idx = n;
    }
  }
  printf("ave_inference_time: %.3fms fps: %.3f get_output_time: %.3fms\n",
         counters[max_idx].inference_time / 1000.0f / num_samples,
         num_samples * 1e6 / counters[max_idx].inference_time,
         counters[max_idx].get_output_time / 1000.0f / num_samples);
  bm_dev_free(bm_handle);
  return 0;
}