BM1688 model performance benchmarking tool
Performance can be profiled under the following mode combinations:
- 1C1T1C: 1-core bmodel, 1 thread, running alone on core 0
- 1C2T1C: 1-core bmodel, 2 threads, both running on core 0
- 1C2T2C: 1-core bmodel, 2 threads, one on core 0 and one on core 1
- 1C4T2C: 1-core bmodel, 4 threads, split across core 0 and core 1, two threads per core
- 2C1T2C: 2-core bmodel, 1 thread, running alone on cores 0 and 1
- 2C2T2C: 2-core bmodel, 2 threads, both running on cores 0 and 1
The 1C2T2C and 1C4T2C modes saturate the chip and yield the model's maximum throughput.
Usage
Copy the binary onto a BM1688 box and run it there; the runtime dependencies should already be present on the device.
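If you do need to rebuild it on the device, a compile command along these lines should work (a sketch only: the libsophon install path and flags below are assumptions, and concurrentqueue.h is the header-only moodycamel ConcurrentQueue, which just needs to be on the include path):
g++ -std=c++11 -O2 bm_run_model2.cpp -o bm_run_model2 \
    -I/opt/sophon/libsophon-current/include \
    -L/opt/sophon/libsophon-current/lib \
    -lbmrt -lbmlib -lpthread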
# num_samples is the number of samples used for the test, at most 1000
# mode selects one of the six modes above: 0=1C1T1C, 1=1C2T1C, 2=1C2T2C, 3=1C4T2C, 4=2C1T2C, 5=2C2T2C
./bm_run_model2 model_path mode num_samples
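For example, to stress a 1-core bmodel with two threads pinned to two separate cores (mode 2) over 1000 samples (the model filename here is just a placeholder):
./bm_run_model2 yolov5s_int8_1core.bmodel 2 1000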
Code
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <thread>
#include <vector>
#include "concurrentqueue.h"
#include "bmruntime.h"

using namespace moodycamel;
typedef struct {
  uint64_t count{0};
  float preprocess_time{0};  // us
  float inference_time{0};   // us
  float get_output_time{0};  // us
  float total_time() const { return preprocess_time + inference_time + get_output_time; }
} counter_t;
void infer(bm_handle_t bm_handle, std::string bmodel, int* core_list, int core_num, int thread_id,
           ConcurrentQueue<int>& queue, counter_t& counter) {
  // each worker thread creates its own runtime instance on the shared device handle
  void *p_bmrt = bmrt_create(bm_handle);
  if (NULL == p_bmrt) {
    printf("bmrt_create failed\n");
    return;
  }
  bool success = bmrt_load_bmodel(p_bmrt, bmodel.c_str());
  if (!success) {
    printf("bmrt_load_bmodel failed\n");
    return;
  }
  const char **net_names = NULL;
  int net_num = bmrt_get_network_number(p_bmrt);
  if (net_num > 1) {
    printf("net_num > 1\n");
    return;
  }
  bmrt_get_network_names(p_bmrt, &net_names);
  const char* net_name = net_names[0];
  auto net_info = bmrt_get_network_info(p_bmrt, net_name);
  if (!net_info) {
    printf("bmrt_get_network_info failed\n");
    return;
  }
  bmrt_print_network_info(net_info);
  if (net_info->core_num != core_num) {
    printf("core_num[%d] != net_info->core_num[%d]\n", core_num, net_info->core_num);
    return;
  }
  const int input_num = net_info->input_num;
  const int output_num = net_info->output_num;
  const int stage_num = net_info->stage_num;
  assert(stage_num == 1);
  bm_tensor_t input_tensors[input_num];
  for (int i = 0; i < input_num; ++i) {
    bm_status_t status = bm_malloc_device_byte(bm_handle, &input_tensors[i].device_mem, net_info->max_input_bytes[i]);
    assert(BM_SUCCESS == status);
    input_tensors[i].st_mode = BM_STORE_1N;
    input_tensors[i].dtype = net_info->input_dtypes[i];
    input_tensors[i].shape.num_dims = net_info->stages[0].input_shapes[i].num_dims;
    for (int d = 0; d < net_info->stages[0].input_shapes[i].num_dims; ++d)
      input_tensors[i].shape.dims[d] = net_info->stages[0].input_shapes[i].dims[d];
  }
  void* outputs[128]{};
  bm_tensor_t output_tensors[output_num];
  for (int i = 0; i < output_num; ++i) {
    bm_status_t status = bm_malloc_device_byte(bm_handle, &output_tensors[i].device_mem, net_info->max_output_bytes[i]);
    assert(BM_SUCCESS == status);
    output_tensors[i].st_mode = BM_STORE_1N;
    output_tensors[i].dtype = net_info->output_dtypes[i];
    output_tensors[i].shape.num_dims = net_info->stages[0].output_shapes[i].num_dims;
    for (int d = 0; d < net_info->stages[0].output_shapes[i].num_dims; ++d)
      output_tensors[i].shape.dims[d] = net_info->stages[0].output_shapes[i].dims[d];
    // host-side (system memory) buffer for copying results back
    outputs[i] = malloc(net_info->max_output_bytes[i]);
  }
  bool user_mem = true;
  bool user_stmode = false;
  int flag;
  while (1) {
    if (!queue.try_dequeue(flag))
      break;
    auto t0 = std::chrono::high_resolution_clock::now();
    success = bmrt_launch_tensor_multi_cores(p_bmrt, net_name, input_tensors, input_num,
                                             output_tensors, output_num, user_mem, user_stmode,
                                             core_list, core_num);
    // sync: wait for the inference to finish on every core in core_list
    for (int c = 0; c < core_num; ++c)
      bm_thread_sync_from_core(bm_handle, core_list[c]);
    std::chrono::duration<float, std::micro> tp0 = std::chrono::high_resolution_clock::now() - t0;
    counter.inference_time += tp0.count();
    printf("[Thread%d] inference time: %.3fms\n", thread_id, tp0.count() / 1000.0f);
    counter.count++;
  }
  /**************************************************************/
  // all output info (data type, shape, device_mem) is stored in output_tensors;
  // data can be copied back to system memory like this.
  for (uint64_t k = 0; k < counter.count; ++k) {
    auto t0 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < output_num; ++i) {
      bm_memcpy_d2s_partial(bm_handle, outputs[i], output_tensors[i].device_mem, bmrt_tensor_bytesize(&output_tensors[i]));
    }
    std::chrono::duration<float, std::micro> tp0 = std::chrono::high_resolution_clock::now() - t0;
    counter.get_output_time += tp0.count();
  }
  printf("[Thread%d] samples: %lu ave_inference_time: %.3fms get_output_time: %.3fms\n",
         thread_id, (unsigned long)counter.count, counter.inference_time / 1000.0f / counter.count,
         counter.get_output_time / 1000.0f / counter.count);
  free(net_names);
  // finally, free device memory and destroy the runtime instance
  for (int i = 0; i < input_num; ++i) {
    bm_free_device(bm_handle, input_tensors[i].device_mem);
  }
  for (int i = 0; i < output_num; ++i) {
    bm_free_device(bm_handle, output_tensors[i].device_mem);
    free(outputs[i]);
  }
  bmrt_destroy(p_bmrt);
}
int main(int argc, char *argv[]) {
  if (argc != 4) {
    printf("input param num(%d) must be == 4,"
           "\n\t1 - model file path, 2 - mode, 3 - num_samples"
           "\n\t\t mode - 0, 1C1T1C"
           "\n\t\t mode - 1, 1C2T1C"
           "\n\t\t mode - 2, 1C2T2C"
           "\n\t\t mode - 3, 1C4T2C"
           "\n\t\t mode - 4, 2C1T2C"
           "\n\t\t mode - 5, 2C2T2C\n", argc);
    return -1;
  }
  std::string bmodel = argv[1];
  int mode = atoi(argv[2]);
  int num_samples = atoi(argv[3]);
  int dev_id = 0;  // on SoC there is only device 0
  std::vector<int> core_list0;
  std::vector<int> core_list1;
  std::vector<counter_t> counters;
  std::vector<std::thread> vt;
  ConcurrentQueue<int> queue(num_samples);
  for (int n = 0; n < num_samples; ++n) {
    if (!queue.try_enqueue(n)) {
      printf("Failed to enqueue\n");
      return -1;
    }
  }
  bm_handle_t bm_handle;
  bm_status_t status = bm_dev_request(&bm_handle, dev_id);
  if (BM_SUCCESS != status) {
    printf("bm_dev_request failed\n");
    return -1;
  }
  printf("set device id: %d\n", dev_id);
  core_list0.clear();
  core_list1.clear();
  vt.clear();
  if (mode == 0) {         // 1C1T1C
    counters.resize(1);
    core_list0 = {0};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 0, std::ref(queue), std::ref(counters[0]));
  } else if (mode == 1) {  // 1C2T1C
    counters.resize(2);
    core_list0 = {0};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 0, std::ref(queue), std::ref(counters[0]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 1, std::ref(queue), std::ref(counters[1]));
  } else if (mode == 2) {  // 1C2T2C
    counters.resize(2);
    core_list0 = {0};
    core_list1 = {1};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 0, std::ref(queue), std::ref(counters[0]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list1.data(), 1, 1, std::ref(queue), std::ref(counters[1]));
  } else if (mode == 3) {  // 1C4T2C
    counters.resize(4);
    core_list0 = {0};
    core_list1 = {1};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 0, std::ref(queue), std::ref(counters[0]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list1.data(), 1, 1, std::ref(queue), std::ref(counters[1]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 1, 2, std::ref(queue), std::ref(counters[2]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list1.data(), 1, 3, std::ref(queue), std::ref(counters[3]));
  } else if (mode == 4) {  // 2C1T2C
    counters.resize(1);
    core_list0 = {0, 1};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 2, 0, std::ref(queue), std::ref(counters[0]));
  } else if (mode == 5) {  // 2C2T2C
    counters.resize(2);
    core_list0 = {0, 1};
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 2, 0, std::ref(queue), std::ref(counters[0]));
    vt.emplace_back(infer, bm_handle, bmodel, core_list0.data(), 2, 1, std::ref(queue), std::ref(counters[1]));
  } else {
    printf("unsupported mode: %d\n", mode);
    return -1;
  }
  for (auto &t : vt)
    t.join();
  // find the slowest thread; its timings bound the overall throughput
  float max_span = -1;
  size_t max_idx = 0;
  for (size_t n = 0; n < counters.size(); ++n) {
    if (counters[n].total_time() > max_span) {
      max_span = counters[n].total_time();
      max_idx = n;
    }
  }
  printf("ave_inference_time: %.3fms fps: %.3f get_output_time: %.3fms\n",
         counters[max_idx].inference_time / 1000.0f / num_samples,
         num_samples * 1e6 / counters[max_idx].inference_time,
         counters[max_idx].get_output_time / 1000.0f / num_samples);
  bm_dev_free(bm_handle);
  return 0;
}