Intel AVX测试效果，跟编译参数相关-CSDN博客

本文链接：https://blog.csdn.net/anlongstar/article/details/148992132

1）首先上代码

#include <assert.h>
#include <immintrin.h>

#include <cfloat>
#include <ctime>
#include <iomanip>
#include <iostream>


using namespace std;

// Pointer to a function that maps one 256-bit vector of doubles to another;
// main() uses it to hold the integrand.
using integrable = __m256d (*)(__m256d);

// Integrand f(x) = x^2, computed lane-wise with a single AVX multiply.
static __m256d parabola(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}


int main(int argc, char *argv[])
{
    const unsigned long long steps = 100000000;
    const unsigned int bytes_per_register = 32;
    double a = 0;
    double b = 1;
    double h = (b - a) / (steps * 1.0);
    __m256d sum = _mm256_setzero_pd();
    double sum2 = 0.0;


    clock_t begin = clock();
    const unsigned int vals_per_register = bytes_per_register / sizeof(double);
    assert(vals_per_register == 4);
    const unsigned int total_blocks = steps / vals_per_register;
    double x2[4];
    for (unsigned int i = 0; i <= total_blocks; i++)
    {
        for (int j = 0; j < vals_per_register; j++) {
            x2[j] = a + (i * vals_per_register + j) * h;
            sum2 = sum2 + x2[j]*x2[j];
        }
    }
    double total_sum = 0;
    total_sum = sum2 * h;
    cout << fixed << setprecision(DBL_DIG) << total_sum << endl;
    clock_t end = clock();
    cout << fixed << setprecision(DBL_DIG) << "ori =" << double(end - begin) / (1.0 * CLOCKS_PER_SEC) << " seconds." << endl;



    integrable f = parabola;
    begin = clock();
    //vals_per_register = bytes_per_register / sizeof(double);
    assert(vals_per_register == 4);
    //total_blocks = steps / vals_per_register;
    __m256d x;
    for (unsigned int i = 0; i <= total_blocks; i++)
    {
        for (int j = 0; j < vals_per_register; j++) x[j] = a + (i * vals_per_register + j) * h;
        sum = _mm256_add_pd(sum, f(x));
    }
    total_sum = 0;
    for (int i = 0; i < vals_per_register; i++) total_sum += sum[i];
    total_sum *= h;
    cout << fixed << setprecision(DBL_DIG) << total_sum << endl;
    end = clock();
    cout << fixed << setprecision(DBL_DIG) << double(end - begin) / (1.0 * CLOCKS_PER_SEC) << " seconds." << endl;
    return 0;
}

2）代码分析

上面代码是实际求和操作。

我添加了27-43行的内容（即串行求和及其计时部分），因为前面Github上的代码没有串行实现的结果可供对比。

3）-O0编译命令及结果

原来使用Cmake，make搞得很复杂。

实际上使用类似这样的命令就行

clang++ -mavx -O0 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example

-mavx使用AVX

-O0表示O0级优化

运行结果：

0.333333368333224
ori =0.797198000000000 seconds.
0.333333368333363
1.199447000000000 seconds.

可见AVX比串行还慢。

4）-O1编译命令及结果

clang++ -mavx -O1 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143298000000000 seconds.
0.333333368333363
0.796904000000000 seconds.

-O1有效果，对Ori提升更加明显

5）-O2编译命令及结果

clang++ -mavx -O2 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143289000000000 seconds.
0.333333368333363
0.068238000000000 seconds.

-O2就达到了我们想要的效果

6）-O3编译命令及结果：

clang++ -mavx -O3 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143300000000000 seconds.
0.333333368333363
0.068255000000000 seconds.

-O3和-O2效果一样

7）不加 -mavx 会报错：

clang++ -O3 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
main.cpp:25:19: error: always_inline function '_mm256_setzero_pd' requires target feature 'avx', but would be inlined into function 'main' that is compiled without support for 'avx'
__m256d sum = _mm256_setzero_pd();
^
main.cpp:25:19: error: AVX vector return of type '__m256d' (vector of 4 'double' values) without 'avx' enabled changes the ABI
main.cpp:58:15: error: always_inline function '_mm256_add_pd' requires target feature 'avx', but would be inlined into function 'main' that is compiled without support for 'avx'
sum = _mm256_add_pd(sum, f(x));
^
main.cpp:58:15: error: AVX vector argument of type '__m256d' (vector of 4 'double' values) without 'avx' enabled changes the ABI
4 errors generated.
clang-12: error: no such file or directory: 'main.cpp.o'
clang-12: error: no input files

8）总结

即使加了 -mavx 使用AVX指令，也要和合适的优化级别（如 -O2 及以上）配合使用，否则AVX版本的性能不一定比串行实现好。

AI总结得还蛮好的。