Intel AVX测试效果,跟编译参数相关

1)首先上代码

#include <assert.h>
#include <immintrin.h>

#include <cfloat>
#include <ctime>
#include <iomanip>
#include <iostream>


using namespace std;

// Pointer to an integrand that takes one 256-bit register of packed doubles
// and returns one 256-bit register of packed doubles.
using integrable = __m256d (*)(__m256d);

/**
 * Integrand f(x) = x * x, applied lane-wise to a 256-bit register of
 * packed doubles.
 *
 * @param x  register holding the sample points.
 * @return   register whose lanes are the squares of the matching lanes of x.
 */
static __m256d parabola(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}


/**
 * Benchmark driver.
 *
 * Approximates the integral of f(x) = x * x over [a, b] = [0, 1] with a
 * left-rectangle sum of `steps` sample points, first with plain scalar
 * arithmetic and then with 256-bit AVX registers (four doubles at a time).
 * For each variant it prints the result and the processor time reported by
 * clock() for the computation.
 *
 * @param argc  unused.
 * @param argv  unused.
 * @return 0.
 */
int main(int argc, char *argv[])
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    const unsigned long long steps = 100000000;
    const unsigned int bytes_per_register = 32;
    const unsigned int vals_per_register = bytes_per_register / sizeof(double);
    static_assert(vals_per_register == 4,
                  "a 256-bit register must hold exactly four doubles");
    static_assert(steps % vals_per_register == 0,
                  "steps must be a whole number of register-sized blocks");
    const unsigned int total_blocks = steps / vals_per_register;

    const double a = 0;
    const double b = 1;
    const double h = (b - a) / (steps * 1.0);

    // ---- Scalar reference implementation ----
    clock_t begin = clock();
    double sum2 = 0.0;
    double x2[vals_per_register];
    // Exactly `steps` points x_k = a + k * h, k = 0 .. steps - 1, are summed.
    // The bound is `<`: with `<=` one extra block of four points at and
    // beyond b would be added, pushing the result above 1/3.
    for (unsigned int i = 0; i < total_blocks; i++)
    {
        for (unsigned int j = 0; j < vals_per_register; j++) {
            x2[j] = a + (i * vals_per_register + j) * h;
            sum2 = sum2 + x2[j] * x2[j];
        }
    }
    double total_sum = sum2 * h;
    // Stop the clock before printing so stream I/O is not part of the timing.
    clock_t end = clock();
    cout << fixed << setprecision(DBL_DIG) << total_sum << endl;
    cout << fixed << setprecision(DBL_DIG) << "ori =" << double(end - begin) / (1.0 * CLOCKS_PER_SEC) << " seconds." << endl;

    // ---- AVX implementation ----
    integrable f = parabola;
    __m256d sum = _mm256_setzero_pd();
    __m256d x = _mm256_setzero_pd();
    begin = clock();
    for (unsigned int i = 0; i < total_blocks; i++)
    {
        // Fill the four lanes with consecutive sample points. Subscripting a
        // __m256d is a GCC/Clang vector extension.
        for (unsigned int j = 0; j < vals_per_register; j++)
            x[j] = a + (i * vals_per_register + j) * h;
        // One packed multiply (inside f) and one packed add per four samples.
        sum = _mm256_add_pd(sum, f(x));
    }
    // Fold the four per-lane partial sums into a single scalar.
    total_sum = 0;
    for (unsigned int j = 0; j < vals_per_register; j++) total_sum += sum[j];
    total_sum *= h;
    end = clock();
    cout << fixed << setprecision(DBL_DIG) << total_sum << endl;
    cout << fixed << setprecision(DBL_DIG) << double(end - begin) / (1.0 * CLOCKS_PER_SEC) << " seconds." << endl;
    return 0;
}

2)代码分析

上面的代码用矩形法对 f(x)=x² 在 [0,1] 区间上做数值积分,核心就是一个求和操作。

我添加了第27-43行的内容(串行实现),因为之前Github上的代码没有串行实现的结果可供对比。

3)-O0编译命令及结果

原来使用Cmake,make搞得很复杂。

实际上使用类似这样的命令就行

clang++ -mavx -O0 -std=gnu++11  -o  main.cpp.o -c  main.cpp; clang++ -rdynamic  main.cpp.o -o avx2_example; ./avx2_example

-mavx使用AVX

-O0表示O0级优化

运行结果:

0.333333368333224
ori =0.797198000000000 seconds.
0.333333368333363
1.199447000000000 seconds.

可见在-O0下,AVX版本比串行版本还慢。

4)-O1编译命令及结果

clang++ -mavx -O1 -std=gnu++11  -o  main.cpp.o -c  main.cpp; clang++ -rdynamic  main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143298000000000 seconds.
0.333333368333363
0.796904000000000 seconds.
 

-O1有效果,而且对串行版本(ori)的提升更加明显。

5)-O2编译命令及结果

clang++ -mavx -O2 -std=gnu++11  -o  main.cpp.o -c  main.cpp; clang++ -rdynamic  main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143289000000000 seconds.
0.333333368333363
0.068238000000000 seconds.

-O2就达到了我们想要的效果:AVX版本的耗时大约只有串行版本的一半。

6)-O3编译命令及结果

clang++ -mavx -O3 -std=gnu++11  -o  main.cpp.o -c  main.cpp; clang++ -rdynamic  main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143300000000000 seconds.
0.333333368333363
0.068255000000000 seconds.
 

-O3和-O2效果一样

7)不加 -mavx 会报错:

clang++  -O3 -std=gnu++11  -o  main.cpp.o -c  main.cpp; clang++ -rdynamic  main.cpp.o -o avx2_example; ./avx2_example
main.cpp:25:19: error: always_inline function '_mm256_setzero_pd' requires target feature 'avx', but would be inlined into function 'main' that is compiled without support for 'avx'
    __m256d sum = _mm256_setzero_pd();
                  ^
main.cpp:25:19: error: AVX vector return of type '__m256d' (vector of 4 'double' values) without 'avx' enabled changes the ABI
main.cpp:58:15: error: always_inline function '_mm256_add_pd' requires target feature 'avx', but would be inlined into function 'main' that is compiled without support for 'avx'
        sum = _mm256_add_pd(sum, f(x));
              ^
main.cpp:58:15: error: AVX vector argument of type '__m256d' (vector of 4 'double' values) without 'avx' enabled changes the ABI
4 errors generated.
clang-12: error: no such file or directory: 'main.cpp.o'
clang-12: error: no input files
 

8)总结 

即使加了 -mavx,也要和优化选项(如 -O2)配合使用,否则AVX版本不一定比串行版本快。

AI总结得还蛮好的。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值