1)首先上代码
#include <assert.h>
#include <immintrin.h>

#include <cfloat>
#include <ctime>
#include <iomanip>
#include <iostream>
using namespace std;
// Signature of an integrand that maps four packed doubles to four packed doubles.
using integrable = __m256d (*)(__m256d);
// Integrand f(x) = x^2, applied lane-wise to the four doubles packed in x.
static __m256d parabola(__m256d x)
{
    const __m256d squared = _mm256_mul_pd(x, x);
    return squared;
}
int main(int argc, char *argv[])
{
const unsigned long long steps = 100000000;
const unsigned int bytes_per_register = 32;
double a = 0;
double b = 1;
double h = (b - a) / (steps * 1.0);
__m256d sum = _mm256_setzero_pd();
double sum2 = 0.0;
clock_t begin = clock();
const unsigned int vals_per_register = bytes_per_register / sizeof(double);
assert(vals_per_register == 4);
const unsigned int total_blocks = steps / vals_per_register;
double x2[4];
for (unsigned int i = 0; i <= total_blocks; i++)
{
for (int j = 0; j < vals_per_register; j++) {
x2[j] = a + (i * vals_per_register + j) * h;
sum2 = sum2 + x2[j]*x2[j];
}
}
double total_sum = 0;
total_sum = sum2 * h;
cout << fixed << setprecision(DBL_DIG) << total_sum << endl;
clock_t end = clock();
cout << fixed << setprecision(DBL_DIG) << "ori =" << double(end - begin) / (1.0 * CLOCKS_PER_SEC) << " seconds." << endl;
integrable f = parabola;
begin = clock();
//vals_per_register = bytes_per_register / sizeof(double);
assert(vals_per_register == 4);
//total_blocks = steps / vals_per_register;
__m256d x;
for (unsigned int i = 0; i <= total_blocks; i++)
{
for (int j = 0; j < vals_per_register; j++) x[j] = a + (i * vals_per_register + j) * h;
sum = _mm256_add_pd(sum, f(x));
}
total_sum = 0;
for (int i = 0; i < vals_per_register; i++) total_sum += sum[i];
total_sum *= h;
cout << fixed << setprecision(DBL_DIG) << total_sum << endl;
end = clock();
cout << fixed << setprecision(DBL_DIG) << double(end - begin) / (1.0 * CLOCKS_PER_SEC) << " seconds." << endl;
return 0;
}
2)代码分析
上面的代码用左矩形法对 f(x)=x² 在 [0,1] 上做数值积分,核心就是求和操作。
其中串行实现部分(原文件第27-43行)是我添加的,因为之前 GitHub 上的代码没有串行实现及其结果。
3)-O0编译命令及结果
原来使用Cmake,make搞得很复杂。
实际上使用类似这样的命令就行
clang++ -mavx -O0 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
-mavx 表示启用 AVX 指令集
-O0 表示不做优化(O0 级)
运行结果:
0.333333368333224
ori =0.797198000000000 seconds.
0.333333368333363
1.199447000000000 seconds.
可见在 -O0 下,AVX 版本比串行版本还慢。
4)-O1编译命令及结果
clang++ -mavx -O1 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143298000000000 seconds.
0.333333368333363
0.796904000000000 seconds.
-O1有效果,对Ori提升更加明显
5)-O2编译命令及结果
clang++ -mavx -O2 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143289000000000 seconds.
0.333333368333363
0.068238000000000 seconds.
-O2就达到了我们想要的效果
6)-O3编译命令及结果:
clang++ -mavx -O3 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
0.333333368333224
ori =0.143300000000000 seconds.
0.333333368333363
0.068255000000000 seconds.
-O3和-O2效果一样
7)不加 -mavx 会报错:
clang++ -O3 -std=gnu++11 -o main.cpp.o -c main.cpp; clang++ -rdynamic main.cpp.o -o avx2_example; ./avx2_example
main.cpp:25:19: error: always_inline function '_mm256_setzero_pd' requires target feature 'avx', but would be inlined into function 'main' that is compiled without support for 'avx'
__m256d sum = _mm256_setzero_pd();
^
main.cpp:25:19: error: AVX vector return of type '__m256d' (vector of 4 'double' values) without 'avx' enabled changes the ABI
main.cpp:58:15: error: always_inline function '_mm256_add_pd' requires target feature 'avx', but would be inlined into function 'main' that is compiled without support for 'avx'
sum = _mm256_add_pd(sum, f(x));
^
main.cpp:58:15: error: AVX vector argument of type '__m256d' (vector of 4 'double' values) without 'avx' enabled changes the ABI
4 errors generated.
clang-12: error: no such file or directory: 'main.cpp.o'
clang-12: error: no input files
8)总结
即使加了 -mavx,也要配合合适的优化级别(如 -O2 及以上)一起使用,否则 AVX 版本不一定比串行版本快。
AI总结得还蛮好的。