Bon, c'est pas gagné
Code:
void normalize_AVX_nt(const float *data_real, const float *data_imag,
float *out_real, float *out_imag,
const float *counts, size_t len)
{
size_t i;
assert(len % 8 == 0);
const __m256i* icnt = (const __m256i *) counts;
const __m256i* ireal = (const __m256i *) data_real;
const __m256i* iimag = (const __m256i *) data_imag;
// registers contain 256 bits, i.e 8 float words of 32 bits
//#pragma omp parallel for num_threads( NUM_THREADS )
for (i = 0; i < len; i+=8)
{
// cnt = counts[i]
// __m128i _mm_stream_load_si128 (__m128i *p);
// __m256 cnt = _mm256_stream_load_si256(&counts[i]);
__m256 cnt = _mm256_castsi256_ps( _mm256_stream_load_si256(&icnt[i]));
// ar = data_real[i], ai = data_imag[i],
// __m256 ar = _mm256_stream_load_ps(&data_real[i]);
// __m256 ai = _mm256_stream_load_ps(&data_imag[i]);
__m256 ar = _mm256_castsi256_ps( _mm256_stream_load_si256(&ireal[i]));
__m256 ai = _mm256_castsi256_ps( _mm256_stream_load_si256(&iimag[i]));
// cnt <= max ( 1, cnt )
cnt = _mm256_max_ps(cnt, _mm256_set1_ps(1.0f));
// div = 1 / cnt
__m256 div = _mm256_div_ps(_mm256_set1_ps(1.0f), cnt);
// data_real[i] *= div
__m256 resr = _mm256_mul_ps(ar, div);
__m256 resi = _mm256_mul_ps(ai, div);
// data_imag[i] /= cnt
_mm256_stream_ps(&out_real[i], resr);
_mm256_stream_ps(&out_imag[i], resi);;
}
}
L'erreur est assez incompréhensible, mais je ne suis plus en état de poursuivre. Je verrai demain avec des neurones régénérés, plus ou moins...
Code:
In file included from /usr/lib/gcc/x86_64-linux-gnu/4.9/include/immintrin.h:43:0,
from TestMoyenneAVX.cc:10:
/usr/lib/gcc/x86_64-linux-gnu/4.9/include/avx2intrin.h: In function ‘void normalize_AVX_nt(const float*, const float*, float*, float*, const float*, size_t)’:
/usr/lib/gcc/x86_64-linux-gnu/4.9/include/avx2intrin.h:905:1: error: inlining failed in call to always_inline ‘__m256i _mm256_stream_load_si256(const __m256i*)’: target specific option mismatch
_mm256_stream_load_si256 (__m256i const *__X)
^