int nloop = integral_len/2; // process two samples in each loop: x1*conj(y1) + x2*conj(y2) __m128 product_real,product_imag,tmp_y; __m128 sum_real = _mm_set_ps1(0.0f); __m128 sum_imag = _mm_set_ps1(0.0f); __m128 x; __m128 y;
for (int i=0; i < nloop; i++) { x = _mm_loadu_ps(input_i); // load data from memory to register: x4 x3 x2 x1 y = _mm_loadu_ps(input_d); // load data from memory to register: y4 y3 y2 y1 product_real = _mm_mul_ps(x, y); // x4*y4, x3*y3, x2*y2, x1*y1 sum_real = _mm_add_ps(sum_real, product_real); // Add results into sum register tmp_y = _mm_shuffle_ps(y,y, 0xb1); // [y4 y3 y2 y1] => [y3 y4 y1 y2] product_imag = _mm_mul_ps(x,tmp_y); // x4*y3, x3*y4, x2*y1, x1*y2 sum_imag = _mm_addsub_ps(sum_imag, product_imag); // add resutls into sum register: x4*y3, -x3*y4, x2*y1, -x1*y2 input_i+=2; // shift pointer 2*4 bytes input_d+=2; }
sum_real = _mm_hadd_ps(sum_real,sum_real); // x3*y3+x4*y4, x1*y1+x2*y2 sum_real = _mm_hadd_ps(sum_real,sum_real); // x1*y1+x2*y2+x3*y3+x4*y4 _mm_store_ss(&(corr_output[ii][0]),sum_real); // store the real part into memory sum_imag = _mm_hadd_ps(sum_imag,sum_imag); // -x3*y4+x4*y3, -x1*y2+x2*y1 sum_imag = _mm_hadd_ps(sum_imag,sum_imag); // -x1*y2+x2*y1-x3*y4+x4*y3 _mm_store_ss(&(corr_output[ii][1]),sum_imag); // store the imag part into memory
// the last sample if there is
if (integral_len%2==1) { // pointers are already moved to the last sample corr_output[ii][0] += input_i[0][0]*input_d[0][0] + input_i[0][1]*input_d[0][1]; corr_output[ii][1] += -input_i[0][0]*input_d[0][1] + input_i[0][1]*input_d[0][0]; }