34#include <lal/LALConstants.h>
35#include <lal/SinCosLUT.h>
45#define OOTWOPI (1.0 / LAL_TWOPI)
47#define LUT_RES_F (1.0 * LUT_RES)
48#define OO_LUT_RES (1.0 / LUT_RES)
49#define X_TO_IND (1.0 * LUT_RES * OOTWOPI )
50#define IND_TO_X (LAL_TWOPI * OO_LUT_RES)
60 UINT4 paddedLength = length + align - 1;
62 size_t remBytes = ( ( size_t )
vector->data0 ) % align;
63 size_t offsetBytes = ( align - remBytes ) % align;
64 vector->data = (
void * )( ( (
char * )
vector->data0 ) + offsetBytes );
83 for (
UINT4 ii = 0; ii < length; ii++ ) {
105 for (
UINT4 ii = 0; ii < length; ii++ ) {
127 REAL4VectorAligned *delta0_int = NULL, *delta1_int = NULL, *roundedDelta0_int = NULL, *sinPiDelta0 = NULL, *sinPiDelta1 = NULL, *PiDelta = NULL, *cosPiDelta0 = NULL, *cosPiDelta1 = NULL, *realTerms = NULL, *realTerms2 = NULL, *imagTerms = NULL, *imagTerms2 = NULL;
141 for (
UINT4 ii = 0; ii < delta0_int->
length; ii++ ) {
143 delta1_int->data[ii] = (
REAL4 )( delta1->
data[ii] );
160 if ( fabsf( delta1_int->data[ii] ) < (
REAL4 )1.0e-6 ) {
161 if ( fabsf( delta0_int->
data[ii] ) < (
REAL4 )1.0e-6 ) {
163 }
else if ( fabsf( (
REAL4 )( delta0_int->
data[ii]*delta0_int->
data[ii] - 1.0 ) ) < (
REAL4 )1.0e-6 ) {
165 }
else if ( fabsf( (
REAL4 )( delta0_int->
data[ii] - roundedDelta0_int->data[ii] ) ) < (
REAL4 )1.0e-6 ) {
168 output->data[ii] = -
LAL_PI *
crectf( cosPiDelta0->data[ii], -sinPiDelta0->data[ii] ) * delta0_int->
data[ii] * ( delta0_int->
data[ii] * delta0_int->
data[ii] - 1.0 ) / sinPiDelta0->data[ii];
170 }
else if ( fabsf( (
REAL4 )( delta1_int->data[ii]*delta1_int->data[ii] - 1.0 ) ) < (
REAL4 )1.0e-6 ) {
171 if ( fabsf( delta0_int->
data[ii] ) < (
REAL4 )1.0e-6 ) {
173 }
else if ( fabsf( (
REAL4 )( delta0_int->
data[ii]*delta0_int->
data[ii] - 1.0 ) ) < (
REAL4 )1.0e-6 ) {
175 }
else if ( fabsf( (
REAL4 )( delta0_int->
data[ii] - roundedDelta0_int->data[ii] ) ) < (
REAL4 )1.0e-6 ) {
178 output->data[ii] = -
LAL_PI_2 *
crectf( -cosPiDelta0->data[ii], sinPiDelta0->data[ii] ) * delta0_int->
data[ii] * ( delta0_int->
data[ii] * delta0_int->
data[ii] - 1.0 ) / sinPiDelta0->data[ii];
180 }
else if ( fabsf( delta0_int->
data[ii] ) < (
REAL4 )1.0e-6 ) {
181 output->data[ii] = -
LAL_1_PI *
crectf( cosPiDelta1->data[ii], sinPiDelta1->data[ii] ) * sinPiDelta1->data[ii] / ( delta1_int->data[ii] * ( delta1_int->data[ii] * delta1_int->data[ii] - 1.0 ) );
182 }
else if ( fabsf( (
REAL4 )( delta0_int->
data[ii]*delta0_int->
data[ii] - 1.0 ) ) < (
REAL4 )1.0e-6 ) {
183 output->data[ii] =
LAL_2_PI *
crectf( cosPiDelta1->data[ii], sinPiDelta1->data[ii] ) * sinPiDelta1->data[ii] / ( delta1_int->data[ii] * ( delta1_int->data[ii] * delta1_int->data[ii] - 1.0 ) );
184 }
else if ( fabsf( (
REAL4 )( delta0_int->
data[ii] - roundedDelta0_int->data[ii] ) ) < (
REAL4 )1.0e-6 ) {
187 output->data[ii] =
scaling->data[ii] * sinPiDelta1->data[ii] / sinPiDelta0->data[ii] *
crectf( realTerms->data[ii], imagTerms->data[ii] );
227 a = input1->
data + offset1;
228 b = input2->
data + offset2;
233 *
c = ( *a ) * ( *b );
246 if ( vectorMath == 1 ) {
248 }
else if ( vectorMath == 2 ) {
250 }
else for (
UINT4 ii = 0; ii < input1->
length; ii++ ) {
259 if ( vectorMath == 1 ) {
261 }
else if ( vectorMath == 2 ) {
263 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
272 if ( vectorMath == 1 ) {
274 }
else if ( vectorMath == 2 ) {
276 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
285 if ( vectorMath == 1 ) {
287 }
else if ( vectorMath == 2 ) {
289 }
else for (
UINT4 ii = 0; ii < input1->
length; ii++ ) {
298 if ( vectorMath == 1 ) {
300 }
else if ( vectorMath == 2 ) {
302 }
else for (
UINT4 ii = 0; ii < input1->
length; ii++ ) {
311 if ( vectorMath == 1 ) {
313 }
else if ( vectorMath == 2 ) {
315 }
else for (
UINT4 ii = 0; ii < input1->
length; ii++ ) {
324 if ( vectorMath == 1 ) {
326 }
else if ( vectorMath == 2 ) {
328 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
337 if ( vectorMath == 2 ) {
341 __m256d *arr, *result;
342 arr = ( __m256d * )(
void * )input->
data;
343 result = ( __m256d * )(
void * )
output->data;
344 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
345 *result = _mm256_floor_pd( *arr );
350 for (
UINT4 ii = 4 * roundedvectorlength; ii < input->
length; ii++ ) {
356 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
359 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
368 if ( vectorMath == 2 ) {
372 __m256 *arr, *result;
373 arr = ( __m256 * )(
void * )input->
data;
374 result = ( __m256 * )(
void * )
output->data;
375 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
376 *result = _mm256_round_ps( *arr, _MM_FROUND_TO_NEAREST_INT );
381 for (
UINT4 ii = 8 * roundedvectorlength; ii < input->
length; ii++ ) {
387 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
390 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
399 if ( vectorMath == 2 ) {
403 __m256d *arr, *result;
404 arr = ( __m256d * )(
void * )input->
data;
405 result = ( __m256d * )(
void * )
output->data;
406 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
407 *result = _mm256_round_pd( *arr, _MM_FROUND_TO_NEAREST_INT );
412 for (
UINT4 ii = 4 * roundedvectorlength; ii < input->
length; ii++ ) {
418 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
421 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
430 if ( vectorMath == 2 ) {
434 __m256 *arr, *result;
435 __m256i mask = _mm256_set1_epi32( ~0x80000000 );
436 arr = ( __m256 * )(
void * )input->
data;
437 result = ( __m256 * )(
void * )
output->data;
438 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
439 *result = _mm256_and_ps( *arr, ( __m256 )mask );
444 for (
UINT4 ii = 8 * roundedvectorlength; ii < input->
length; ii++ ) {
450 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
453 }
else if ( vectorMath == 1 ) {
456 __m128 *arr, *result;
457 __m128i mask = _mm_set1_epi32( ~0x80000000 );
458 arr = ( __m128 * )(
void * )input->
data;
459 result = ( __m128 * )(
void * )
output->data;
460 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
461 *result = _mm_and_ps( *arr, ( __m128 )mask );
465 for (
UINT4 ii = 4 * roundedvectorlength; ii < input->
length; ii++ ) {
471 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
474 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
483 if ( vectorMath == 2 ) {
487 __m256d *arr, *result;
488 __m256i mask = _mm256_set1_epi64x( ~0x8000000000000000 );
489 arr = ( __m256d * )(
void * )input->
data;
490 result = ( __m256d * )(
void * )
output->data;
491 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
492 *result = _mm256_and_pd( *arr, ( __m256d )mask );
497 for (
UINT4 ii = 4 * roundedvectorlength; ii < input->
length; ii++ ) {
503 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
506 }
else if ( vectorMath == 1 ) {
509 __m128d *arr, *result;
510 __m128i mask = _mm_set1_epi64x( ~0x8000000000000000 );
511 arr = ( __m128d * )(
void * )input->
data;
512 result = ( __m128d * )(
void * )
output->data;
513 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
514 *result = _mm_and_pd( *arr, ( __m128d )mask );
518 for (
UINT4 ii = 2 * roundedvectorlength; ii < input->
length; ii++ ) {
524 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
527 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
536 if ( vectorMath == 2 ) {
540 __m256 *result = ( __m256 * )(
void * )
output->data;
542 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
543 __m256 input_re = _mm256_set_ps( (
float )crealf( input->
data[position + 7] ), (
float )crealf( input->
data[position + 6] ), (
float )crealf( input->
data[position + 5] ), (
float )crealf( input->
data[position + 4] ), (
float )crealf( input->
data[position + 3] ), (
float )crealf( input->
data[position + 2] ), (
float )crealf( input->
data[position + 1] ), (
float )crealf( input->
data[position] ) );
544 __m256 input_im = _mm256_set_ps( (
float )cimagf( input->
data[position + 7] ), (
float )cimagf( input->
data[position + 6] ), (
float )cimagf( input->
data[position + 5] ), (
float )cimagf( input->
data[position + 4] ), (
float )cimagf( input->
data[position + 3] ), (
float )cimagf( input->
data[position + 2] ), (
float )cimagf( input->
data[position + 1] ), (
float )cimagf( input->
data[position] ) );
545 __m256 input_re_sq = _mm256_mul_ps( input_re, input_re );
546 __m256 input_im_sq = _mm256_mul_ps( input_im, input_im );
547 __m256 sqsum = _mm256_add_ps( input_re_sq, input_im_sq );
548 *result = _mm256_sqrt_ps( sqsum );
553 for (
UINT4 ii = 8 * roundedvectorlength; ii < input->
length; ii++ ) {
559 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
562 }
else if ( vectorMath == 1 ) {
565 __m128 *result = ( __m128 * )(
void * )
output->data;
567 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
568 __m128 input_re = _mm_set_ps( (
float )crealf( input->
data[position + 3] ), (
float )crealf( input->
data[position + 2] ), (
float )crealf( input->
data[position + 1] ), (
float )crealf( input->
data[position] ) );
569 __m128 input_im = _mm_set_ps( (
float )cimagf( input->
data[position + 3] ), (
float )cimagf( input->
data[position + 2] ), (
float )cimagf( input->
data[position + 1] ), (
float )cimagf( input->
data[position] ) );
570 __m128 input_re_sq = _mm_mul_ps( input_re, input_re );
571 __m128 input_im_sq = _mm_mul_ps( input_im, input_im );
572 __m128 sqsum = _mm_add_ps( input_re_sq, input_im_sq );
573 *result = _mm_sqrt_ps( sqsum );
577 for (
UINT4 ii = 4 * roundedvectorlength; ii < input->
length; ii++ ) {
583 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
586 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
594 if ( vectorMath == 2 ) {
598 __m256d *result = ( __m256d * )(
void * )
output->data;
600 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
601 __m256d input_re = _mm256_set_pd( (
double )crealf( input->
data[position + 3] ), (
double )crealf( input->
data[position + 2] ), (
double )crealf( input->
data[position + 1] ), (
double )crealf( input->
data[position] ) );
602 __m256d input_im = _mm256_set_pd( (
double )cimagf( input->
data[position + 3] ), (
double )cimagf( input->
data[position + 2] ), (
double )cimagf( input->
data[position + 1] ), (
double )cimagf( input->
data[position] ) );
603 __m256d input_re_sq = _mm256_mul_pd( input_re, input_re );
604 __m256d input_im_sq = _mm256_mul_pd( input_im, input_im );
605 __m256d sqsum = _mm256_add_pd( input_re_sq, input_im_sq );
606 *result = _mm256_sqrt_pd( sqsum );
611 for (
UINT4 ii = 4 * roundedvectorlength; ii < input->
length; ii++ ) {
617 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
620 }
else if ( vectorMath == 1 ) {
623 __m128d *result = ( __m128d * )(
void * )
output->data;
625 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
626 __m128d input_re = _mm_set_pd( (
double )crealf( input->
data[position + 1] ), (
double )crealf( input->
data[position] ) );
627 __m128d input_im = _mm_set_pd( (
double )cimagf( input->
data[position + 1] ), (
double )cimagf( input->
data[position] ) );
628 __m128d input_re_sq = _mm_mul_pd( input_re, input_re );
629 __m128d input_im_sq = _mm_mul_pd( input_im, input_im );
630 __m128d sqsum = _mm_add_pd( input_re_sq, input_im_sq );
631 *result = _mm_sqrt_pd( sqsum );
635 for (
UINT4 ii = 2 * roundedvectorlength; ii < input->
length; ii++ ) {
641 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
644 }
else for (
UINT4 ii = 0; ii < input->
length; ii++ ) {
663 __m128d *arr1, *arr2, *result;
664 arr1 = ( __m128d * )(
void * )input1->
data;
665 arr2 = ( __m128d * )(
void * )input2->
data;
666 result = ( __m128d * )(
void * )
output->data;
669 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
670 *result = _mm_add_pd( *arr1, *arr2 );
677 for (
UINT4 ii = 2 * roundedvectorlength; ii < input1->
length; ii++ ) {
686 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
709 __m256d *arr1, *arr2, *result;
710 arr1 = ( __m256d * )(
void * )input1->
data;
711 arr2 = ( __m256d * )(
void * )input2->
data;
712 result = ( __m256d * )(
void * )
output->data;
715 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
716 *result = _mm256_add_pd( *arr1, *arr2 );
723 for (
UINT4 ii = 4 * roundedvectorlength; ii < input1->
length; ii++ ) {
735 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
754 __m128 *arr1, *arr2, *result;
755 arr1 = ( __m128 * )(
void * )input1->
data;
756 arr2 = ( __m128 * )(
void * )input2->
data;
757 result = ( __m128 * )(
void * )
output->data;
760 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
761 *result = _mm_sub_ps( *arr1, *arr2 );
768 for (
UINT4 ii = 4 * roundedvectorlength; ii < input1->
length; ii++ ) {
777 fprintf( stderr,
"%s: Failed because SSE is not supported, possibly because -msse flag wasn't used for compiling.\n",
__func__ );
799 __m256 *arr1, *arr2, *result;
800 arr1 = ( __m256 * )(
void * )input1->
data;
801 arr2 = ( __m256 * )(
void * )input2->
data;
802 result = ( __m256 * )(
void * )
output->data;
805 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
806 *result = _mm256_sub_ps( *arr1, *arr2 );
813 for (
UINT4 ii = 8 * roundedvectorlength; ii < input1->
length; ii++ ) {
825 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
844 __m128d *arr1, *arr2, *result;
845 arr1 = ( __m128d * )(
void * )input1->
data;
846 arr2 = ( __m128d * )(
void * )input2->
data;
847 result = ( __m128d * )(
void * )
output->data;
850 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
851 *result = _mm_sub_pd( *arr1, *arr2 );
858 for (
UINT4 ii = 2 * roundedvectorlength; ii < input1->
length; ii++ ) {
867 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
889 __m256d *arr1, *arr2, *result;
890 arr1 = ( __m256d * )(
void * )input1->
data;
891 arr2 = ( __m256d * )(
void * )input2->
data;
892 result = ( __m256d * )(
void * )
output->data;
895 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
896 *result = _mm256_sub_pd( *arr1, *arr2 );
903 for (
UINT4 ii = 4 * roundedvectorlength; ii < input1->
length; ii++ ) {
915 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
934 __m128d *arr1, *arr2, *result;
935 arr1 = ( __m128d * )(
void * )input1->
data;
936 arr2 = ( __m128d * )(
void * )input2->
data;
937 result = ( __m128d * )(
void * )
output->data;
940 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
941 *result = _mm_mul_pd( *arr1, *arr2 );
948 for (
INT4 ii = 2 * roundedvectorlength; ii < (
INT4 )input1->
length; ii++ ) {
957 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
980 __m256d *arr1, *arr2, *result;
981 arr1 = ( __m256d * )(
void * )input1->
data;
982 arr2 = ( __m256d * )(
void * )input2->
data;
983 result = ( __m256d * )(
void * )
output->data;
986 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
987 *result = _mm256_mul_pd( *arr1, *arr2 );
994 for (
INT4 ii = 4 * roundedvectorlength; ii < (
INT4 )input1->
length; ii++ ) {
1006 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
1024 __m128d *arr1, *result;
1025 arr1 = ( __m128d * )(
void * )input1->
data;
1026 result = ( __m128d * )(
void * )
output->data;
1028 __m128d one = _mm_set1_pd( 1.0 );
1031 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1032 *result = _mm_div_pd( one, *arr1 );
1038 for (
INT4 ii = 2 * roundedvectorlength; ii < (
INT4 )input1->
length; ii++ ) {
1046 fprintf( stderr,
"%s: Failed because SSE is not supported, possibly because -msse flag wasn't used for compiling.\n",
__func__ );
1068 __m256d *arr1, *result;
1069 arr1 = ( __m256d * )(
void * )input1->
data;
1070 result = ( __m256d * )(
void * )
output->data;
1072 __m256d one = _mm256_set1_pd( 1.0 );
1075 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1076 *result = _mm256_div_pd( one, *arr1 );
1082 for (
INT4 ii = 4 * roundedvectorlength; ii < (
INT4 )input1->
length; ii++ ) {
1093 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
1112 __m128d *arr1, *result;
1113 arr1 = ( __m128d * )(
void * )input->
data;
1114 result = ( __m128d * )(
void * )
output->data;
1116 __m128d scalefactor = _mm_set1_pd( scalar );
1119 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1120 *result = _mm_add_pd( *arr1, scalefactor );
1126 for (
INT4 ii = 2 * roundedvectorlength; ii < (
INT4 )input->
length; ii++ ) {
1135 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
1158 __m256d *arr1, *result;
1159 arr1 = ( __m256d * )(
void * )input->
data;
1160 result = ( __m256d * )(
void * )
output->data;
1162 __m256d scalefactor = _mm256_set1_pd( scalar );
1165 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1166 *result = _mm256_add_pd( *arr1, scalefactor );
1172 for (
INT4 ii = 4 * roundedvectorlength; ii < (
INT4 )input->
length; ii++ ) {
1184 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
1203 __m128d *arr1, *result;
1204 arr1 = ( __m128d * )(
void * )input->
data;
1205 result = ( __m128d * )(
void * )
output->data;
1207 __m128d scalefactor = _mm_set1_pd(
scale );
1210 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1211 *result = _mm_mul_pd( *arr1, scalefactor );
1217 for (
UINT4 ii = 2 * roundedvectorlength; ii < input->
length; ii++ ) {
1226 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
1249 __m256d *arr1, *result;
1250 arr1 = ( __m256d * )(
void * )input->
data;
1251 result = ( __m256d * )(
void * )
output->data;
1253 __m256d scalefactor = _mm256_set1_pd(
scale );
1256 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1257 *result = _mm256_mul_pd( *arr1, scalefactor );
1263 for (
UINT4 ii = 4 * roundedvectorlength; ii < input->
length; ii++ ) {
1275 fprintf( stderr,
"%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n",
__func__ );
1294 INT4 vec1 = vectorpos1, vec2 = vectorpos2, outvec = outputvectorpos;
1295 for (
INT4 ii = 0; ii < numvectors; ii++ ) {
1615 __m128d *
x, *result;
1616 x = ( __m128d * )(
void * )input->
data;
1617 result = ( __m128d * )(
void * )
output->data;
1619 __m128i expoffset = _mm_set_epi32( 0, 1023, 0, 1023 );
1620 __m128i maskupper32bits = _mm_set_epi32( 0xffffffff, 0x00000000, 0xffffffff, 0x00000000 );
1621 __m128i masklower32bits = _mm_set_epi32( 0x00000000, 0xffffffff, 0x00000000, 0xffffffff );
1622 __m128d log2e = _mm_set1_pd( 1.442695040888963 );
1623 __m128d onehalf = _mm_set1_pd( 0.5 );
1624 __m128d one = _mm_set1_pd( 1.0 );
1625 __m128d two = _mm_set1_pd( 2.0 );
1626 __m128d maxexp = _mm_set1_pd( 7.007827128933840e+02 );
1627 __m128d minexp = _mm_set1_pd( -7.007827128933840e+02 );
1629 __m128d cephes_c1 = _mm_set1_pd( 6.93145751953125e-1 );
1630 __m128d cephes_c2 = _mm_set1_pd( 1.42860682030941723212e-6 );
1631 __m128d cephes_p0 = _mm_set1_pd( 1.26177193074810590878e-4 );
1632 __m128d cephes_p1 = _mm_set1_pd( 3.02994407707441961300e-2 );
1633 __m128d cephes_p2 = _mm_set1_pd( 9.99999999999999999910e-1 );
1634 __m128d cephes_q0 = _mm_set1_pd( 3.00198505138664455042e-6 );
1635 __m128d cephes_q1 = _mm_set1_pd( 2.52448340349684104192e-3 );
1636 __m128d cephes_q2 = _mm_set1_pd( 2.27265548208155028766e-1 );
1637 __m128d cephes_q3 = _mm_set1_pd( 2.00000000000000000009e0 );
1639 for (
INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1641 y = _mm_max_pd(
y, minexp );
1642 y = _mm_min_pd(
y, maxexp );
1645 __m128d log2etimesx = _mm_mul_pd( log2e,
y );
1648 __m128d log2etimesxplushalf = _mm_add_pd( log2etimesx, onehalf );
1649 __m128i log2etimesx_rounded_i = _mm_cvttpd_epi32( log2etimesxplushalf );
1650 __m128d log2etimesx_rounded = _mm_cvtepi32_pd( log2etimesx_rounded_i );
1651 __m128d mask = _mm_cmpgt_pd( log2etimesx_rounded, log2etimesxplushalf );
1652 mask = _mm_and_pd( mask, one );
1653 log2etimesx_rounded = _mm_sub_pd( log2etimesx_rounded, mask );
1654 log2etimesx_rounded_i = _mm_cvttpd_epi32( log2etimesx_rounded );
1657 __m128d log2etimesx_rounded_times_c1 = _mm_mul_pd( log2etimesx_rounded, cephes_c1 );
1658 __m128d log2etimesx_rounded_times_c2 = _mm_mul_pd( log2etimesx_rounded, cephes_c2 );
1659 y = _mm_sub_pd(
y, log2etimesx_rounded_times_c1 );
1660 y = _mm_sub_pd(
y, log2etimesx_rounded_times_c2 );
1663 __m128d xsq = _mm_mul_pd(
y,
y );
1667 __m128d polevlresult = cephes_p0;
1668 polevlresult = _mm_mul_pd( polevlresult, xsq );
1669 polevlresult = _mm_add_pd( polevlresult, cephes_p1 );
1670 polevlresult = _mm_mul_pd( polevlresult, xsq );
1671 polevlresult = _mm_add_pd( polevlresult, cephes_p2 );
1672 __m128d xtimespolresult = _mm_mul_pd(
y, polevlresult );
1675 polevlresult = cephes_q0;
1676 polevlresult = _mm_mul_pd( polevlresult, xsq );
1677 polevlresult = _mm_add_pd( polevlresult, cephes_q1 );
1678 polevlresult = _mm_mul_pd( polevlresult, xsq );
1679 polevlresult = _mm_add_pd( polevlresult, cephes_q2 );
1680 polevlresult = _mm_mul_pd( polevlresult, xsq );
1681 polevlresult = _mm_add_pd( polevlresult, cephes_q3 );
1682 __m128d polevlresult_minus_xtimespolresult = _mm_sub_pd( polevlresult, xtimespolresult );
1683 y = _mm_div_pd( xtimespolresult, polevlresult_minus_xtimespolresult );
1686 y = _mm_mul_pd(
y, two );
1687 y = _mm_add_pd(
y, one );
1692 __m128i log2etimesx_rounded_i_64bit = _mm_unpacklo_epi32( log2etimesx_rounded_i, log2etimesx_rounded_i );
1693 __m128i maskedlower32 = _mm_and_si128( log2etimesx_rounded_i_64bit, masklower32bits );
1694 log2etimesx_rounded_i_64bit = _mm_srai_epi32( log2etimesx_rounded_i_64bit, 32 );
1695 log2etimesx_rounded_i_64bit = _mm_and_si128( log2etimesx_rounded_i_64bit, maskupper32bits );
1696 log2etimesx_rounded_i_64bit = _mm_xor_si128( log2etimesx_rounded_i_64bit, maskedlower32 );
1699 __m128i log2etimesx_rounded_i_with_offset = _mm_add_epi64( log2etimesx_rounded_i_64bit, expoffset );
1700 log2etimesx_rounded_i_with_offset = _mm_slli_epi64( log2etimesx_rounded_i_with_offset, 52 );
1701 __m128d pow2n = _mm_castsi128_pd( log2etimesx_rounded_i_with_offset );
1704 *result = _mm_mul_pd(
y, pow2n );
1712 for (
INT4 ii = 2 * roundedvectorlength; ii < (
INT4 )input->
length; ii++ ) {
1720 fprintf( stderr,
"%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n",
__func__ );
#define __func__
log an I/O error, i.e.
const double scale
multiplicative scaling factor of the coordinate
void * XLALMalloc(size_t n)
int XLALVectorSinCosREAL4(REAL4 *out1, REAL4 *out2, const REAL4 *in, const UINT4 len)
int XLALVectorScaleREAL4(REAL4 *out, REAL4 scalar, const REAL4 *in, const UINT4 len)
void XLALDestroyREAL4VectorAligned(REAL4VectorAligned *in)
REAL4VectorAligned * XLALCreateREAL4VectorAligned(const UINT4 length, const UINT4 align)
int XLALVectorAddREAL4(REAL4 *out, const REAL4 *in1, const REAL4 *in2, const UINT4 len)
int XLALVectorMultiplyREAL4(REAL4 *out, const REAL4 *in1, const REAL4 *in2, const UINT4 len)
#define XLAL_CHECK(assertion,...)
#define XLAL_CHECK_NULL(assertion,...)
REAL4VectorAligned ** data
alignedREAL8Vector ** data
INT4 avxSSVectorSubtract(REAL4VectorAligned *output, REAL4VectorAligned *input1, REAL4VectorAligned *input2)
Subtract two REAL4VectorAligned using AVX.
INT4 avxScaleREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scale)
Scale the elements of an alignedREAL8Vector by a REAL8 value using AVX.
INT4 sseScaleREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scale)
Scale the elements of an alignedREAL8Vector by a REAL8 value using SSE.
INT4 VectorAbsREAL4(REAL4VectorAligned *output, REAL4VectorAligned *input, INT4 vectorMath)
INT4 VectorShiftREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 shift, INT4 vectorMath)
void destroyAlignedREAL8Vector(alignedREAL8Vector *vector)
INT4 avxDDVectorSum(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Sum two alignedREAL8Vectors using AVX.
INT4 DirichletRatioVector(COMPLEX8Vector *output, alignedREAL8Vector *delta0, alignedREAL8Vector *delta1, alignedREAL8Vector *scaling, const UserInput_t *params)
INT4 sseSSVectorSubtract(REAL4VectorAligned *output, REAL4VectorAligned *input1, REAL4VectorAligned *input2)
Subtract two REAL4VectorAligned using SSE.
INT4 sse_exp_REAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input)
Exponential of input vector is computed using SSE, based on the Cephes library.
INT4 VectorCabsfCOMPLEX8(REAL4VectorAligned *output, COMPLEX8Vector *input, INT4 vectorMath)
INT4 VectorInvertREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, INT4 vectorMath)
INT4 sseDDVectorSum(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Sum two alignedREAL8Vectors using SSE2.
INT4 VectorRoundREAL4(REAL4VectorAligned *output, REAL4VectorAligned *input, INT4 vectorMath)
INT4 avxAddScalarToREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scalar)
Add a REAL8 scalar value to the elements of an alignedREAL8Vector using AVX.
INT4 avxDDVectorMultiply(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Multiply two alignedREAL8Vectors using AVX.
void destroyAlignedREAL8VectorArray(alignedREAL8VectorArray *array)
INT4 VectorRoundREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, INT4 vectorMath)
INT4 VectorArraySum(REAL4VectorAlignedArray *output, REAL4VectorAlignedArray *input1, REAL4VectorAlignedArray *input2, INT4 vectorpos1, INT4 vectorpos2, INT4 outputvectorpos, INT4 numvectors)
Sum vectors from REAL4VectorAlignedArrays into an output REAL4VectorAlignedArray using SIMD.
INT4 VectorAddREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2, INT4 vectorMath)
alignedREAL8VectorArray * createAlignedREAL8VectorArray(const UINT4 length, const UINT4 vectorLength, const size_t align)
INT4 VectorFloorREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, INT4 vectorMath)
REAL4VectorAlignedArray * createREAL4VectorAlignedArray(const UINT4 length, const UINT4 vectorLength, const size_t align)
INT4 VectorScaleREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scale, INT4 vectorMath)
void destroyREAL4VectorAlignedArray(REAL4VectorAlignedArray *array)
INT4 avxInvertREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input1)
Invert an alignedREAL8Vector using AVX.
INT4 VectorAbsREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, INT4 vectorMath)
INT4 avxDDVectorSubtract(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Subtract two alignedREAL8Vectors using AVX.
INT4 fastSSVectorMultiply_with_stride_and_offset(REAL4VectorAligned *output, const REAL4VectorAligned *input1, const REAL4VectorAligned *input2, const INT4 stride1, const INT4 stride2, const INT4 offset1, const INT4 offset2)
Computes a multiplication of two vectors with a stride and initial offset.
INT4 VectorSubtractREAL4(REAL4VectorAligned *output, REAL4VectorAligned *input1, REAL4VectorAligned *input2, INT4 vectorMath)
INT4 sseAddScalarToREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scalar)
Add a REAL8 scalar value to the elements of an alignedREAL8Vector using SSE.
INT4 sseDDVectorSubtract(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Subtract two alignedREAL8Vectors using SSE.
alignedREAL8Vector * createAlignedREAL8Vector(UINT4 length, const size_t align)
INT4 sseDDVectorMultiply(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Multiply two alignedREAL8Vectors using SSE.
INT4 sseInvertREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input1)
Invert an alignedREAL8Vector using SSE.
INT4 VectorSubtractREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2, INT4 vectorMath)
INT4 VectorMultiplyREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2, INT4 vectorMath)
INT4 VectorCabsCOMPLEX8(alignedREAL8Vector *output, COMPLEX8Vector *input, INT4 vectorMath)