/*
 * vectormath.c — LALPulsar vector math routines
 * (Doxygen page navigation text removed; this is the source listing.)
 */
1/*
2 * Copyright (C) 2011, 2012, 2014, 2015 Evan Goetz
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; see the file COPYING. If not, write to the
16 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * MA 02110-1301 USA
18 */
19
20
21
22#ifdef __SSE__
23#include <xmmintrin.h>
24#endif
25#ifdef __SSE2__
26#include <emmintrin.h>
27#endif
28#ifdef __AVX__
29#include <immintrin.h>
30#endif
31
32#include <math.h>
33
34#include <lal/LALConstants.h>
35#include <lal/SinCosLUT.h>
36
37#include "vectormath.h"
38
39#ifdef HAVE_STDINT_H
40#include <stdint.h>
41#else
42typedef size_t uintptr_t;
43#endif
44
45#define OOTWOPI (1.0 / LAL_TWOPI)
46#define LUT_RES 1024 /* resolution of lookup-table */
47#define LUT_RES_F (1.0 * LUT_RES)
48#define OO_LUT_RES (1.0 / LUT_RES)
49#define X_TO_IND (1.0 * LUT_RES * OOTWOPI )
50#define IND_TO_X (LAL_TWOPI * OO_LUT_RES)
51#define TRUE (1==1)
52#define FALSE (1==0)
53
54
56{
58 XLAL_CHECK_NULL( ( vector = XLALMalloc( sizeof( *vector ) ) ) != NULL, XLAL_ENOMEM );
59 vector->length = length;
60 UINT4 paddedLength = length + align - 1;
61 XLAL_CHECK_NULL( ( vector->data0 = XLALMalloc( paddedLength * sizeof( REAL8 ) ) ) != NULL, XLAL_ENOMEM );
62 size_t remBytes = ( ( size_t )vector->data0 ) % align;
63 size_t offsetBytes = ( align - remBytes ) % align;
64 vector->data = ( void * )( ( ( char * )vector->data0 ) + offsetBytes );
65 return vector;
66}
68{
69 if ( !vector ) {
70 return;
71 }
72 if ( vector->data0 ) {
73 XLALFree( vector->data0 );
74 }
76}
77alignedREAL8VectorArray *createAlignedREAL8VectorArray( const UINT4 length, const UINT4 vectorLength, const size_t align )
78{
79 alignedREAL8VectorArray *array = NULL;
80 XLAL_CHECK_NULL( ( array = XLALMalloc( sizeof( *array ) ) ) != NULL, XLAL_ENOMEM );
81 array->length = length;
82 XLAL_CHECK_NULL( ( array->data = XLALMalloc( sizeof( *( array->data ) ) * array->length ) ) != NULL, XLAL_ENOMEM );
83 for ( UINT4 ii = 0; ii < length; ii++ ) {
84 XLAL_CHECK_NULL( ( array->data[ii] = createAlignedREAL8Vector( vectorLength, align ) ) != NULL, XLAL_EFUNC );
85 }
86 return array;
87}
89{
90 if ( !array ) {
91 return;
92 }
93 for ( UINT4 ii = 0; ii < array->length; ii++ ) {
94 destroyAlignedREAL8Vector( array->data[ii] );
95 }
96 XLALFree( array->data );
97 XLALFree( array );
98}
99REAL4VectorAlignedArray *createREAL4VectorAlignedArray( const UINT4 length, const UINT4 vectorLength, const size_t align )
100{
101 REAL4VectorAlignedArray *array = NULL;
102 XLAL_CHECK_NULL( ( array = XLALMalloc( sizeof( *array ) ) ) != NULL, XLAL_ENOMEM );
103 array->length = length;
104 XLAL_CHECK_NULL( ( array->data = XLALMalloc( sizeof( *( array->data ) ) * array->length ) ) != NULL, XLAL_ENOMEM );
105 for ( UINT4 ii = 0; ii < length; ii++ ) {
106 XLAL_CHECK_NULL( ( array->data[ii] = XLALCreateREAL4VectorAligned( vectorLength, align ) ) != NULL, XLAL_EFUNC );
107 }
108 return array;
109}
111{
112 if ( !array ) {
113 return;
114 }
115 for ( UINT4 ii = 0; ii < array->length; ii++ ) {
117 }
118 XLALFree( array->data );
119 XLALFree( array );
120}
121
123{
124
125 XLAL_CHECK( output != NULL && delta0 != NULL && delta1 != NULL && scaling != NULL && params != NULL, XLAL_EFUNC );
126
127 REAL4VectorAligned *delta0_int = NULL, *delta1_int = NULL, *roundedDelta0_int = NULL, *sinPiDelta0 = NULL, *sinPiDelta1 = NULL, *PiDelta = NULL, *cosPiDelta0 = NULL, *cosPiDelta1 = NULL, *realTerms = NULL, *realTerms2 = NULL, *imagTerms = NULL, *imagTerms2 = NULL;
128 XLAL_CHECK( ( delta0_int = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
129 XLAL_CHECK( ( delta1_int = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
130 XLAL_CHECK( ( roundedDelta0_int = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
131 XLAL_CHECK( ( sinPiDelta0 = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
132 XLAL_CHECK( ( sinPiDelta1 = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
133 XLAL_CHECK( ( cosPiDelta0 = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
134 XLAL_CHECK( ( cosPiDelta1 = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
135 XLAL_CHECK( ( PiDelta = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
136 XLAL_CHECK( ( realTerms = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
137 XLAL_CHECK( ( realTerms2 = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
138 XLAL_CHECK( ( imagTerms = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
139 XLAL_CHECK( ( imagTerms2 = XLALCreateREAL4VectorAligned( delta0->length, 32 ) ) != NULL, XLAL_EFUNC );
140
141 for ( UINT4 ii = 0; ii < delta0_int->length; ii++ ) {
142 delta0_int->data[ii] = ( REAL4 )( delta0->data[ii] );
143 delta1_int->data[ii] = ( REAL4 )( delta1->data[ii] );
144 }
145
146 XLAL_CHECK( VectorRoundREAL4( roundedDelta0_int, delta0_int, params->vectorMath ) == XLAL_SUCCESS, XLAL_EFUNC );
147 XLAL_CHECK( XLALVectorScaleREAL4( PiDelta->data, ( REAL4 )LAL_PI, delta0_int->data, delta0_int->length ) == XLAL_SUCCESS, XLAL_EFUNC );
148 XLAL_CHECK( XLALVectorSinCosREAL4( sinPiDelta0->data, cosPiDelta0->data, PiDelta->data, PiDelta->length ) == XLAL_SUCCESS, XLAL_EFUNC );
149 XLAL_CHECK( XLALVectorScaleREAL4( PiDelta->data, ( REAL4 )LAL_PI, delta1_int->data, delta1_int->length ) == XLAL_SUCCESS, XLAL_EFUNC );
150 XLAL_CHECK( XLALVectorSinCosREAL4( sinPiDelta1->data, cosPiDelta1->data, PiDelta->data, PiDelta->length ) == XLAL_SUCCESS, XLAL_EFUNC );
151 XLAL_CHECK( XLALVectorMultiplyREAL4( realTerms->data, cosPiDelta1->data, cosPiDelta0->data, cosPiDelta1->length ) == XLAL_SUCCESS, XLAL_EFUNC );
152 XLAL_CHECK( XLALVectorMultiplyREAL4( realTerms2->data, sinPiDelta1->data, sinPiDelta0->data, sinPiDelta1->length ) == XLAL_SUCCESS, XLAL_EFUNC );
153 XLAL_CHECK( XLALVectorAddREAL4( realTerms->data, realTerms->data, realTerms2->data, realTerms->length ) == XLAL_SUCCESS, XLAL_EFUNC );
154 XLAL_CHECK( XLALVectorMultiplyREAL4( imagTerms->data, sinPiDelta1->data, cosPiDelta0->data, sinPiDelta1->length ) == XLAL_SUCCESS, XLAL_EFUNC );
155 XLAL_CHECK( XLALVectorMultiplyREAL4( imagTerms2->data, cosPiDelta1->data, sinPiDelta0->data, cosPiDelta1->length ) == XLAL_SUCCESS, XLAL_EFUNC );
156 XLAL_CHECK( XLALVectorScaleREAL4( imagTerms2->data, -1.0, imagTerms2->data, imagTerms2->length ) == XLAL_SUCCESS, XLAL_EFUNC );
157 XLAL_CHECK( XLALVectorAddREAL4( imagTerms->data, imagTerms->data, imagTerms2->data, imagTerms->length ) == XLAL_SUCCESS, XLAL_EFUNC );
158
159 for ( UINT4 ii = 0; ii < output->length; ii++ ) {
160 if ( fabsf( delta1_int->data[ii] ) < ( REAL4 )1.0e-6 ) {
161 if ( fabsf( delta0_int->data[ii] ) < ( REAL4 )1.0e-6 ) {
162 output->data[ii] = 1.0;
163 } else if ( fabsf( ( REAL4 )( delta0_int->data[ii]*delta0_int->data[ii] - 1.0 ) ) < ( REAL4 )1.0e-6 ) {
164 output->data[ii] = -2.0;
165 } else if ( fabsf( ( REAL4 )( delta0_int->data[ii] - roundedDelta0_int->data[ii] ) ) < ( REAL4 )1.0e-6 ) {
166 output->data[ii] = 0.0;
167 } else {
168 output->data[ii] = -LAL_PI * crectf( cosPiDelta0->data[ii], -sinPiDelta0->data[ii] ) * delta0_int->data[ii] * ( delta0_int->data[ii] * delta0_int->data[ii] - 1.0 ) / sinPiDelta0->data[ii];
169 }
170 } else if ( fabsf( ( REAL4 )( delta1_int->data[ii]*delta1_int->data[ii] - 1.0 ) ) < ( REAL4 )1.0e-6 ) {
171 if ( fabsf( delta0_int->data[ii] ) < ( REAL4 )1.0e-6 ) {
172 output->data[ii] = -0.5;
173 } else if ( fabsf( ( REAL4 )( delta0_int->data[ii]*delta0_int->data[ii] - 1.0 ) ) < ( REAL4 )1.0e-6 ) {
174 output->data[ii] = 1.0;
175 } else if ( fabsf( ( REAL4 )( delta0_int->data[ii] - roundedDelta0_int->data[ii] ) ) < ( REAL4 )1.0e-6 ) {
176 output->data[ii] = 0.0;
177 } else {
178 output->data[ii] = -LAL_PI_2 * crectf( -cosPiDelta0->data[ii], sinPiDelta0->data[ii] ) * delta0_int->data[ii] * ( delta0_int->data[ii] * delta0_int->data[ii] - 1.0 ) / sinPiDelta0->data[ii];
179 }
180 } else if ( fabsf( delta0_int->data[ii] ) < ( REAL4 )1.0e-6 ) {
181 output->data[ii] = -LAL_1_PI * crectf( cosPiDelta1->data[ii], sinPiDelta1->data[ii] ) * sinPiDelta1->data[ii] / ( delta1_int->data[ii] * ( delta1_int->data[ii] * delta1_int->data[ii] - 1.0 ) );
182 } else if ( fabsf( ( REAL4 )( delta0_int->data[ii]*delta0_int->data[ii] - 1.0 ) ) < ( REAL4 )1.0e-6 ) {
183 output->data[ii] = LAL_2_PI * crectf( cosPiDelta1->data[ii], sinPiDelta1->data[ii] ) * sinPiDelta1->data[ii] / ( delta1_int->data[ii] * ( delta1_int->data[ii] * delta1_int->data[ii] - 1.0 ) );
184 } else if ( fabsf( ( REAL4 )( delta0_int->data[ii] - roundedDelta0_int->data[ii] ) ) < ( REAL4 )1.0e-6 ) {
185 output->data[ii] = 0.0;
186 } else {
187 output->data[ii] = scaling->data[ii] * sinPiDelta1->data[ii] / sinPiDelta0->data[ii] * crectf( realTerms->data[ii], imagTerms->data[ii] );
188 }
189 }
190
191 XLALDestroyREAL4VectorAligned( delta0_int );
192 XLALDestroyREAL4VectorAligned( delta1_int );
193 XLALDestroyREAL4VectorAligned( roundedDelta0_int );
194 XLALDestroyREAL4VectorAligned( sinPiDelta0 );
195 XLALDestroyREAL4VectorAligned( sinPiDelta1 );
196 XLALDestroyREAL4VectorAligned( cosPiDelta0 );
197 XLALDestroyREAL4VectorAligned( cosPiDelta1 );
200 XLALDestroyREAL4VectorAligned( realTerms2 );
202 XLALDestroyREAL4VectorAligned( imagTerms2 );
203
204 return XLAL_SUCCESS;
205
206}
207
208/**
209 * \brief Computes a multiplication of two vectors with a stride and initial offset
210 *
211 * Be sure you know what you are doing or this could go wrong (no error checking for speed!)
212 * \param [out] output Pointer to REAL4VectorAligned output
213 * \param [in] input1 Pointer to first REAL4VectorAligned input
214 * \param [in] input2 Pointer to second REAL4VectorAligned input
215 * \param [in] stride1 Skip stride1 number of elements in input1
216 * \param [in] stride2 Skip stride2 number of elements in input2
217 * \param [in] offset1 Start at offset1 number of elements from the beginning of input1
218 * \param [in] offset2 Start at offset2 number of elements from the beginning of input2
219 * \return Status value
220 */
221INT4 fastSSVectorMultiply_with_stride_and_offset( REAL4VectorAligned *output, const REAL4VectorAligned *input1, const REAL4VectorAligned *input2, const INT4 stride1, const INT4 stride2, const INT4 offset1, const INT4 offset2 )
222{
223
224 REAL4 *a, *b, *c;
225 INT4 n;
226
227 a = input1->data + offset1;
228 b = input2->data + offset2;
229 c = output->data;
230 n = output->length;
231
232 while ( n-- > 0 ) {
233 *c = ( *a ) * ( *b );
234 a = a + stride1;
235 b = b + stride2;
236 c++;
237 }
238
239 return XLAL_SUCCESS;
240
241} /* SSVectorMultiply_with_stride_and_offset() */
242
244{
245 XLAL_CHECK( output != NULL && input1 != NULL && input2 != NULL, XLAL_EINVAL );
246 if ( vectorMath == 1 ) {
248 } else if ( vectorMath == 2 ) {
250 } else for ( UINT4 ii = 0; ii < input1->length; ii++ ) {
251 output->data[ii] = input1->data[ii] - input2->data[ii];
252 }
253 return XLAL_SUCCESS;
254}
255
257{
258 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
259 if ( vectorMath == 1 ) {
261 } else if ( vectorMath == 2 ) {
263 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
264 output->data[ii] = input->data[ii] * scale;
265 }
266 return XLAL_SUCCESS;
267}
268
270{
271 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
272 if ( vectorMath == 1 ) {
274 } else if ( vectorMath == 2 ) {
276 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
277 output->data[ii] = input->data[ii] + shift;
278 }
279 return XLAL_SUCCESS;
280}
281
283{
284 XLAL_CHECK( output != NULL && input1 != NULL && input2 != NULL, XLAL_EINVAL );
285 if ( vectorMath == 1 ) {
286 XLAL_CHECK( sseDDVectorSum( output, input1, input2 ) == XLAL_SUCCESS, XLAL_EFUNC );
287 } else if ( vectorMath == 2 ) {
288 XLAL_CHECK( avxDDVectorSum( output, input1, input2 ) == XLAL_SUCCESS, XLAL_EFUNC );
289 } else for ( UINT4 ii = 0; ii < input1->length; ii++ ) {
290 output->data[ii] = input1->data[ii] + input2->data[ii];
291 }
292 return XLAL_SUCCESS;
293}
294
296{
297 XLAL_CHECK( output != NULL && input1 != NULL && input2 != NULL, XLAL_EINVAL );
298 if ( vectorMath == 1 ) {
300 } else if ( vectorMath == 2 ) {
302 } else for ( UINT4 ii = 0; ii < input1->length; ii++ ) {
303 output->data[ii] = input1->data[ii] - input2->data[ii];
304 }
305 return XLAL_SUCCESS;
306}
307
309{
310 XLAL_CHECK( output != NULL && input1 != NULL && input2 != NULL, XLAL_EINVAL );
311 if ( vectorMath == 1 ) {
313 } else if ( vectorMath == 2 ) {
315 } else for ( UINT4 ii = 0; ii < input1->length; ii++ ) {
316 output->data[ii] = input1->data[ii] * input2->data[ii];
317 }
318 return XLAL_SUCCESS;
319}
320
322{
323 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
324 if ( vectorMath == 1 ) {
326 } else if ( vectorMath == 2 ) {
328 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
329 output->data[ii] = 1.0 / input->data[ii];
330 }
331 return XLAL_SUCCESS;
332}
333
335{
336 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
337 if ( vectorMath == 2 ) {
338#ifdef __AVX__
339 _mm256_zeroupper();
340 INT4 roundedvectorlength = ( INT4 )input->length / 4;
341 __m256d *arr, *result;
342 arr = ( __m256d * )( void * )input->data;
343 result = ( __m256d * )( void * )output->data;
344 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
345 *result = _mm256_floor_pd( *arr );
346 arr++;
347 result++;
348 }
349 _mm256_zeroupper();
350 for ( UINT4 ii = 4 * roundedvectorlength; ii < input->length; ii++ ) {
351 output->data[ii] = floor( input->data[ii] );
352 }
353#else
354 ( void )output;
355 ( void )input;
356 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
358#endif
359 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
360 output->data[ii] = floor( input->data[ii] );
361 }
362 return XLAL_SUCCESS;
363}
364
366{
367 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
368 if ( vectorMath == 2 ) {
369#ifdef __AVX__
370 _mm256_zeroupper();
371 INT4 roundedvectorlength = ( INT4 )input->length / 8;
372 __m256 *arr, *result;
373 arr = ( __m256 * )( void * )input->data;
374 result = ( __m256 * )( void * )output->data;
375 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
376 *result = _mm256_round_ps( *arr, _MM_FROUND_TO_NEAREST_INT );
377 arr++;
378 result++;
379 }
380 _mm256_zeroupper();
381 for ( UINT4 ii = 8 * roundedvectorlength; ii < input->length; ii++ ) {
382 output->data[ii] = roundf( input->data[ii] );
383 }
384#else
385 ( void )output;
386 ( void )input;
387 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
389#endif
390 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
391 output->data[ii] = round( input->data[ii] );
392 }
393 return XLAL_SUCCESS;
394}
395
397{
398 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
399 if ( vectorMath == 2 ) {
400#ifdef __AVX__
401 _mm256_zeroupper();
402 INT4 roundedvectorlength = ( INT4 )input->length / 4;
403 __m256d *arr, *result;
404 arr = ( __m256d * )( void * )input->data;
405 result = ( __m256d * )( void * )output->data;
406 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
407 *result = _mm256_round_pd( *arr, _MM_FROUND_TO_NEAREST_INT );
408 arr++;
409 result++;
410 }
411 _mm256_zeroupper();
412 for ( UINT4 ii = 4 * roundedvectorlength; ii < input->length; ii++ ) {
413 output->data[ii] = round( input->data[ii] );
414 }
415#else
416 ( void )output;
417 ( void )input;
418 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
420#endif
421 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
422 output->data[ii] = round( input->data[ii] );
423 }
424 return XLAL_SUCCESS;
425}
426
428{
429 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
430 if ( vectorMath == 2 ) {
431#ifdef __AVX__
432 _mm256_zeroupper();
433 INT4 roundedvectorlength = ( INT4 )input->length / 8;
434 __m256 *arr, *result;
435 __m256i mask = _mm256_set1_epi32( ~0x80000000 );
436 arr = ( __m256 * )( void * )input->data;
437 result = ( __m256 * )( void * )output->data;
438 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
439 *result = _mm256_and_ps( *arr, ( __m256 )mask );
440 arr++;
441 result++;
442 }
443 _mm256_zeroupper();
444 for ( UINT4 ii = 8 * roundedvectorlength; ii < input->length; ii++ ) {
445 output->data[ii] = fabsf( input->data[ii] );
446 }
447#else
448 ( void )output;
449 ( void )input;
450 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
452#endif
453 } else if ( vectorMath == 1 ) {
454#ifdef __SSE2__
455 INT4 roundedvectorlength = ( INT4 )input->length / 4;
456 __m128 *arr, *result;
457 __m128i mask = _mm_set1_epi32( ~0x80000000 );
458 arr = ( __m128 * )( void * )input->data;
459 result = ( __m128 * )( void * )output->data;
460 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
461 *result = _mm_and_ps( *arr, ( __m128 )mask );
462 arr++;
463 result++;
464 }
465 for ( UINT4 ii = 4 * roundedvectorlength; ii < input->length; ii++ ) {
466 output->data[ii] = fabsf( input->data[ii] );
467 }
468#else
469 ( void )output;
470 ( void )input;
471 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
473#endif
474 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
475 output->data[ii] = fabsf( input->data[ii] );
476 }
477 return XLAL_SUCCESS;
478}
479
481{
482 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
483 if ( vectorMath == 2 ) {
484#ifdef __AVX__
485 _mm256_zeroupper();
486 INT4 roundedvectorlength = ( INT4 )input->length / 4;
487 __m256d *arr, *result;
488 __m256i mask = _mm256_set1_epi64x( ~0x8000000000000000 );
489 arr = ( __m256d * )( void * )input->data;
490 result = ( __m256d * )( void * )output->data;
491 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
492 *result = _mm256_and_pd( *arr, ( __m256d )mask );
493 arr++;
494 result++;
495 }
496 _mm256_zeroupper();
497 for ( UINT4 ii = 4 * roundedvectorlength; ii < input->length; ii++ ) {
498 output->data[ii] = fabs( input->data[ii] );
499 }
500#else
501 ( void )output;
502 ( void )input;
503 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
505#endif
506 } else if ( vectorMath == 1 ) {
507#ifdef __SSE2__
508 INT4 roundedvectorlength = ( INT4 )input->length / 2;
509 __m128d *arr, *result;
510 __m128i mask = _mm_set1_epi64x( ~0x8000000000000000 );
511 arr = ( __m128d * )( void * )input->data;
512 result = ( __m128d * )( void * )output->data;
513 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
514 *result = _mm_and_pd( *arr, ( __m128d )mask );
515 arr++;
516 result++;
517 }
518 for ( UINT4 ii = 2 * roundedvectorlength; ii < input->length; ii++ ) {
519 output->data[ii] = fabs( input->data[ii] );
520 }
521#else
522 ( void )output;
523 ( void )input;
524 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
526#endif
527 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
528 output->data[ii] = fabs( input->data[ii] );
529 }
530 return XLAL_SUCCESS;
531}
532
534{
535 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
536 if ( vectorMath == 2 ) {
537#ifdef __AVX__
538 _mm256_zeroupper();
539 INT4 roundedvectorlength = ( INT4 )input->length / 8;
540 __m256 *result = ( __m256 * )( void * )output->data;
541 INT4 position = 0;
542 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
543 __m256 input_re = _mm256_set_ps( ( float )crealf( input->data[position + 7] ), ( float )crealf( input->data[position + 6] ), ( float )crealf( input->data[position + 5] ), ( float )crealf( input->data[position + 4] ), ( float )crealf( input->data[position + 3] ), ( float )crealf( input->data[position + 2] ), ( float )crealf( input->data[position + 1] ), ( float )crealf( input->data[position] ) );
544 __m256 input_im = _mm256_set_ps( ( float )cimagf( input->data[position + 7] ), ( float )cimagf( input->data[position + 6] ), ( float )cimagf( input->data[position + 5] ), ( float )cimagf( input->data[position + 4] ), ( float )cimagf( input->data[position + 3] ), ( float )cimagf( input->data[position + 2] ), ( float )cimagf( input->data[position + 1] ), ( float )cimagf( input->data[position] ) );
545 __m256 input_re_sq = _mm256_mul_ps( input_re, input_re );
546 __m256 input_im_sq = _mm256_mul_ps( input_im, input_im );
547 __m256 sqsum = _mm256_add_ps( input_re_sq, input_im_sq );
548 *result = _mm256_sqrt_ps( sqsum );
549 position += 8;
550 result++;
551 }
552 _mm256_zeroupper();
553 for ( UINT4 ii = 8 * roundedvectorlength; ii < input->length; ii++ ) {
554 output->data[ii] = cabsf( input->data[ii] );
555 }
556#else
557 ( void )output;
558 ( void )input;
559 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
561#endif
562 } else if ( vectorMath == 1 ) {
563#ifdef __SSE2__
564 INT4 roundedvectorlength = ( INT4 )input->length / 4;
565 __m128 *result = ( __m128 * )( void * )output->data;
566 INT4 position = 0;
567 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
568 __m128 input_re = _mm_set_ps( ( float )crealf( input->data[position + 3] ), ( float )crealf( input->data[position + 2] ), ( float )crealf( input->data[position + 1] ), ( float )crealf( input->data[position] ) );
569 __m128 input_im = _mm_set_ps( ( float )cimagf( input->data[position + 3] ), ( float )cimagf( input->data[position + 2] ), ( float )cimagf( input->data[position + 1] ), ( float )cimagf( input->data[position] ) );
570 __m128 input_re_sq = _mm_mul_ps( input_re, input_re );
571 __m128 input_im_sq = _mm_mul_ps( input_im, input_im );
572 __m128 sqsum = _mm_add_ps( input_re_sq, input_im_sq );
573 *result = _mm_sqrt_ps( sqsum );
574 position += 4;
575 result++;
576 }
577 for ( UINT4 ii = 4 * roundedvectorlength; ii < input->length; ii++ ) {
578 output->data[ii] = cabsf( input->data[ii] );
579 }
580#else
581 ( void )output;
582 ( void )input;
583 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
585#endif
586 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
587 output->data[ii] = cabsf( input->data[ii] );
588 }
589 return XLAL_SUCCESS;
590}
592{
593 XLAL_CHECK( output != NULL && input != NULL, XLAL_EINVAL );
594 if ( vectorMath == 2 ) {
595#ifdef __AVX__
596 _mm256_zeroupper();
597 INT4 roundedvectorlength = ( INT4 )input->length / 4;
598 __m256d *result = ( __m256d * )( void * )output->data;
599 INT4 position = 0;
600 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
601 __m256d input_re = _mm256_set_pd( ( double )crealf( input->data[position + 3] ), ( double )crealf( input->data[position + 2] ), ( double )crealf( input->data[position + 1] ), ( double )crealf( input->data[position] ) );
602 __m256d input_im = _mm256_set_pd( ( double )cimagf( input->data[position + 3] ), ( double )cimagf( input->data[position + 2] ), ( double )cimagf( input->data[position + 1] ), ( double )cimagf( input->data[position] ) );
603 __m256d input_re_sq = _mm256_mul_pd( input_re, input_re );
604 __m256d input_im_sq = _mm256_mul_pd( input_im, input_im );
605 __m256d sqsum = _mm256_add_pd( input_re_sq, input_im_sq );
606 *result = _mm256_sqrt_pd( sqsum );
607 position += 4;
608 result++;
609 }
610 _mm256_zeroupper();
611 for ( UINT4 ii = 4 * roundedvectorlength; ii < input->length; ii++ ) {
612 output->data[ii] = cabs( input->data[ii] );
613 }
614#else
615 ( void )output;
616 ( void )input;
617 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
619#endif
620 } else if ( vectorMath == 1 ) {
621#ifdef __SSE2__
622 INT4 roundedvectorlength = ( INT4 )input->length / 2;
623 __m128d *result = ( __m128d * )( void * )output->data;
624 INT4 position = 0;
625 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
626 __m128d input_re = _mm_set_pd( ( double )crealf( input->data[position + 1] ), ( double )crealf( input->data[position] ) );
627 __m128d input_im = _mm_set_pd( ( double )cimagf( input->data[position + 1] ), ( double )cimagf( input->data[position] ) );
628 __m128d input_re_sq = _mm_mul_pd( input_re, input_re );
629 __m128d input_im_sq = _mm_mul_pd( input_im, input_im );
630 __m128d sqsum = _mm_add_pd( input_re_sq, input_im_sq );
631 *result = _mm_sqrt_pd( sqsum );
632 position += 2;
633 result++;
634 }
635 for ( UINT4 ii = 2 * roundedvectorlength; ii < input->length; ii++ ) {
636 output->data[ii] = cabs( input->data[ii] );
637 }
638#else
639 ( void )output;
640 ( void )input;
641 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
643#endif
644 } else for ( UINT4 ii = 0; ii < input->length; ii++ ) {
645 output->data[ii] = cabs( input->data[ii] );
646 }
647 return XLAL_SUCCESS;
648}
649
650/**
651 * Sum two alignedREAL8Vector using SSE2
652 * \param [out] output Pointer to a alignedREAL8Vector
653 * \param [in] input1 Pointer to a alignedREAL8Vector
654 * \param [in] input2 Pointer to a alignedREAL8Vector
655 * \return Status value
656 */
658{
659
660#ifdef __SSE2__
661 INT4 roundedvectorlength = ( INT4 )input1->length / 2;
662
663 __m128d *arr1, *arr2, *result;
664 arr1 = ( __m128d * )( void * )input1->data;
665 arr2 = ( __m128d * )( void * )input2->data;
666 result = ( __m128d * )( void * )output->data;
667
668 //add the two vectors into the output
669 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
670 *result = _mm_add_pd( *arr1, *arr2 );
671 arr1++;
672 arr2++;
673 result++;
674 }
675
676 //Finish up the remaining part
677 for ( UINT4 ii = 2 * roundedvectorlength; ii < input1->length; ii++ ) {
678 output->data[ii] = input1->data[ii] + input2->data[ii];
679 }
680
681 return XLAL_SUCCESS;
682#else
683 ( void )output;
684 ( void )input1;
685 ( void )input2;
686 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
688#endif
689
690}
691
692
693/**
694 * Sum two alignedREAL8Vector using AVX
695 * \param [out] output Pointer to a alignedREAL8Vector
696 * \param [in] input1 Pointer to a alignedREAL8Vector
697 * \param [in] input2 Pointer to a alignedREAL8Vector
698 * \return Status value
699 */
701{
702
703#ifdef __AVX__
704 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
705 _mm256_zeroupper();
706
707 INT4 roundedvectorlength = ( INT4 )input1->length / 4;
708
709 __m256d *arr1, *arr2, *result;
710 arr1 = ( __m256d * )( void * )input1->data;
711 arr2 = ( __m256d * )( void * )input2->data;
712 result = ( __m256d * )( void * )output->data;
713
714 //add the two vectors into the output
715 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
716 *result = _mm256_add_pd( *arr1, *arr2 );
717 arr1++;
718 arr2++;
719 result++;
720 }
721
722 //Finish up the remaining part
723 for ( UINT4 ii = 4 * roundedvectorlength; ii < input1->length; ii++ ) {
724 output->data[ii] = input1->data[ii] + input2->data[ii];
725 }
726
727 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
728 _mm256_zeroupper();
729
730 return XLAL_SUCCESS;
731#else
732 ( void )output;
733 ( void )input1;
734 ( void )input2;
735 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
737#endif
738
739}
740
741/**
742 * Subtract two REAL4VectorAligned using SSE
743 * \param [out] output Pointer to a REAL4VectorAligned
744 * \param [in] input1 Pointer to a REAL4VectorAligned
745 * \param [in] input2 Pointer to a REAL4VectorAligned
746 * \return Status value
747 */
749{
750
751#ifdef __SSE__
752 INT4 roundedvectorlength = ( INT4 )input1->length / 4;
753
754 __m128 *arr1, *arr2, *result;
755 arr1 = ( __m128 * )( void * )input1->data;
756 arr2 = ( __m128 * )( void * )input2->data;
757 result = ( __m128 * )( void * )output->data;
758
759 //subtract the two vectors into the output
760 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
761 *result = _mm_sub_ps( *arr1, *arr2 );
762 arr1++;
763 arr2++;
764 result++;
765 }
766
767 //Finish up the remaining part
768 for ( UINT4 ii = 4 * roundedvectorlength; ii < input1->length; ii++ ) {
769 output->data[ii] = input1->data[ii] - input2->data[ii];
770 }
771
772 return XLAL_SUCCESS;
773#else
774 ( void )output;
775 ( void )input1;
776 ( void )input2;
777 fprintf( stderr, "%s: Failed because SSE is not supported, possibly because -msse flag wasn't used for compiling.\n", __func__ );
779#endif
780
781}
782
783/**
784 * Subtract two REAL4VectorAligned using AVX
785 * \param [out] output Pointer to a REAL4VectorAligned
786 * \param [in] input1 Pointer to a REAL4VectorAligned
787 * \param [in] input2 Pointer to a REAL4VectorAligned
788 * \return Status value
789 */
791{
792
793#ifdef __AVX__
794 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
795 _mm256_zeroupper();
796
797 INT4 roundedvectorlength = ( INT4 )input1->length / 8;
798
799 __m256 *arr1, *arr2, *result;
800 arr1 = ( __m256 * )( void * )input1->data;
801 arr2 = ( __m256 * )( void * )input2->data;
802 result = ( __m256 * )( void * )output->data;
803
804 //subtract the two vectors into the output
805 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
806 *result = _mm256_sub_ps( *arr1, *arr2 );
807 arr1++;
808 arr2++;
809 result++;
810 }
811
812 //Finish up the remaining part
813 for ( UINT4 ii = 8 * roundedvectorlength; ii < input1->length; ii++ ) {
814 output->data[ii] = input1->data[ii] - input2->data[ii];
815 }
816
817 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
818 _mm256_zeroupper();
819
820 return XLAL_SUCCESS;
821#else
822 ( void )output;
823 ( void )input1;
824 ( void )input2;
825 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
827#endif
828
829}
830
831/**
832 * Subtract two alignedREAL8Vector using SSE
833 * \param [out] output Pointer to a alignedREAL8Vector
834 * \param [in] input1 Pointer to a alignedREAL8Vector
835 * \param [in] input2 Pointer to a alignedREAL8Vector
836 * \return Status value
837 */
839{
840
841#ifdef __SSE2__
842 INT4 roundedvectorlength = ( INT4 )input1->length / 2;
843
844 __m128d *arr1, *arr2, *result;
845 arr1 = ( __m128d * )( void * )input1->data;
846 arr2 = ( __m128d * )( void * )input2->data;
847 result = ( __m128d * )( void * )output->data;
848
849 //Subtract the two vectors into the output
850 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
851 *result = _mm_sub_pd( *arr1, *arr2 );
852 arr1++;
853 arr2++;
854 result++;
855 }
856
857 //Finish up the remaining part
858 for ( UINT4 ii = 2 * roundedvectorlength; ii < input1->length; ii++ ) {
859 output->data[ii] = input1->data[ii] - input2->data[ii];
860 }
861
862 return XLAL_SUCCESS;
863#else
864 ( void )output;
865 ( void )input1;
866 ( void )input2;
867 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
869#endif
870
871}
872
873/**
874 * Subtract two alignedREAL8Vector using AVX
875 * \param [out] output Pointer to a alignedREAL8Vector
876 * \param [in] input1 Pointer to a alignedREAL8Vector
877 * \param [in] input2 Pointer to a alignedREAL8Vector
878 * \return Status value
879 */
881{
882
883#ifdef __AVX__
884 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
885 _mm256_zeroupper();
886
887 INT4 roundedvectorlength = ( INT4 )input1->length / 4;
888
889 __m256d *arr1, *arr2, *result;
890 arr1 = ( __m256d * )( void * )input1->data;
891 arr2 = ( __m256d * )( void * )input2->data;
892 result = ( __m256d * )( void * )output->data;
893
894 //Subtract the two vectors into the output
895 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
896 *result = _mm256_sub_pd( *arr1, *arr2 );
897 arr1++;
898 arr2++;
899 result++;
900 }
901
902 //Finish up the remaining part
903 for ( UINT4 ii = 4 * roundedvectorlength; ii < input1->length; ii++ ) {
904 output->data[ii] = input1->data[ii] - input2->data[ii];
905 }
906
907 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
908 _mm256_zeroupper();
909
910 return XLAL_SUCCESS;
911#else
912 ( void )output;
913 ( void )input1;
914 ( void )input2;
915 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
917#endif
918
919}
920
921/**
922 * Multiply two alignedREAL8Vector using SSE
923 * \param [out] output Pointer to a alignedREAL8Vector
924 * \param [in] input1 Pointer to a alignedREAL8Vector
925 * \param [in] input2 Pointer to a alignedREAL8Vector
926 * \return Status value
927 */
929{
930
931#ifdef __SSE2__
932 INT4 roundedvectorlength = ( INT4 )input1->length / 2;
933
934 __m128d *arr1, *arr2, *result;
935 arr1 = ( __m128d * )( void * )input1->data;
936 arr2 = ( __m128d * )( void * )input2->data;
937 result = ( __m128d * )( void * )output->data;
938
939 //multiply the two vectors into the output
940 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
941 *result = _mm_mul_pd( *arr1, *arr2 );
942 arr1++;
943 arr2++;
944 result++;
945 }
946
947 //Finish up the remaining part
948 for ( INT4 ii = 2 * roundedvectorlength; ii < ( INT4 )input1->length; ii++ ) {
949 output->data[ii] = input1->data[ii] * input2->data[ii];
950 }
951
952 return XLAL_SUCCESS;
953#else
954 ( void )output;
955 ( void )input1;
956 ( void )input2;
957 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
959#endif
960
961}
962
963
964/**
965 * Multiply two alignedREAL8Vector using AVX
966 * \param [out] output Pointer to a alignedREAL8Vector
967 * \param [in] input1 Pointer to a alignedREAL8Vector
968 * \param [in] input2 Pointer to a alignedREAL8Vector
969 * \return Status value
970 */
972{
973
974#ifdef __AVX__
975 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
976 _mm256_zeroupper();
977
978 INT4 roundedvectorlength = ( INT4 )input1->length / 4;
979
980 __m256d *arr1, *arr2, *result;
981 arr1 = ( __m256d * )( void * )input1->data;
982 arr2 = ( __m256d * )( void * )input2->data;
983 result = ( __m256d * )( void * )output->data;
984
985 //multiply the two vectors into the output
986 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
987 *result = _mm256_mul_pd( *arr1, *arr2 );
988 arr1++;
989 arr2++;
990 result++;
991 }
992
993 //Finish up the remaining part
994 for ( INT4 ii = 4 * roundedvectorlength; ii < ( INT4 )input1->length; ii++ ) {
995 output->data[ii] = input1->data[ii] * input2->data[ii];
996 }
997
998 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
999 _mm256_zeroupper();
1000
1001 return XLAL_SUCCESS;
1002#else
1003 ( void )output;
1004 ( void )input1;
1005 ( void )input2;
1006 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
1008#endif
1009
1010}
1011
1012/**
1013 * Invert a alignedREAL8Vector using SSE
1014 * \param [out] output Pointer to a alignedREAL8Vector
1015 * \param [in] input1 Pointer to a alignedREAL8Vector
1016 * \return Status value
1017 */
1019{
1020
1021#ifdef __SSE__
1022 INT4 roundedvectorlength = ( INT4 )input1->length / 2;
1023
1024 __m128d *arr1, *result;
1025 arr1 = ( __m128d * )( void * )input1->data;
1026 result = ( __m128d * )( void * )output->data;
1027
1028 __m128d one = _mm_set1_pd( 1.0 );
1029
1030 //Invert the vector
1031 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1032 *result = _mm_div_pd( one, *arr1 );
1033 arr1++;
1034 result++;
1035 }
1036
1037 //Finish up the remaining part
1038 for ( INT4 ii = 2 * roundedvectorlength; ii < ( INT4 )input1->length; ii++ ) {
1039 output->data[ii] = 1.0 / input1->data[ii];
1040 }
1041
1042 return XLAL_SUCCESS;
1043#else
1044 ( void )output;
1045 ( void )input1;
1046 fprintf( stderr, "%s: Failed because SSE is not supported, possibly because -msse flag wasn't used for compiling.\n", __func__ );
1048#endif
1049
1050}
1051
1052
1053/**
1054 * Invert a alignedREAL8Vector using AVX
1055 * \param [out] output Pointer to a alignedREAL8Vector
1056 * \param [in] input1 Pointer to a alignedREAL8Vector
1057 * \return Status value
1058 */
1060{
1061
1062#ifdef __AVX__
1063 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
1064 _mm256_zeroupper();
1065
1066 INT4 roundedvectorlength = ( INT4 )input1->length / 4;
1067
1068 __m256d *arr1, *result;
1069 arr1 = ( __m256d * )( void * )input1->data;
1070 result = ( __m256d * )( void * )output->data;
1071
1072 __m256d one = _mm256_set1_pd( 1.0 );
1073
1074 //Invert the vector
1075 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1076 *result = _mm256_div_pd( one, *arr1 );
1077 arr1++;
1078 result++;
1079 }
1080
1081 //Finish up the remaining part
1082 for ( INT4 ii = 4 * roundedvectorlength; ii < ( INT4 )input1->length; ii++ ) {
1083 output->data[ii] = 1.0 / input1->data[ii];
1084 }
1085
1086 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
1087 _mm256_zeroupper();
1088
1089 return XLAL_SUCCESS;
1090#else
1091 ( void )output;
1092 ( void )input1;
1093 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
1095#endif
1096
1097}
1098
1099/**
1100 * Add a REAL8 scalar value to the elements of a alignedREAL8Vector using SSE
1101 * \param [out] output Pointer to a alignedREAL8Vector
1102 * \param [in] input Pointer to a alignedREAL8Vector
1103 * \param [in] scalar Value to add to the elements of input
1104 * \return Status value
1105 */
1107{
1108
1109#ifdef __SSE2__
1110 INT4 roundedvectorlength = ( INT4 )input->length / 2;
1111
1112 __m128d *arr1, *result;
1113 arr1 = ( __m128d * )( void * )input->data;
1114 result = ( __m128d * )( void * )output->data;
1115
1116 __m128d scalefactor = _mm_set1_pd( scalar );
1117
1118 //Add the value to the vector and put in the output
1119 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1120 *result = _mm_add_pd( *arr1, scalefactor );
1121 arr1++;
1122 result++;
1123 }
1124
1125 //Finish up the remaining part
1126 for ( INT4 ii = 2 * roundedvectorlength; ii < ( INT4 )input->length; ii++ ) {
1127 output->data[ii] = input->data[ii] + scalar;
1128 }
1129
1130 return XLAL_SUCCESS;
1131#else
1132 ( void )output;
1133 ( void )input;
1134 ( void )scalar;
1135 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
1137#endif
1138
1139}
1140
1141
1142/**
1143 * Add a REAL8 scalar value to the elements of a alignedREAL8Vector using AVX
1144 * \param [out] output Pointer to a alignedREAL8Vector
1145 * \param [in] input Pointer to a alignedREAL8Vector
1146 * \param [in] scalar Value to add to the elements of input
1147 * \return Status value
1148 */
1150{
1151
1152#ifdef __AVX__
1153 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
1154 _mm256_zeroupper();
1155
1156 INT4 roundedvectorlength = ( INT4 )input->length / 4;
1157
1158 __m256d *arr1, *result;
1159 arr1 = ( __m256d * )( void * )input->data;
1160 result = ( __m256d * )( void * )output->data;
1161
1162 __m256d scalefactor = _mm256_set1_pd( scalar );
1163
1164 //Add the value to the vector and put in the output
1165 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1166 *result = _mm256_add_pd( *arr1, scalefactor );
1167 arr1++;
1168 result++;
1169 }
1170
1171 //Finish up the remaining part
1172 for ( INT4 ii = 4 * roundedvectorlength; ii < ( INT4 )input->length; ii++ ) {
1173 output->data[ii] = input->data[ii] + scalar;
1174 }
1175
1176 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
1177 _mm256_zeroupper();
1178
1179 return XLAL_SUCCESS;
1180#else
1181 ( void )output;
1182 ( void )input;
1183 ( void )scalar;
1184 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
1186#endif
1187
1188}
1189
1190/**
1191 * Scale the elements of a alignedREAL8Vector by a REAL8 value using SSE
1192 * \param [out] output Pointer to a alignedREAL8Vector
1193 * \param [in] input Pointer to a alignedREAL8Vector
1194 * \param [in] scale Value to scale the elements of input
1195 * \return Status value
1196 */
1198{
1199
1200#ifdef __SSE2__
1201 INT4 roundedvectorlength = ( INT4 )input->length / 2;
1202
1203 __m128d *arr1, *result;
1204 arr1 = ( __m128d * )( void * )input->data;
1205 result = ( __m128d * )( void * )output->data;
1206
1207 __m128d scalefactor = _mm_set1_pd( scale );
1208
1209 //multiply the vector into the output
1210 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1211 *result = _mm_mul_pd( *arr1, scalefactor );
1212 arr1++;
1213 result++;
1214 }
1215
1216 //Finish up the remaining part
1217 for ( UINT4 ii = 2 * roundedvectorlength; ii < input->length; ii++ ) {
1218 output->data[ii] = input->data[ii] * scale;
1219 }
1220
1221 return XLAL_SUCCESS;
1222#else
1223 ( void )output;
1224 ( void )input;
1225 ( void )scale;
1226 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
1228#endif
1229
1230}
1231
1232
1233/**
1234 * Scale the elements of a alignedREAL8Vector by a REAL8 value using AVX
1235 * \param [out] output Pointer to a alignedREAL8Vector
1236 * \param [in] input Pointer to a alignedREAL8Vector
1237 * \param [in] scale Value to scale the elements of input
1238 * \return Status value
1239 */
1241{
1242
1243#ifdef __AVX__
1244 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
1245 _mm256_zeroupper();
1246
1247 INT4 roundedvectorlength = ( INT4 )input->length / 4;
1248
1249 __m256d *arr1, *result;
1250 arr1 = ( __m256d * )( void * )input->data;
1251 result = ( __m256d * )( void * )output->data;
1252
1253 __m256d scalefactor = _mm256_set1_pd( scale );
1254
1255 //multiply the vector into the output
1256 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1257 *result = _mm256_mul_pd( *arr1, scalefactor );
1258 arr1++;
1259 result++;
1260 }
1261
1262 //Finish up the remaining part
1263 for ( UINT4 ii = 4 * roundedvectorlength; ii < input->length; ii++ ) {
1264 output->data[ii] = input->data[ii] * scale;
1265 }
1266
1267 //Need to zero the upper 128 bits in case of an SSE function call after this AVX function
1268 _mm256_zeroupper();
1269
1270 return XLAL_SUCCESS;
1271#else
1272 ( void )output;
1273 ( void )input;
1274 ( void )scale;
1275 fprintf( stderr, "%s: Failed because AVX is not supported, possibly because -mavx flag wasn't used for compiling.\n", __func__ );
1277#endif
1278
1279}
1280
1281/**
1282 * Sum vectors from REAL4VectorAlignedArrays into an output REAL4VectorAlignedArray using SIMD
1283 * \param [out] output Pointer to REAL4VectorAlignedArray
1284 * \param [in] input1 Pointer to REAL4VectorAlignedArray
1285 * \param [in] input2 Pointer to REAL4VectorAlignedArray
1286 * \param [in] vectorpos1 Starting vector index for input1
1287 * \param [in] vectorpos2 Starting vector index for input2
1288 * \param [in] outputvectorpos Starting vector index for output
1289 * \param [in] numvectors Number of vectors to sum, incrementing vectorpos1, vectorpos2, and outputvectorpos by 1 each time
1290 * \return Status value
1291 */
1292INT4 VectorArraySum( REAL4VectorAlignedArray *output, REAL4VectorAlignedArray *input1, REAL4VectorAlignedArray *input2, INT4 vectorpos1, INT4 vectorpos2, INT4 outputvectorpos, INT4 numvectors )
1293{
1294 INT4 vec1 = vectorpos1, vec2 = vectorpos2, outvec = outputvectorpos;
1295 for ( INT4 ii = 0; ii < numvectors; ii++ ) {
1296 XLAL_CHECK( XLALVectorAddREAL4( output->data[outvec]->data, input1->data[vec1]->data, input2->data[vec2]->data, input1->data[vec1]->length ) == XLAL_SUCCESS, XLAL_EFUNC );
1297 vec1++;
1298 vec2++;
1299 outvec++;
1300 }
1301 return XLAL_SUCCESS;
1302}
1303
1304/*
1305 * (Deprecated) Compute from a look up table, the sin and cos of a vector of x values using SSE
1306 * Cannot use this for x values less than 0 or greater than 2.147483648e9
1307 * \param [out] sin2pix_vector Pointer to REAL8Vector of sin(2*pi*x)
1308 * \param [out] cos2pix_vector Pointer to REAL8Vector of cos(2*pi*x)
1309 * \param [in] x Pointer to REAL8Vector
1310 * \return Status value
1311 */
1312/* INT4 sse_sin_cos_2PI_LUT_REAL8Vector(REAL8Vector *sin2pix_vector, REAL8Vector *cos2pix_vector, REAL8Vector *x)
1313{
1314
1315#ifdef __SSE2__
1316 INT4 roundedvectorlength = (INT4)x->length / 2;
1317
1318 static BOOLEAN firstCall = TRUE;
1319 static REAL8 sinVal[LUT_RES+1], cosVal[LUT_RES+1];
1320
1321 // the first time we get called, we set up the lookup-table
1322 if ( firstCall ) {
1323 for (UINT4 k=0; k <= LUT_RES; k++) {
1324 sinVal[k] = sin( LAL_TWOPI * k * OO_LUT_RES );
1325 cosVal[k] = cos( LAL_TWOPI * k * OO_LUT_RES );
1326 }
1327 firstCall = FALSE;
1328 }
1329
1330 REAL8 *allocinput = NULL, *allocoutput1 = NULL, *allocoutput2 = NULL, *alignedinput = NULL, *alignedoutput1 = NULL, *alignedoutput2 = NULL;
1331 INT4 vecaligned = 0, outputaligned1 = 0, outputaligned2 = 0;
1332 __m128d *arr1, *sinresult, *cosresult;
1333
1334 //Allocate memory for aligning input vector if necessary
1335 if ( x->data==(void*)(((uintptr_t)x->data+15) & ~15) ) {
1336 vecaligned = 1;
1337 arr1 = (__m128d*)(void*)x->data;
1338 } else {
1339 XLAL_CHECK( (allocinput = (REAL8*)XLALMalloc(2*roundedvectorlength*sizeof(REAL8) + 15)) != NULL, XLAL_ENOMEM );
1340 alignedinput = (void*)(((uintptr_t)allocinput+15) & ~15);
1341 memcpy(alignedinput, x->data, sizeof(REAL8)*2*roundedvectorlength);
1342 arr1 = (__m128d*)(void*)alignedinput;
1343 }
1344
1345 //Allocate memory for aligning output vector 1 if necessary
1346 if ( sin2pix_vector->data==(void*)(((uintptr_t)sin2pix_vector->data+15) & ~15) ) {
1347 outputaligned1 = 1;
1348 sinresult = (__m128d*)(void*)sin2pix_vector->data;
1349 } else {
1350 XLAL_CHECK( (allocoutput1 = (REAL8*)XLALMalloc(2*roundedvectorlength*sizeof(REAL8) + 15)) != NULL, XLAL_ENOMEM );
1351 alignedoutput1 = (void*)(((uintptr_t)allocoutput1+15) & ~15);
1352 sinresult = (__m128d*)(void*)alignedoutput1;
1353 }
1354
1355 //Allocate memory for aligning output vector 2 if necessary
1356 if ( cos2pix_vector->data==(void*)(((uintptr_t)cos2pix_vector->data+15) & ~15) ) {
1357 outputaligned2 = 1;
1358 cosresult = (__m128d*)(void*)cos2pix_vector->data;
1359 } else {
1360 XLAL_CHECK( (allocoutput2 = (REAL8*)XLALMalloc(2*roundedvectorlength*sizeof(REAL8) + 15)) != NULL, XLAL_ENOMEM );
1361 alignedoutput2 = (void*)(((uintptr_t)allocoutput2+15) & ~15);
1362 cosresult = (__m128d*)(void*)alignedoutput2;
1363 }
1364
1365 INT4 *integerVect = NULL;
1366 XLAL_CHECK( (integerVect = (INT4*)XLALMalloc(2*sizeof(INT4)+15)) != NULL, XLAL_ENOMEM );
1367 INT4 *I0 = (void*)(((uintptr_t)integerVect+15) & ~15);
1368 memset(I0, 0, sizeof(INT4)*2);
1369
1370 __m128d lutresf = _mm_set1_pd(LUT_RES_F);
1371 __m128d onehalf = _mm_set1_pd(0.5);
1372 __m128d oolutres = _mm_set1_pd(OO_LUT_RES);
1373 __m128d twopi = _mm_set1_pd(LAL_TWOPI);
1374
1375 for (INT4 ii=0; ii<roundedvectorlength; ii++) {
1376
1377 //Fractional part of x [0,1)
1378 __m128i xfloor = _mm_cvttpd_epi32(*arr1);
1379 __m128d xfloord = _mm_cvtepi32_pd(xfloor);
1380 __m128d xt = _mm_sub_pd(*arr1, xfloord);
1381
1382 //i0 in [0, LUT_RES]
1383 __m128d d0 = _mm_mul_pd(xt, lutresf);
1384 __m128d d1 = _mm_add_pd(d0, onehalf);
1385 __m128i i0 = _mm_cvttpd_epi32(d1);
1386 __m128d i0d = _mm_cvtepi32_pd(i0);
1387 //__m128i *i_0 = &i0;
1388 __m128i i0copy = i0;
1389
1390 //d and d2
1391 __m128d d_0 = _mm_mul_pd(oolutres, i0d);
1392 __m128d d_1 = _mm_sub_pd(xt, d_0);
1393 __m128d d = _mm_mul_pd(d_1, twopi);
1394 __m128d d2 = _mm_mul_pd(d, d);
1395 d2 = _mm_mul_pd(d2, onehalf);
1396
1397 //I0 = (INT4*)i_0;
1398 I0[0] = _mm_cvtsi128_si32(_mm_srli_si128(i0copy, 0));
1399 I0[1] = _mm_cvtsi128_si32(_mm_srli_si128(i0copy, 4));
1400 __m128d ts = _mm_setr_pd(sinVal[I0[0]], sinVal[I0[1]]);
1401 __m128d tc = _mm_setr_pd(cosVal[I0[0]], cosVal[I0[1]]);
1402
1403 __m128d dtimestc = _mm_mul_pd(d, tc);
1404 __m128d d2timests = _mm_mul_pd(d2, ts);
1405 __m128d dtimests = _mm_mul_pd(d, ts);
1406 __m128d d2timestc = _mm_mul_pd(d2, tc);
1407 __m128d tsplusdtimestc = _mm_add_pd(ts, dtimestc);
1408 __m128d tcminusdtimests = _mm_sub_pd(tc, dtimests);
1409 *sinresult = _mm_sub_pd(tsplusdtimestc, d2timests);
1410 *cosresult = _mm_sub_pd(tcminusdtimests, d2timestc);
1411
1412 arr1++;
1413 sinresult++;
1414 cosresult++;
1415 }
1416
1417 //Copy output aligned memory to non-aligned memory if necessary
1418 if (!outputaligned1) memcpy(sin2pix_vector->data, alignedoutput1, 2*roundedvectorlength*sizeof(REAL8));
1419 if (!outputaligned2) memcpy(cos2pix_vector->data, alignedoutput2, 2*roundedvectorlength*sizeof(REAL8));
1420
1421 //Finish up the remaining part
1422 REAL4 sin2pix = 0.0, cos2pix = 0.0;
1423 for (INT4 ii=2*roundedvectorlength; ii<(INT4)x->length; ii++) {
1424 XLAL_CHECK( XLALSinCos2PiLUT(&sin2pix, &cos2pix, x->data[ii]) == XLAL_SUCCESS, XLAL_EFUNC );
1425 if (sin2pix>1.0) sin2pix = 1.0;
1426 else if (sin2pix<-1.0) sin2pix = -1.0;
1427 if (cos2pix>1.0) cos2pix = 1.0;
1428 else if (cos2pix<-1.0) cos2pix = -1.0;
1429 sin2pix_vector->data[ii] = (REAL8)sin2pix;
1430 cos2pix_vector->data[ii] = (REAL8)cos2pix;
1431 }
1432
1433 //Free memory if necessary
1434 if (!vecaligned) XLALFree(allocinput);
1435 if (!outputaligned1) XLALFree(allocoutput1);
1436 if (!outputaligned2) XLALFree(allocoutput2);
1437
1438 //Free memory
1439 XLALFree(integerVect);
1440
1441 return XLAL_SUCCESS;
1442#else
1443 (void)sin2pix_vector;
1444 (void)cos2pix_vector;
1445 (void)x;
1446 fprintf(stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__);
1447 XLAL_ERROR(XLAL_EFAILED);
1448#endif
1449
1450} */ // sse_sin_cos_2PI_LUT_REAL8Vector()
1451
1452
1453/*
1454 * (Deprecated) Compute from a look up table, the sin and cos of a vector of x values using SSE
1455 * Cannot use this for x values less than 0 or greater than 2.147483648e9
1456 * \param [out] sin2pix_vector Pointer to REAL4Vector of sin(2*pi*x)
1457 * \param [out] cos2pix_vector Pointer to REAL4Vector of cos(2*pi*x)
1458 * \param [in] x Pointer to REAL4Vector
1459 * \return Status value
1460 */
1461/* INT4 sse_sin_cos_2PI_LUT_REAL4Vector(REAL4Vector *sin2pix_vector, REAL4Vector *cos2pix_vector, REAL4Vector *x)
1462{
1463
1464#ifdef __SSE2__
1465 INT4 roundedvectorlength = (INT4)x->length / 4;
1466
1467 static BOOLEAN firstCall = TRUE;
1468 static REAL4 sinVal[LUT_RES+1], cosVal[LUT_RES+1];
1469
1470 // the first time we get called, we set up the lookup-table
1471 if ( firstCall ) {
1472 for (UINT4 k=0; k <= LUT_RES; k++) {
1473 sinVal[k] = (REAL4)sinf( (REAL4)(LAL_TWOPI * k * OO_LUT_RES) );
1474 cosVal[k] = (REAL4)cosf( (REAL4)(LAL_TWOPI * k * OO_LUT_RES) );
1475 }
1476 firstCall = FALSE;
1477 }
1478
1479 REAL4 *allocinput = NULL, *allocoutput1 = NULL, *allocoutput2 = NULL, *alignedinput = NULL, *alignedoutput1 = NULL, *alignedoutput2 = NULL;
1480 INT4 vecaligned = 0, outputaligned1 = 0, outputaligned2 = 0;
1481 __m128 *arr1, *sinresult, *cosresult;
1482
1483 //Allocate memory for aligning input vector if necessary
1484 if ( x->data==(void*)(((uintptr_t)x->data+15) & ~15) ) {
1485 vecaligned = 1;
1486 arr1 = (__m128*)(void*)x->data;
1487 } else {
1488 XLAL_CHECK( (allocinput = (REAL4*)XLALMalloc(4*roundedvectorlength*sizeof(REAL4) + 15)) != NULL, XLAL_ENOMEM );
1489 alignedinput = (void*)(((uintptr_t)allocinput+15) & ~15);
1490 memcpy(alignedinput, x->data, sizeof(REAL4)*4*roundedvectorlength);
1491 arr1 = (__m128*)(void*)alignedinput;
1492 }
1493
1494 //Allocate memory for aligning output vector 1 if necessary
1495 if ( sin2pix_vector->data==(void*)(((uintptr_t)sin2pix_vector->data+15) & ~15) ) {
1496 outputaligned1 = 1;
1497 sinresult = (__m128*)(void*)sin2pix_vector->data;
1498 } else {
1499 XLAL_CHECK( (allocoutput1 = (REAL4*)XLALMalloc(4*roundedvectorlength*sizeof(REAL4) + 15)) != NULL, XLAL_ENOMEM );
1500 alignedoutput1 = (void*)(((uintptr_t)allocoutput1+15) & ~15);
1501 sinresult = (__m128*)(void*)alignedoutput1;
1502 }
1503
1504 //Allocate memory for aligning output vector 2 if necessary
1505 if ( cos2pix_vector->data==(void*)(((uintptr_t)cos2pix_vector->data+15) & ~15) ) {
1506 outputaligned2 = 1;
1507 cosresult = (__m128*)(void*)cos2pix_vector->data;
1508 } else {
1509 XLAL_CHECK( (allocoutput2 = (REAL4*)XLALMalloc(4*roundedvectorlength*sizeof(REAL4) + 15)) != NULL, XLAL_ENOMEM );
1510 alignedoutput2 = (void*)(((uintptr_t)allocoutput2+15) & ~15);
1511 cosresult = (__m128*)(void*)alignedoutput2;
1512 }
1513
1514 INT4 *integerVect = NULL;
1515 XLAL_CHECK( (integerVect = (INT4*)XLALMalloc(4*sizeof(INT4)+15)) != NULL, XLAL_ENOMEM );
1516 INT4 *I0 = (void*)(((uintptr_t)integerVect+15) & ~15);
1517 memset(I0, 0, sizeof(INT4)*4);
1518
1519 __m128 lutresf = _mm_set1_ps((REAL4)LUT_RES_F);
1520 __m128 onehalf = _mm_set1_ps(0.5f);
1521 __m128 oolutres = _mm_set1_ps((REAL4)OO_LUT_RES);
1522 __m128 twopi = _mm_set1_ps((REAL4)LAL_TWOPI);
1523
1524 for (INT4 ii=0; ii<roundedvectorlength; ii++) {
1525
1526 //Fractional part of x [0,1)
1527 __m128i xfloor = _mm_cvttps_epi32(*arr1);
1528 __m128 xfloors = _mm_cvtepi32_ps(xfloor);
1529 __m128 xt = _mm_sub_ps(*arr1, xfloors);
1530
1531 //i0 in [0, LUT_RES]
1532 __m128 d0 = _mm_mul_ps(xt, lutresf);
1533 __m128 d1 = _mm_add_ps(d0, onehalf);
1534 __m128i i0 = _mm_cvttps_epi32(d1);
1535 __m128 i0s = _mm_cvtepi32_ps(i0);
1536 __m128i i0copy = i0;
1537
1538 //d and d2
1539 __m128 d_0 = _mm_mul_ps(oolutres, i0s);
1540 __m128 d_1 = _mm_sub_ps(xt, d_0);
1541 __m128 d = _mm_mul_ps(d_1, twopi);
1542 __m128 d2 = _mm_mul_ps(d, d);
1543 d2 = _mm_mul_ps(d2, onehalf);
1544
1545 //I0 = (INT4*)i_0;
1546 I0[0] = _mm_cvtsi128_si32(_mm_srli_si128(i0copy, 0));
1547 I0[1] = _mm_cvtsi128_si32(_mm_srli_si128(i0copy, 4));
1548 I0[2] = _mm_cvtsi128_si32(_mm_srli_si128(i0copy, 8));
1549 I0[3] = _mm_cvtsi128_si32(_mm_srli_si128(i0copy, 12));
1550 __m128 ts = _mm_setr_ps(sinVal[I0[0]], sinVal[I0[1]], sinVal[I0[2]], sinVal[I0[3]]);
1551 __m128 tc = _mm_setr_ps(cosVal[I0[0]], cosVal[I0[1]], cosVal[I0[2]], cosVal[I0[3]]);
1552
1553 __m128 dtimestc = _mm_mul_ps(d, tc);
1554 __m128 d2timests = _mm_mul_ps(d2, ts);
1555 __m128 dtimests = _mm_mul_ps(d, ts);
1556 __m128 d2timestc = _mm_mul_ps(d2, tc);
1557 __m128 tsplusdtimestc = _mm_add_ps(ts, dtimestc);
1558 __m128 tcminusdtimests = _mm_sub_ps(tc, dtimests);
1559 *sinresult = _mm_sub_ps(tsplusdtimestc, d2timests);
1560 *cosresult = _mm_sub_ps(tcminusdtimests, d2timestc);
1561
1562 arr1++;
1563 sinresult++;
1564 cosresult++;
1565 }
1566
1567 //Copy output aligned memory to non-aligned memory if necessary
1568 if (!outputaligned1) memcpy(sin2pix_vector->data, alignedoutput1, 4*roundedvectorlength*sizeof(REAL4));
1569 if (!outputaligned2) memcpy(cos2pix_vector->data, alignedoutput2, 4*roundedvectorlength*sizeof(REAL4));
1570
1571 //Finish up the remaining part
1572 REAL4 sin2pix = 0.0, cos2pix = 0.0;
1573 for (INT4 ii=4*roundedvectorlength; ii<(INT4)x->length; ii++) {
1574 XLAL_CHECK( XLALSinCos2PiLUT(&(sin2pix), &(cos2pix), x->data[ii]) == XLAL_SUCCESS, XLAL_EFUNC );
1575 if (sin2pix>1.0) sin2pix = 1.0;
1576 else if (sin2pix<-1.0) sin2pix = -1.0;
1577 if (cos2pix>1.0) cos2pix = 1.0;
1578 else if (cos2pix<-1.0) cos2pix = -1.0;
1579 sin2pix_vector->data[ii] = sin2pix;
1580 cos2pix_vector->data[ii] = cos2pix;
1581 }
1582
1583 //Free memory if necessary
1584 if (!vecaligned) XLALFree(allocinput);
1585 if (!outputaligned1) XLALFree(allocoutput1);
1586 if (!outputaligned2) XLALFree(allocoutput2);
1587
1588 //Free memory
1589 XLALFree(integerVect);
1590
1591 return XLAL_SUCCESS;
1592#else
1593 (void)sin2pix_vector;
1594 (void)cos2pix_vector;
1595 (void)x;
1596 fprintf(stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__);
1597 XLAL_ERROR(XLAL_EFAILED);
1598#endif
1599
1600} */ // sse_sin_cos_2PI_LUT_REAL4Vector()
1601
1602
1603/**
1604 * Exponential of input vector is computed using SSE, based on the Cephes library
1605 * \param [out] output Pointer to alignedREAL8Vector
1606 * \param [in] input Pointer to alignedREAL8Vector
1607 * \return Status value
1608 */
1610{
1611
1612#ifdef __SSE2__
1613 INT4 roundedvectorlength = ( INT4 )input->length / 2;
1614
1615 __m128d *x, *result;
1616 x = ( __m128d * )( void * )input->data;
1617 result = ( __m128d * )( void * )output->data;
1618
1619 __m128i expoffset = _mm_set_epi32( 0, 1023, 0, 1023 ); //__m128i expoffset = _mm_set1_epi64x(1023); //Exponent mask for double precision
1620 __m128i maskupper32bits = _mm_set_epi32( 0xffffffff, 0x00000000, 0xffffffff, 0x00000000 ); //__m128i maskupper32bits = _mm_set1_epi64x(0xffffffff00000000); //mask for upper 32 bits
1621 __m128i masklower32bits = _mm_set_epi32( 0x00000000, 0xffffffff, 0x00000000, 0xffffffff ); //__m128i masklower32bits = _mm_set1_epi64x(0x00000000ffffffff); //mask for lower 32 bits
1622 __m128d log2e = _mm_set1_pd( 1.442695040888963 ); //ln(2)
1623 __m128d onehalf = _mm_set1_pd( 0.5 ); //0.5
1624 __m128d one = _mm_set1_pd( 1.0 ); //1.0
1625 __m128d two = _mm_set1_pd( 2.0 ); //2.0
1626 __m128d maxexp = _mm_set1_pd( 7.007827128933840e+02 );
1627 __m128d minexp = _mm_set1_pd( -7.007827128933840e+02 );
1628
1629 __m128d cephes_c1 = _mm_set1_pd( 6.93145751953125e-1 );
1630 __m128d cephes_c2 = _mm_set1_pd( 1.42860682030941723212e-6 );
1631 __m128d cephes_p0 = _mm_set1_pd( 1.26177193074810590878e-4 );
1632 __m128d cephes_p1 = _mm_set1_pd( 3.02994407707441961300e-2 );
1633 __m128d cephes_p2 = _mm_set1_pd( 9.99999999999999999910e-1 );
1634 __m128d cephes_q0 = _mm_set1_pd( 3.00198505138664455042e-6 );
1635 __m128d cephes_q1 = _mm_set1_pd( 2.52448340349684104192e-3 );
1636 __m128d cephes_q2 = _mm_set1_pd( 2.27265548208155028766e-1 );
1637 __m128d cephes_q3 = _mm_set1_pd( 2.00000000000000000009e0 );
1638
1639 for ( INT4 ii = 0; ii < roundedvectorlength; ii++ ) {
1640 __m128d y = *x;
1641 y = _mm_max_pd( y, minexp );
1642 y = _mm_min_pd( y, maxexp );
1643
1644 //Compute x*ln(2)
1645 __m128d log2etimesx = _mm_mul_pd( log2e, y );
1646
1647 //Round
1648 __m128d log2etimesxplushalf = _mm_add_pd( log2etimesx, onehalf );
1649 __m128i log2etimesx_rounded_i = _mm_cvttpd_epi32( log2etimesxplushalf );
1650 __m128d log2etimesx_rounded = _mm_cvtepi32_pd( log2etimesx_rounded_i );
1651 __m128d mask = _mm_cmpgt_pd( log2etimesx_rounded, log2etimesxplushalf );
1652 mask = _mm_and_pd( mask, one );
1653 log2etimesx_rounded = _mm_sub_pd( log2etimesx_rounded, mask );
1654 log2etimesx_rounded_i = _mm_cvttpd_epi32( log2etimesx_rounded );
1655
1656 //multiply and subtract as in the cephes code
1657 __m128d log2etimesx_rounded_times_c1 = _mm_mul_pd( log2etimesx_rounded, cephes_c1 );
1658 __m128d log2etimesx_rounded_times_c2 = _mm_mul_pd( log2etimesx_rounded, cephes_c2 );
1659 y = _mm_sub_pd( y, log2etimesx_rounded_times_c1 );
1660 y = _mm_sub_pd( y, log2etimesx_rounded_times_c2 );
1661
1662 //x**2
1663 __m128d xsq = _mm_mul_pd( y, y );
1664
1665 //Pade approximation with polynomials
1666 //Now the polynomial part 1
1667 __m128d polevlresult = cephes_p0;
1668 polevlresult = _mm_mul_pd( polevlresult, xsq );
1669 polevlresult = _mm_add_pd( polevlresult, cephes_p1 );
1670 polevlresult = _mm_mul_pd( polevlresult, xsq );
1671 polevlresult = _mm_add_pd( polevlresult, cephes_p2 );
1672 __m128d xtimespolresult = _mm_mul_pd( y, polevlresult );
1673
1674 //And polynomial part 2
1675 polevlresult = cephes_q0;
1676 polevlresult = _mm_mul_pd( polevlresult, xsq );
1677 polevlresult = _mm_add_pd( polevlresult, cephes_q1 );
1678 polevlresult = _mm_mul_pd( polevlresult, xsq );
1679 polevlresult = _mm_add_pd( polevlresult, cephes_q2 );
1680 polevlresult = _mm_mul_pd( polevlresult, xsq );
1681 polevlresult = _mm_add_pd( polevlresult, cephes_q3 );
1682 __m128d polevlresult_minus_xtimespolresult = _mm_sub_pd( polevlresult, xtimespolresult );
1683 y = _mm_div_pd( xtimespolresult, polevlresult_minus_xtimespolresult );
1684
1685 //Finishing the pade approximation
1686 y = _mm_mul_pd( y, two );
1687 y = _mm_add_pd( y, one );
1688
1689 //Need to get the integer to a 64 bit format for below even though we don't need that much
1690 //Note that this is still limited to an INT4 size
1691 //It's hard because there is no _mm_srai_epi64 command, so we do it ourselves
1692 __m128i log2etimesx_rounded_i_64bit = _mm_unpacklo_epi32( log2etimesx_rounded_i, log2etimesx_rounded_i ); //Interleve with itself
1693 __m128i maskedlower32 = _mm_and_si128( log2etimesx_rounded_i_64bit, masklower32bits ); //Save lower 32 bits
1694 log2etimesx_rounded_i_64bit = _mm_srai_epi32( log2etimesx_rounded_i_64bit, 32 ); //Shift with the sign bit
1695 log2etimesx_rounded_i_64bit = _mm_and_si128( log2etimesx_rounded_i_64bit, maskupper32bits ); //Discard now useless lower 32 bits
1696 log2etimesx_rounded_i_64bit = _mm_xor_si128( log2etimesx_rounded_i_64bit, maskedlower32 ); //Restore original lower 32 bits
1697
1698 //Now construct 2**n
1699 __m128i log2etimesx_rounded_i_with_offset = _mm_add_epi64( log2etimesx_rounded_i_64bit, expoffset );
1700 log2etimesx_rounded_i_with_offset = _mm_slli_epi64( log2etimesx_rounded_i_with_offset, 52 );
1701 __m128d pow2n = _mm_castsi128_pd( log2etimesx_rounded_i_with_offset );
1702
1703 //And multiply
1704 *result = _mm_mul_pd( y, pow2n );
1705
1706 x++;
1707 result++;
1708
1709 }
1710
1711 //Finish up the remaining part
1712 for ( INT4 ii = 2 * roundedvectorlength; ii < ( INT4 )input->length; ii++ ) {
1713 output->data[ii] = exp( input->data[ii] );
1714 }
1715
1716 return XLAL_SUCCESS;
1717#else
1718 ( void )output;
1719 ( void )input;
1720 fprintf( stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__ );
1722#endif
1723
1724}
1725
1726
1727/*
1728 * (Deprecated) Exponential of input vector is computed using SSE, based on the Cephes library
1729 * \param [out] output Pointer to REAL4Vector
1730 * \param [in] input Pointer to REAL4Vector
1731 * \return Status value
1732 */
1733/* INT4 sse_exp_REAL4Vector(REAL4Vector *output, REAL4Vector *input)
1734{
1735
1736#ifdef __SSE2__
1737 INT4 roundedvectorlength = (INT4)input->length / 4;
1738
1739 REAL4 *allocinput = NULL, *allocoutput = NULL, *alignedinput = NULL, *alignedoutput = NULL;
1740 INT4 vecaligned = 0, outputaligned = 0;
1741
1742 __m128 *x, *result;
1743
1744 //Allocate memory for aligning input vector if necessary
1745 if ( input->data==(void*)(((size_t)input->data+15) & ~15) ) {
1746 vecaligned = 1;
1747 x = (__m128*)(void*)input->data;
1748 } else {
1749 XLAL_CHECK( (allocinput = (REAL4*)XLALMalloc(4*roundedvectorlength*sizeof(REAL4) + 15)) != NULL, XLAL_ENOMEM );
1750 alignedinput = (void*)(((size_t)allocinput+15) & ~15);
1751 memcpy(alignedinput, input->data, sizeof(REAL4)*4*roundedvectorlength);
1752 x = (__m128*)(void*)alignedinput;
1753 }
1754
1755 //Allocate memory for aligning output vector 1 if necessary
1756 if ( output->data==(void*)(((size_t)output->data+15) & ~15) ) {
1757 outputaligned = 1;
1758 result = (__m128*)(void*)output->data;
1759 } else {
1760 XLAL_CHECK( (allocoutput = (REAL4*)XLALMalloc(4*roundedvectorlength*sizeof(REAL4) + 15)) != NULL, XLAL_ENOMEM );
1761 alignedoutput = (void*)(((size_t)allocoutput+15) & ~15);
1762 result = (__m128*)(void*)alignedoutput;
1763 }
1764
1765 __m128i expoffset = _mm_set1_epi32(127); //Exponent mask for single precision
1766 __m128 log2e = _mm_set1_ps(1.442695040888963f); //log2(e) = 1/ln(2)
1767 __m128 onehalf = _mm_set1_ps(0.5f); //0.5
1768 __m128 one = _mm_set1_ps(1.0f); //1.0
1769 __m128 two = _mm_set1_ps(2.0f); //2.0
1770 __m128 maxexp = _mm_set1_ps(88.3762626647949f);
1771 __m128 minexp = _mm_set1_ps(-88.3762626647949f);
1772
1773 __m128 cephes_c1 = _mm_set1_ps(6.93145751953125e-1f);
1774 __m128 cephes_c2 = _mm_set1_ps(1.42860682030941723212e-6f);
1775 __m128 cephes_p0 = _mm_set1_ps(1.26177193074810590878e-4f);
1776 __m128 cephes_p1 = _mm_set1_ps(3.02994407707441961300e-2f);
1777 __m128 cephes_p2 = _mm_set1_ps(9.99999999999999999910e-1f);
1778 __m128 cephes_q0 = _mm_set1_ps(3.00198505138664455042e-6f);
1779 __m128 cephes_q1 = _mm_set1_ps(2.52448340349684104192e-3f);
1780 __m128 cephes_q2 = _mm_set1_ps(2.27265548208155028766e-1f);
1781 __m128 cephes_q3 = _mm_set1_ps(2.00000000000000000009e0f);
1782
1783 for (INT4 ii=0; ii<roundedvectorlength; ii++) {
1784 __m128 y = *x;
1785 y = _mm_max_ps(y, minexp);
1786 y = _mm_min_ps(y, maxexp);
1787
1788 //Compute x*log2(e), the base-2 exponent for exp(x) = 2**(x*log2(e))
1789 __m128 log2etimesx = _mm_mul_ps(log2e, y);
1790
1791 //Round
1792 __m128 log2etimesxplushalf = _mm_add_ps(log2etimesx, onehalf);
1793 __m128i log2etimesx_rounded_i = _mm_cvttps_epi32(log2etimesxplushalf);
1794 __m128 log2etimesx_rounded = _mm_cvtepi32_ps(log2etimesx_rounded_i);
1795 __m128 mask = _mm_cmpgt_ps(log2etimesx_rounded, log2etimesxplushalf);
1796 mask = _mm_and_ps(mask, one);
1797 log2etimesx_rounded = _mm_sub_ps(log2etimesx_rounded, mask);
1798 log2etimesx_rounded_i = _mm_cvttps_epi32(log2etimesx_rounded);
1799
1800 //multiply and subtract as in the cephes code
1801 __m128 log2etimesx_rounded_times_c1 = _mm_mul_ps(log2etimesx_rounded, cephes_c1);
1802 __m128 log2etimesx_rounded_times_c2 = _mm_mul_ps(log2etimesx_rounded, cephes_c2);
1803 y = _mm_sub_ps(y, log2etimesx_rounded_times_c1);
1804 y = _mm_sub_ps(y, log2etimesx_rounded_times_c2);
1805
1806 //x**2
1807 __m128 xsq = _mm_mul_ps(y, y);
1808
1809 //Pade approximation with polynomials
1810 //Now the polynomial part 1
1811 __m128 polevlresult = cephes_p0;
1812 polevlresult = _mm_mul_ps(polevlresult, xsq);
1813 polevlresult = _mm_add_ps(polevlresult, cephes_p1);
1814 polevlresult = _mm_mul_ps(polevlresult, xsq);
1815 polevlresult = _mm_add_ps(polevlresult, cephes_p2);
1816 __m128 xtimespolresult = _mm_mul_ps(y, polevlresult);
1817
1818 //And polynomial part 2
1819 polevlresult = cephes_q0;
1820 polevlresult = _mm_mul_ps(polevlresult, xsq);
1821 polevlresult = _mm_add_ps(polevlresult, cephes_q1);
1822 polevlresult = _mm_mul_ps(polevlresult, xsq);
1823 polevlresult = _mm_add_ps(polevlresult, cephes_q2);
1824 polevlresult = _mm_mul_ps(polevlresult, xsq);
1825 polevlresult = _mm_add_ps(polevlresult, cephes_q3);
1826 __m128 polevlresult_minus_xtimespolresult = _mm_sub_ps(polevlresult, xtimespolresult);
1827 y = _mm_div_ps(xtimespolresult, polevlresult_minus_xtimespolresult);
1828
1829 //Finishing the pade approximation
1830 y = _mm_mul_ps(y, two);
1831 y = _mm_add_ps(y, one);
1832
1833 //Now construct 2**n
1834 __m128i log2etimesx_rounded_i_with_offset = _mm_add_epi32(log2etimesx_rounded_i, expoffset);
1835 log2etimesx_rounded_i_with_offset = _mm_slli_epi32(log2etimesx_rounded_i_with_offset, 23);
1836 __m128 pow2n = _mm_castsi128_ps(log2etimesx_rounded_i_with_offset);
1837
1838 //And multiply
1839 *result = _mm_mul_ps(y, pow2n);
1840
1841 x++;
1842 result++;
1843
1844 }
1845
1846 //Alternative method below. Similar output and errors w.r.t. libm exp()
1847 //__m128 cephes_c1 = _mm_set1_ps(0.693359375);
1848 //__m128 cephes_c2 = _mm_set1_ps(-2.12194440e-4);
1849 //__m128 cephes_p0 = _mm_set1_ps(1.9875691500E-4);
1850 //__m128 cephes_p1 = _mm_set1_ps(1.3981999507E-3);
1851 //__m128 cephes_p2 = _mm_set1_ps(8.3334519073E-3);
1852 //__m128 cephes_p3 = _mm_set1_ps(4.1665795894E-2);
1853 //__m128 cephes_p4 = _mm_set1_ps(1.6666665459E-1);
1854 //__m128 cephes_p5 = _mm_set1_ps(5.0000001201E-1);
1855 //for (INT4 ii=0; ii<roundedvectorlength; ii++) {
1856 //__m128 y = *x;
1857 //y = _mm_max_ps(y, minexp);
1858 //y = _mm_min_ps(y, maxexp);
1859
1860 //Compute x*log2(e)
1861 //__m128 log2etimesx = _mm_mul_ps(log2e, y);
1862
1863 //Round
1864 //__m128 log2etimesxplushalf = _mm_add_ps(log2etimesx, onehalf);
1865 //__m128i log2etimesx_rounded_i = _mm_cvttps_epi32(log2etimesxplushalf);
1866 //__m128 tmpval = _mm_cvtepi32_ps(log2etimesx_rounded_i);
1867 //__m128 mask = _mm_cmpgt_ps(tmpval, log2etimesxplushalf);
1868 //mask = _mm_and_ps(mask, one);
1869 //__m128 log2etimesx_rounded = _mm_sub_ps(tmpval, mask);
1870
1871 //multiply and subtract as in the cephes code
1872 //tmpval = _mm_mul_ps(log2etimesx_rounded, cephes_c1);
1873 //__m128 tmpval2 = _mm_mul_ps(log2etimesx_rounded, cephes_c2);
1874 //y = _mm_sub_ps(y, tmpval);
1875 //y = _mm_sub_ps(y, tmpval2);
1876
1877 //tmpval2 = _mm_mul_ps(y, y);
1878
1879 //Pade approximation with polynomials
1880 //Now the polynomial part 1
1881 //__m128 polevlresult = cephes_p0;
1882 //polevlresult = _mm_mul_ps(polevlresult, y);
1883 //polevlresult = _mm_add_ps(polevlresult, cephes_p1);
1884 //polevlresult = _mm_mul_ps(polevlresult, y);
1885 //polevlresult = _mm_add_ps(polevlresult, cephes_p2);
1886 //polevlresult = _mm_mul_ps(polevlresult, y);
1887 //polevlresult = _mm_add_ps(polevlresult, cephes_p3);
1888 //polevlresult = _mm_mul_ps(polevlresult, y);
1889 //polevlresult = _mm_add_ps(polevlresult, cephes_p4);
1890 //polevlresult = _mm_mul_ps(polevlresult, y);
1891 //polevlresult = _mm_add_ps(polevlresult, cephes_p5);
1892 //polevlresult = _mm_mul_ps(polevlresult, tmpval2);
1893 //polevlresult = _mm_add_ps(polevlresult, y);
1894 //polevlresult = _mm_add_ps(polevlresult, one);
1895
1896 //Now construct 2**n
1897 //log2etimesx_rounded_i = _mm_cvttps_epi32(log2etimesx_rounded);
1898 //__m128i log2etimesx_rounded_i_with_offset = _mm_add_epi32(log2etimesx_rounded_i, expoffset);
1899 //log2etimesx_rounded_i_with_offset = _mm_slli_epi32(log2etimesx_rounded_i_with_offset, 23);
1900 //__m128 pow2n = _mm_castsi128_ps(log2etimesx_rounded_i_with_offset);
1901
1902 //And multiply
1903 // *result = _mm_mul_ps(polevlresult, pow2n);
1904
1905 //x++;
1906 //result++;
1907 //}
1908
1909 //Copy output aligned memory to non-aligned memory if necessary
1910 if (!outputaligned) memcpy(output->data, alignedoutput, 4*roundedvectorlength*sizeof(REAL4));
1911
1912 //Finish up the remaining part
1913 for (INT4 ii=4*roundedvectorlength; ii<(INT4)input->length; ii++) {
1914 output->data[ii] = expf(input->data[ii]);
1915 }
1916
1917 //FILE *EXPVALS = fopen("./output/expvals.dat","w");
1918 //for (ii=0; ii<(INT4)output->length; ii++) {
1919 // fprintf(EXPVALS, "%f\n", output->data[ii]);
1920 //}
1921 //fclose(EXPVALS);
1922
1923 //Free memory if necessary
1924 if (!vecaligned) XLALFree(allocinput);
1925 if (!outputaligned) XLALFree(allocoutput);
1926
1927 return XLAL_SUCCESS;
1928#else
1929 (void)output;
1930 (void)input;
1931 fprintf(stderr, "%s: Failed because SSE2 is not supported, possibly because -msse2 flag wasn't used for compiling.\n", __func__);
1932 XLAL_ERROR(XLAL_EFAILED);
1933#endif
1934
1935} */
#define __func__
log an I/O error, i.e.
const REAL8 scaling[15]
#define c
#define fprintf
const double scale
multiplicative scaling factor of the coordinate
#define LAL_PI_2
#define LAL_2_PI
#define LAL_PI
#define LAL_1_PI
double REAL8
#define crectf(re, im)
uint32_t UINT4
int32_t INT4
float REAL4
void * XLALMalloc(size_t n)
void XLALFree(void *p)
static const INT4 a
int XLALVectorSinCosREAL4(REAL4 *out1, REAL4 *out2, const REAL4 *in, const UINT4 len)
int XLALVectorScaleREAL4(REAL4 *out, REAL4 scalar, const REAL4 *in, const UINT4 len)
void XLALDestroyREAL4VectorAligned(REAL4VectorAligned *in)
REAL4VectorAligned * XLALCreateREAL4VectorAligned(const UINT4 length, const UINT4 align)
int XLALVectorAddREAL4(REAL4 *out, const REAL4 *in1, const REAL4 *in2, const UINT4 len)
int XLALVectorMultiplyREAL4(REAL4 *out, const REAL4 *in1, const REAL4 *in2, const UINT4 len)
#define XLAL_ERROR(...)
#define XLAL_CHECK(assertion,...)
#define XLAL_CHECK_NULL(assertion,...)
XLAL_ENOMEM
XLAL_SUCCESS
XLAL_EFUNC
XLAL_EINVAL
XLAL_EFAILED
list y
n
COMPLEX8 * data
REAL4VectorAligned ** data
alignedREAL8Vector ** data
INT4 avxSSVectorSubtract(REAL4VectorAligned *output, REAL4VectorAligned *input1, REAL4VectorAligned *input2)
Subtract two REAL4VectorAligned using AVX.
Definition: vectormath.c:790
INT4 avxScaleREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scale)
Scale the elements of a alignedREAL8Vector by a REAL8 value using AVX.
Definition: vectormath.c:1240
INT4 sseScaleREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scale)
Scale the elements of a alignedREAL8Vector by a REAL8 value using SSE.
Definition: vectormath.c:1197
INT4 VectorAbsREAL4(REAL4VectorAligned *output, REAL4VectorAligned *input, INT4 vectorMath)
Definition: vectormath.c:427
INT4 VectorShiftREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 shift, INT4 vectorMath)
Definition: vectormath.c:269
void destroyAlignedREAL8Vector(alignedREAL8Vector *vector)
Definition: vectormath.c:67
INT4 avxDDVectorSum(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Sum two alignedREAL8Vector using AVX.
Definition: vectormath.c:700
INT4 DirichletRatioVector(COMPLEX8Vector *output, alignedREAL8Vector *delta0, alignedREAL8Vector *delta1, alignedREAL8Vector *scaling, const UserInput_t *params)
Definition: vectormath.c:122
size_t uintptr_t
Definition: vectormath.c:42
INT4 sseSSVectorSubtract(REAL4VectorAligned *output, REAL4VectorAligned *input1, REAL4VectorAligned *input2)
Subtract two REAL4VectorAligned using SSE.
Definition: vectormath.c:748
INT4 sse_exp_REAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input)
Exponential of input vector is computed using SSE, based on the Cephes library.
Definition: vectormath.c:1609
INT4 VectorCabsfCOMPLEX8(REAL4VectorAligned *output, COMPLEX8Vector *input, INT4 vectorMath)
Definition: vectormath.c:533
INT4 VectorInvertREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, INT4 vectorMath)
Definition: vectormath.c:321
INT4 sseDDVectorSum(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Sum two alignedREAL8Vector using SSE2.
Definition: vectormath.c:657
INT4 VectorRoundREAL4(REAL4VectorAligned *output, REAL4VectorAligned *input, INT4 vectorMath)
Definition: vectormath.c:365
INT4 avxAddScalarToREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scalar)
Add a REAL8 scalar value to the elements of a alignedREAL8Vector using AVX.
Definition: vectormath.c:1149
INT4 avxDDVectorMultiply(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Multiply two alignedREAL8Vector using AVX.
Definition: vectormath.c:971
void destroyAlignedREAL8VectorArray(alignedREAL8VectorArray *array)
Definition: vectormath.c:88
INT4 VectorRoundREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, INT4 vectorMath)
Definition: vectormath.c:396
INT4 VectorArraySum(REAL4VectorAlignedArray *output, REAL4VectorAlignedArray *input1, REAL4VectorAlignedArray *input2, INT4 vectorpos1, INT4 vectorpos2, INT4 outputvectorpos, INT4 numvectors)
Sum vectors from REAL4VectorAlignedArrays into an output REAL4VectorAlignedArray using SIMD.
Definition: vectormath.c:1292
INT4 VectorAddREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2, INT4 vectorMath)
Definition: vectormath.c:282
alignedREAL8VectorArray * createAlignedREAL8VectorArray(const UINT4 length, const UINT4 vectorLength, const size_t align)
Definition: vectormath.c:77
INT4 VectorFloorREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, INT4 vectorMath)
Definition: vectormath.c:334
REAL4VectorAlignedArray * createREAL4VectorAlignedArray(const UINT4 length, const UINT4 vectorLength, const size_t align)
Definition: vectormath.c:99
INT4 VectorScaleREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scale, INT4 vectorMath)
Definition: vectormath.c:256
void destroyREAL4VectorAlignedArray(REAL4VectorAlignedArray *array)
Definition: vectormath.c:110
INT4 avxInvertREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input1)
Invert a alignedREAL8Vector using AVX.
Definition: vectormath.c:1059
INT4 VectorAbsREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input, INT4 vectorMath)
Definition: vectormath.c:480
INT4 avxDDVectorSubtract(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Subtract two alignedREAL8Vector using AVX.
Definition: vectormath.c:880
INT4 fastSSVectorMultiply_with_stride_and_offset(REAL4VectorAligned *output, const REAL4VectorAligned *input1, const REAL4VectorAligned *input2, const INT4 stride1, const INT4 stride2, const INT4 offset1, const INT4 offset2)
Computes a multiplication of two vectors with a stride and initial offset.
Definition: vectormath.c:221
INT4 VectorSubtractREAL4(REAL4VectorAligned *output, REAL4VectorAligned *input1, REAL4VectorAligned *input2, INT4 vectorMath)
Definition: vectormath.c:243
INT4 sseAddScalarToREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input, REAL8 scalar)
Add a REAL8 scalar value to the elements of a alignedREAL8Vector using SSE.
Definition: vectormath.c:1106
INT4 sseDDVectorSubtract(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Subtract two alignedREAL8Vector using SSE.
Definition: vectormath.c:838
alignedREAL8Vector * createAlignedREAL8Vector(UINT4 length, const size_t align)
Definition: vectormath.c:55
INT4 sseDDVectorMultiply(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2)
Multiply two alignedREAL8Vector using SSE.
Definition: vectormath.c:928
INT4 sseInvertREAL8Vector(alignedREAL8Vector *output, alignedREAL8Vector *input1)
Invert a alignedREAL8Vector using SSE.
Definition: vectormath.c:1018
INT4 VectorSubtractREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2, INT4 vectorMath)
Definition: vectormath.c:295
INT4 VectorMultiplyREAL8(alignedREAL8Vector *output, alignedREAL8Vector *input1, alignedREAL8Vector *input2, INT4 vectorMath)
Definition: vectormath.c:308
INT4 VectorCabsCOMPLEX8(alignedREAL8Vector *output, COMPLEX8Vector *input, INT4 vectorMath)
Definition: vectormath.c:591