Akos hotloop Altivec code (Dterms=8).

Detailed Description

Akos hotloop Altivec code (Dterms=8)

/*
 * AltiVec "hot loop" fragment, hand-unrolled into 8 vector steps of 4 floats.
 *
 * This is a bare compound statement, not a function: it uses the names
 * kappa_star, Xalpha_l, lambda_alpha and DTERMS from the enclosing scope and
 * writes its results to realXP, imagXP, realQ and imagQ, none of which are
 * declared here.
 *
 * Outline:
 *   1. look up s_alpha / c_alpha for kappa_star and subtract 1 from c_alpha;
 *   2. accumulate STnV += Xai * (1/pn) over 32 floats starting at Xalpha_l,
 *      where pn starts at {kappa_max, kappa_max, kappa_max-1, kappa_max-1}
 *      and every lane is decremented by 2 before each following step;
 *   3. fold the four lanes into U_alpha (lanes 0+2) and V_alpha (lanes 1+3)
 *      and combine them with s_alpha / c_alpha into realXP / imagXP;
 *   4. look up realQ / imagQ for lambda_alpha.
 *
 * NOTE(review): lanes 0/2 presumably carry real parts and lanes 1/3 imaginary
 * parts of consecutive complex SFT bins -- confirm against the type of Xalpha_l.
 */
{
  {
    REAL4 s_alpha, c_alpha;   /* sin(2pi kappa_alpha) and (cos(2pi kappa_alpha)-1) */
    XLALSinCos2PiLUTtrimmed ( &s_alpha, &c_alpha, kappa_star );
    c_alpha -= 1.0f;          /* turn the looked-up value into (value - 1), as announced above */
    {
      /* view the input at Xalpha_l as a flat array of REAL4 */
      REAL4 *Xalpha_kR4 = (REAL4*)(Xalpha_l);
      /* divisor used for the first two lanes of the first step */
      REAL4 kappa_max = kappa_star + 1.0f * DTERMS - 1.0f;
 
      float STn[4] __attribute__ ((aligned (16)));       /* aligned for vector output */
      /* the vectors actually become registers in the AVUnit */
      vector unsigned char perm;   /* permutation pattern for unaligned memory access */
      vector float load0, load1, load2; /* temp registers for unaligned memory access */
      vector float XaiV   /* xmm3 */;                  /* SFT data loaded from memory */
      vector float STnV   /* xmm1 */;                         /* sums up the dividend */
      vector float V0000             = {0,0,0,0};             /* zero vector constant */
      vector float V2222  /* xmm4 */ = {2,2,2,2};                  /* vector constant */
      /* per-lane divisors: lanes 0,1 share kappa_max, lanes 2,3 share kappa_max - 1 */
      vector float pnV    /* xmm2 */ = {((float)(kappa_max)),
                                        ((float)(kappa_max)),
                                        ((float)(kappa_max - 1)),
                                        ((float)(kappa_max - 1)) };
      /* initially a copy of pnV; overwritten with the reciprocal estimate
         vec_re(pnV) before its first use, so from then on it holds 1/pnV */
      vector float qnV    /* xmm0 */ = pnV;
      /*    this column above (^) lists the corresponding register in the SSE version */
 
      vector float tV;          /* temporary vector used for Newton-Raphson iteration */
 
      /* init the memory access (load0,load1): fetch the chunks at float offsets
         0 and 4 and the permutation pattern used to merge two adjacent chunks */
      load0   = vec_ld  (0,(Xalpha_kR4));
      perm    = vec_lvsl(0,(Xalpha_kR4));
      load1   = vec_ld  (0,(Xalpha_kR4+4));
 
      /* first "iteration" & initialization:
         STnV = XaiV * (1/pnV) + 0, using the plain reciprocal estimate */
      XaiV    = vec_perm(load0,load1,perm);
      qnV     = vec_re(pnV);
      STnV    = vec_madd(XaiV, qnV, V0000);
 
      /* use a reciprocal estimate as a replacement for a division.
         in our case this is only valid for the "outer" elements of the kernel loop

         macro parameters:
           n - offset (in floats) from Xalpha_kR4 of the data consumed by this step
           a - suffix of the load register that already holds the chunk at offset n
           b - suffix of the load register that receives the chunk at offset n+4
         each step first decrements every lane of pnV by 2 */
#define VEC_LOOP_RE(n,a,b)\
      pnV     = vec_sub(pnV,V2222);\
      perm    = vec_lvsl(0,(Xalpha_kR4+(n)));\
      load##b = vec_ld(0,(Xalpha_kR4+(n)+4));\
      XaiV    = vec_perm(load##a,load##b,perm);\
      qnV     = vec_re(pnV);\
      STnV    = vec_madd(XaiV, qnV, STnV);  /* STnV = XaiV * qnV + STnV */
 
      /* refine the reciprocal estimate by a Newton-Raphson iteration.
         re1(x) = re0(x) * (2 - x * re0(x))
         (see http://en.wikipedia.org/wiki/Division_(digital)#Newton-Raphson_division)
         this should give as much precision as a normal float division

         same parameters and load scheme as VEC_LOOP_RE; the three extra
         statements compute tV = qnV * pnV, tV = 2 - tV, qnV = qnV * tV */
#define VEC_LOOP_RE_NR(n,a,b)\
      pnV     = vec_sub(pnV,V2222);\
      perm    = vec_lvsl(0,(Xalpha_kR4+(n)));\
      load##b = vec_ld(0,(Xalpha_kR4+(n)+4));\
      XaiV    = vec_perm(load##a,load##b,perm);\
      qnV     = vec_re(pnV);\
      tV      = vec_madd(qnV,pnV,V0000);\
      tV      = vec_sub(V2222,tV);\
      qnV     = vec_madd(qnV,tV,V0000);\
      STnV    = vec_madd(XaiV, qnV, STnV);
 
      /* actual "hot loop" (unrolled)
         the load registers rotate so that argument "a" of every step is the
         register filled as argument "b" of the previous one.
         offsets 12, 16 and 20 use the refined reciprocal; the first step above
         and offsets 4, 8, 24 and 28 use the plain estimate.
         NOTE(review): the last step issues vec_ld at float offset 32, just past
         the 32 floats consumed here -- confirm that this access is safe for the
         buffer behind Xalpha_l. */
      VEC_LOOP_RE     (4,1,2);
      VEC_LOOP_RE     (8,2,0);
      VEC_LOOP_RE_NR (12,0,1);
      VEC_LOOP_RE_NR (16,1,2);
      VEC_LOOP_RE_NR (20,2,0);
      VEC_LOOP_RE    (24,0,1);
      VEC_LOOP_RE    (28,1,0);
 
      /* output the vector */
      vec_st(STnV,0,STn);
 
      /* combine the sums: even lanes go to U_alpha, odd lanes to V_alpha */
      {
        REAL4 U_alpha = STn[0] + STn[2];
        REAL4 V_alpha = STn[1] + STn[3];
 
        realXP = s_alpha * U_alpha - c_alpha * V_alpha;
        imagXP = c_alpha * U_alpha + s_alpha * V_alpha;
      }
    }
 
    /* look up realQ / imagQ for lambda_alpha, passed through a REAL8 copy.
       NOTE(review): by analogy with the (&s_alpha, &c_alpha) call above the
       first output is presumably the sine and the second the cosine -- confirm
       against the declaration of XLALSinCos2PiLUT. */
    REAL8 _lambda_alpha = lambda_alpha;
    XLALSinCos2PiLUT( &imagQ, &realQ, _lambda_alpha );
  }
}

Definition in file ComputeFstat_DemodHL_Altivec.c.

Go to the source code of this file.

Macros
#define	FUNC XLALComputeFaFb_Altivec

#define	HOTLOOP_SOURCE "ComputeFstat_DemodHL_Altivec.i"

Macro Definition Documentation

◆ FUNC

#define FUNC XLALComputeFaFb_Altivec

Definition at line 40 of file ComputeFstat_DemodHL_Altivec.c.

◆ HOTLOOP_SOURCE

#define HOTLOOP_SOURCE "ComputeFstat_DemodHL_Altivec.i"

Definition at line 41 of file ComputeFstat_DemodHL_Altivec.c.

Detailed Description

Macros

Macro Definition Documentation

◆ FUNC

◆ HOTLOOP_SOURCE