tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/TransformFunctions/arm_cfft_radix4_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_cfft_radix4_q31.c
   9 *
  10 * Description:  This file has function definition of Radix-4 FFT & IFFT function and
  11 *                               In-place bit reversal using bit reversal table
  12 *
  13 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  14 *
  15 * Redistribution and use in source and binary forms, with or without
  16 * modification, are permitted provided that the following conditions
  17 * are met:
  18 *   - Redistributions of source code must retain the above copyright
  19 *     notice, this list of conditions and the following disclaimer.
  20 *   - Redistributions in binary form must reproduce the above copyright
  21 *     notice, this list of conditions and the following disclaimer in
  22 *     the documentation and/or other materials provided with the
  23 *     distribution.
  24 *   - Neither the name of ARM LIMITED nor the names of its contributors
  25 *     may be used to endorse or promote products derived from this
  26 *     software without specific prior written permission.
  27 *
  28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  29 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  30 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  31 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  32 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  33 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  34 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  35 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  36 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  38 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  39 * POSSIBILITY OF SUCH DAMAGE.
  40 * -------------------------------------------------------------------- */
  41
  42 #include "arm_math.h"
  43
  44 void arm_radix4_butterfly_inverse_q31(
  45 q31_t * pSrc,
  46 uint32_t fftLen,
  47 q31_t * pCoef,
  48 uint32_t twidCoefModifier);
  49
  50 void arm_radix4_butterfly_q31(
  51 q31_t * pSrc,
  52 uint32_t fftLen,
  53 q31_t * pCoef,
  54 uint32_t twidCoefModifier);
  55
  56 void arm_bitreversal_q31(
  57 q31_t * pSrc,
  58 uint32_t fftLen,
  59 uint16_t bitRevFactor,
  60 uint16_t * pBitRevTab);
  61
  62 /**
  63  * @ingroup groupTransforms
  64  */
  65
  66 /**
  67  * @addtogroup ComplexFFT
  68  * @{
  69  */
  70
  71 /**
  72  * @details
  73  * @brief Processing function for the Q31 CFFT/CIFFT.
  74  * @param[in]      *S    points to an instance of the Q31 CFFT/CIFFT structure.
  75  * @param[in, out] *pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place.
  76  * @return none.
  77  *
  78  * \par Input and output formats:
  79  * \par
  80  * Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
  81  * Hence the output format is different for different FFT sizes.
  82  * The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
  83  * \par
  84  * \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
  85  * \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
  86  *
  87  */
  88
  89 void arm_cfft_radix4_q31(
  90   const arm_cfft_radix4_instance_q31 * S,
  91   q31_t * pSrc)
  92 {
  93   if(S->ifftFlag == 1u)
  94   {
  95     /* Complex IFFT radix-4 */
  96     arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle,
  97                                      S->twidCoefModifier);
  98   }
  99   else
 100   {
 101     /* Complex FFT radix-4 */
 102     arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle,
 103                              S->twidCoefModifier);
 104   }
 105
 106
 107   if(S->bitReverseFlag == 1u)
 108   {
 109     /*  Bit Reversal */
 110     arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
 111   }
 112
 113 }
 114
 115 /**
 116  * @} end of ComplexFFT group
 117  */
 118
 119 /*
 120 * Radix-4 FFT algorithm used is :
 121 *
 122 * Input real and imaginary data:
 123 * x(n) = xa + j * ya
 124 * x(n+N/4 ) = xb + j * yb
 125 * x(n+N/2 ) = xc + j * yc
 126 * x(n+3N 4) = xd + j * yd
 127 *
 128 *
 129 * Output real and imaginary data:
 130 * x(4r) = xa'+ j * ya'
 131 * x(4r+1) = xb'+ j * yb'
 132 * x(4r+2) = xc'+ j * yc'
 133 * x(4r+3) = xd'+ j * yd'
 134 *
 135 *
 136 * Twiddle factors for radix-4 FFT:
 137 * Wn = co1 + j * (- si1)
 138 * W2n = co2 + j * (- si2)
 139 * W3n = co3 + j * (- si3)
 140 *
 141 *  Butterfly implementation:
 142 * xa' = xa + xb + xc + xd
 143 * ya' = ya + yb + yc + yd
 144 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
 145 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
 146 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
 147 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
 148 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
 149 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
 150 *
 151 */
 152
 153 /**
 154  * @brief  Core function for the Q31 CFFT butterfly process.
 155  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.
 156  * @param[in]      fftLen           length of the FFT.
 157  * @param[in]      *pCoef           points to twiddle coefficient buffer.
 158  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
 159  * @return none.
 160  */
 161
 162 void arm_radix4_butterfly_q31(
 163   q31_t * pSrc,
 164   uint32_t fftLen,
 165   q31_t * pCoef,
 166   uint32_t twidCoefModifier)
 167 {
 168   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
 169   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
 170
 171   q31_t xa, xb, xc, xd;
 172   q31_t ya, yb, yc, yd;
 173   q31_t xa_out, xb_out, xc_out, xd_out;
 174   q31_t ya_out, yb_out, yc_out, yd_out;
 175
 176   q31_t *ptr1;
 177   q63_t xaya, xbyb, xcyc, xdyd;
 178   /* Total process is divided into three stages */
 179
 180   /* process first stage, middle stages, & last stage */
 181
 182
 183   /* start of first stage process */
 184
 185   /*  Initializations for the first stage */
 186   n2 = fftLen;
 187   n1 = n2;
 188   /* n2 = fftLen/4 */
 189   n2 >>= 2u;
 190   i0 = 0u;
 191   ia1 = 0u;
 192
 193   j = n2;
 194
 195   /*  Calculation of first stage */
 196   do
 197   {
 198     /*  index calculation for the input as, */
 199     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
 200     i1 = i0 + n2;
 201     i2 = i1 + n2;
 202     i3 = i2 + n2;
 203
 204     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
 205
 206     /*  Butterfly implementation */
 207     /* xa + xc */
 208     r1 = (pSrc[(2u * i0)] >> 4u) + (pSrc[(2u * i2)] >> 4u);
 209     /* xa - xc */
 210     r2 = (pSrc[2u * i0] >> 4u) - (pSrc[2u * i2] >> 4u);
 211
 212     /* xb + xd */
 213     t1 = (pSrc[2u * i1] >> 4u) + (pSrc[2u * i3] >> 4u);
 214
 215     /* ya + yc */
 216     s1 = (pSrc[(2u * i0) + 1u] >> 4u) + (pSrc[(2u * i2) + 1u] >> 4u);
 217     /* ya - yc */
 218     s2 = (pSrc[(2u * i0) + 1u] >> 4u) - (pSrc[(2u * i2) + 1u] >> 4u);
 219
 220     /* xa' = xa + xb + xc + xd */
 221     pSrc[2u * i0] = (r1 + t1);
 222     /* (xa + xc) - (xb + xd) */
 223     r1 = r1 - t1;
 224     /* yb + yd */
 225     t2 = (pSrc[(2u * i1) + 1u] >> 4u) + (pSrc[(2u * i3) + 1u] >> 4u);
 226
 227     /* ya' = ya + yb + yc + yd */
 228     pSrc[(2u * i0) + 1u] = (s1 + t2);
 229
 230     /* (ya + yc) - (yb + yd) */
 231     s1 = s1 - t2;
 232
 233     /* yb - yd */
 234     t1 = (pSrc[(2u * i1) + 1u] >> 4u) - (pSrc[(2u * i3) + 1u] >> 4u);
 235     /* xb - xd */
 236     t2 = (pSrc[2u * i1] >> 4u) - (pSrc[2u * i3] >> 4u);
 237
 238     /*  index calculation for the coefficients */
 239     ia2 = 2u * ia1;
 240     co2 = pCoef[ia2 * 2u];
 241     si2 = pCoef[(ia2 * 2u) + 1u];
 242
 243     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
 244     pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
 245                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
 246
 247     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
 248     pSrc[(2u * i1) + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
 249                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
 250
 251     /* (xa - xc) + (yb - yd) */
 252     r1 = r2 + t1;
 253     /* (xa - xc) - (yb - yd) */
 254     r2 = r2 - t1;
 255
 256     /* (ya - yc) - (xb - xd) */
 257     s1 = s2 - t2;
 258     /* (ya - yc) + (xb - xd) */
 259     s2 = s2 + t2;
 260
 261     co1 = pCoef[ia1 * 2u];
 262     si1 = pCoef[(ia1 * 2u) + 1u];
 263
 264     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
 265     pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
 266                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
 267
 268     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
 269     pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
 270                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
 271
 272     /*  index calculation for the coefficients */
 273     ia3 = 3u * ia1;
 274     co3 = pCoef[ia3 * 2u];
 275     si3 = pCoef[(ia3 * 2u) + 1u];
 276
 277     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
 278     pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
 279                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
 280
 281     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
 282     pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
 283                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
 284
 285     /*  Twiddle coefficients index modifier */
 286     ia1 = ia1 + twidCoefModifier;
 287
 288     /*  Updating input index */
 289     i0 = i0 + 1u;
 290
 291   } while(--j);
 292
 293   /* end of first stage process */
 294
 295   /* data is in 5.27(q27) format */
 296
 297
 298   /* start of Middle stages process */
 299
 300
 301   /* each stage in middle stages provides two down scaling of the input */
 302
 303   twidCoefModifier <<= 2u;
 304
 305
 306   for (k = fftLen / 4u; k > 4u; k >>= 2u)
 307   {
 308     /*  Initializations for the first stage */
 309     n1 = n2;
 310     n2 >>= 2u;
 311     ia1 = 0u;
 312
 313     /*  Calculation of first stage */
 314     for (j = 0u; j <= (n2 - 1u); j++)
 315     {
 316       /*  index calculation for the coefficients */
 317       ia2 = ia1 + ia1;
 318       ia3 = ia2 + ia1;
 319       co1 = pCoef[ia1 * 2u];
 320       si1 = pCoef[(ia1 * 2u) + 1u];
 321       co2 = pCoef[ia2 * 2u];
 322       si2 = pCoef[(ia2 * 2u) + 1u];
 323       co3 = pCoef[ia3 * 2u];
 324       si3 = pCoef[(ia3 * 2u) + 1u];
 325       /*  Twiddle coefficients index modifier */
 326       ia1 = ia1 + twidCoefModifier;
 327
 328       for (i0 = j; i0 < fftLen; i0 += n1)
 329       {
 330         /*  index calculation for the input as, */
 331         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
 332         i1 = i0 + n2;
 333         i2 = i1 + n2;
 334         i3 = i2 + n2;
 335
 336         /*  Butterfly implementation */
 337         /* xa + xc */
 338         r1 = pSrc[2u * i0] + pSrc[2u * i2];
 339         /* xa - xc */
 340         r2 = pSrc[2u * i0] - pSrc[2u * i2];
 341
 342         /* ya + yc */
 343         s1 = pSrc[(2u * i0) + 1u] + pSrc[(2u * i2) + 1u];
 344         /* ya - yc */
 345         s2 = pSrc[(2u * i0) + 1u] - pSrc[(2u * i2) + 1u];
 346
 347         /* xb + xd */
 348         t1 = pSrc[2u * i1] + pSrc[2u * i3];
 349
 350         /* xa' = xa + xb + xc + xd */
 351         pSrc[2u * i0] = (r1 + t1) >> 2u;
 352         /* xa + xc -(xb + xd) */
 353         r1 = r1 - t1;
 354
 355         /* yb + yd */
 356         t2 = pSrc[(2u * i1) + 1u] + pSrc[(2u * i3) + 1u];
 357         /* ya' = ya + yb + yc + yd */
 358         pSrc[(2u * i0) + 1u] = (s1 + t2) >> 2u;
 359
 360         /* (ya + yc) - (yb + yd) */
 361         s1 = s1 - t2;
 362
 363         /* (yb - yd) */
 364         t1 = pSrc[(2u * i1) + 1u] - pSrc[(2u * i3) + 1u];
 365         /* (xb - xd) */
 366         t2 = pSrc[2u * i1] - pSrc[2u * i3];
 367
 368         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
 369         pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
 370                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1u;
 371
 372         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
 373         pSrc[(2u * i1) + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
 374                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1u;
 375
 376         /* (xa - xc) + (yb - yd) */
 377         r1 = r2 + t1;
 378         /* (xa - xc) - (yb - yd) */
 379         r2 = r2 - t1;
 380
 381         /* (ya - yc) -  (xb - xd) */
 382         s1 = s2 - t2;
 383         /* (ya - yc) +  (xb - xd) */
 384         s2 = s2 + t2;
 385
 386         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
 387         pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
 388                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
 389
 390         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
 391         pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
 392                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
 393
 394         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
 395         pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
 396                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
 397
 398         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
 399         pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
 400                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
 401       }
 402     }
 403     twidCoefModifier <<= 2u;
 404   }
 405
 406   /* End of Middle stages process */
 407
 408   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
 409   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
 410   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
 411   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
 412
 413
 414   /* start of Last stage process */
 415   /*  Initializations for the last stage */
 416   j = fftLen >> 2;
 417   ptr1 = &pSrc[0];
 418
 419   /*  Calculations of last stage */
 420   do
 421   {
 422
 423 #ifndef ARM_MATH_BIG_ENDIAN
 424
 425     /* Read xa (real), ya(imag) input */
 426     xaya = *__SIMD64(ptr1)++;
 427     xa = (q31_t) xaya;
 428     ya = (q31_t) (xaya >> 32);
 429
 430     /* Read xb (real), yb(imag) input */
 431     xbyb = *__SIMD64(ptr1)++;
 432     xb = (q31_t) xbyb;
 433     yb = (q31_t) (xbyb >> 32);
 434
 435     /* Read xc (real), yc(imag) input */
 436     xcyc = *__SIMD64(ptr1)++;
 437     xc = (q31_t) xcyc;
 438     yc = (q31_t) (xcyc >> 32);
 439
 440     /* Read xc (real), yc(imag) input */
 441     xdyd = *__SIMD64(ptr1)++;
 442     xd = (q31_t) xdyd;
 443     yd = (q31_t) (xdyd >> 32);
 444
 445 #else
 446
 447     /* Read xa (real), ya(imag) input */
 448     xaya = *__SIMD64(ptr1)++;
 449     ya = (q31_t) xaya;
 450     xa = (q31_t) (xaya >> 32);
 451
 452     /* Read xb (real), yb(imag) input */
 453     xbyb = *__SIMD64(ptr1)++;
 454     yb = (q31_t) xbyb;
 455     xb = (q31_t) (xbyb >> 32);
 456
 457     /* Read xc (real), yc(imag) input */
 458     xcyc = *__SIMD64(ptr1)++;
 459     yc = (q31_t) xcyc;
 460     xc = (q31_t) (xcyc >> 32);
 461
 462     /* Read xc (real), yc(imag) input */
 463     xdyd = *__SIMD64(ptr1)++;
 464     yd = (q31_t) xdyd;
 465     xd = (q31_t) (xdyd >> 32);
 466
 467
 468 #endif
 469
 470     /* xa' = xa + xb + xc + xd */
 471     xa_out = xa + xb + xc + xd;
 472
 473     /* ya' = ya + yb + yc + yd */
 474     ya_out = ya + yb + yc + yd;
 475
 476     /* pointer updation for writing */
 477     ptr1 = ptr1 - 8u;
 478
 479     /* writing xa' and ya' */
 480     *ptr1++ = xa_out;
 481     *ptr1++ = ya_out;
 482
 483     xc_out = (xa - xb + xc - xd);
 484     yc_out = (ya - yb + yc - yd);
 485
 486     /* writing xc' and yc' */
 487     *ptr1++ = xc_out;
 488     *ptr1++ = yc_out;
 489
 490     xb_out = (xa + yb - xc - yd);
 491     yb_out = (ya - xb - yc + xd);
 492
 493     /* writing xb' and yb' */
 494     *ptr1++ = xb_out;
 495     *ptr1++ = yb_out;
 496
 497     xd_out = (xa - yb - xc + yd);
 498     yd_out = (ya + xb - yc - xd);
 499
 500     /* writing xd' and yd' */
 501     *ptr1++ = xd_out;
 502     *ptr1++ = yd_out;
 503
 504
 505   } while(--j);
 506
 507   /* output is in 11.21(q21) format for the 1024 point */
 508   /* output is in 9.23(q23) format for the 256 point */
 509   /* output is in 7.25(q25) format for the 64 point */
 510   /* output is in 5.27(q27) format for the 16 point */
 511
 512   /* End of last stage process */
 513
 514 }
 515
 516
 517 /**
 518  * @brief  Core function for the Q31 CIFFT butterfly process.
 519  * @param[in, out] *pSrc            points to the in-place buffer of Q31 data type.
 520  * @param[in]      fftLen           length of the FFT.
 521  * @param[in]      *pCoef           points to twiddle coefficient buffer.
 522  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
 523  * @return none.
 524  */
 525
 526
 527 /*
 528 * Radix-4 IFFT algorithm used is :
 529 *
 530 * CIFFT uses same twiddle coefficients as CFFT Function
 531 *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
 532 *
 533 *
 534 * IFFT is implemented with following changes in equations from FFT
 535 *
 536 * Input real and imaginary data:
 537 * x(n) = xa + j * ya
 538 * x(n+N/4 ) = xb + j * yb
 539 * x(n+N/2 ) = xc + j * yc
 540 * x(n+3N 4) = xd + j * yd
 541 *
 542 *
 543 * Output real and imaginary data:
 544 * x(4r) = xa'+ j * ya'
 545 * x(4r+1) = xb'+ j * yb'
 546 * x(4r+2) = xc'+ j * yc'
 547 * x(4r+3) = xd'+ j * yd'
 548 *
 549 *
 550 * Twiddle factors for radix-4 IFFT:
 551 * Wn = co1 + j * (si1)
 552 * W2n = co2 + j * (si2)
 553 * W3n = co3 + j * (si3)
 554
 555 * The real and imaginary output values for the radix-4 butterfly are
 556 * xa' = xa + xb + xc + xd
 557 * ya' = ya + yb + yc + yd
 558 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
 559 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
 560 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
 561 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
 562 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
 563 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
 564 *
 565 */
 566
 567 void arm_radix4_butterfly_inverse_q31(
 568   q31_t * pSrc,
 569   uint32_t fftLen,
 570   q31_t * pCoef,
 571   uint32_t twidCoefModifier)
 572 {
 573   uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
 574   q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
 575   q31_t xa, xb, xc, xd;
 576   q31_t ya, yb, yc, yd;
 577   q31_t xa_out, xb_out, xc_out, xd_out;
 578   q31_t ya_out, yb_out, yc_out, yd_out;
 579
 580   q31_t *ptr1;
 581   q63_t xaya, xbyb, xcyc, xdyd;
 582
 583   /* input is be 1.31(q31) format for all FFT sizes */
 584   /* Total process is divided into three stages */
 585   /* process first stage, middle stages, & last stage */
 586
 587   /* Start of first stage process */
 588
 589   /* Initializations for the first stage */
 590   n2 = fftLen;
 591   n1 = n2;
 592   /* n2 = fftLen/4 */
 593   n2 >>= 2u;
 594   i0 = 0u;
 595   ia1 = 0u;
 596
 597   j = n2;
 598
 599   do
 600   {
 601
 602     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
 603
 604     /*  index calculation for the input as, */
 605     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
 606     i1 = i0 + n2;
 607     i2 = i1 + n2;
 608     i3 = i2 + n2;
 609
 610     /*  Butterfly implementation */
 611     /* xa + xc */
 612     r1 = (pSrc[2u * i0] >> 4u) + (pSrc[2u * i2] >> 4u);
 613     /* xa - xc */
 614     r2 = (pSrc[2u * i0] >> 4u) - (pSrc[2u * i2] >> 4u);
 615
 616     /* xb + xd */
 617     t1 = (pSrc[2u * i1] >> 4u) + (pSrc[2u * i3] >> 4u);
 618
 619     /* ya + yc */
 620     s1 = (pSrc[(2u * i0) + 1u] >> 4u) + (pSrc[(2u * i2) + 1u] >> 4u);
 621     /* ya - yc */
 622     s2 = (pSrc[(2u * i0) + 1u] >> 4u) - (pSrc[(2u * i2) + 1u] >> 4u);
 623
 624     /* xa' = xa + xb + xc + xd */
 625     pSrc[2u * i0] = (r1 + t1);
 626     /* (xa + xc) - (xb + xd) */
 627     r1 = r1 - t1;
 628     /* yb + yd */
 629     t2 = (pSrc[(2u * i1) + 1u] >> 4u) + (pSrc[(2u * i3) + 1u] >> 4u);
 630     /* ya' = ya + yb + yc + yd */
 631     pSrc[(2u * i0) + 1u] = (s1 + t2);
 632
 633     /* (ya + yc) - (yb + yd) */
 634     s1 = s1 - t2;
 635
 636     /* yb - yd */
 637     t1 = (pSrc[(2u * i1) + 1u] >> 4u) - (pSrc[(2u * i3) + 1u] >> 4u);
 638     /* xb - xd */
 639     t2 = (pSrc[2u * i1] >> 4u) - (pSrc[2u * i3] >> 4u);
 640
 641     /*  index calculation for the coefficients */
 642     ia2 = 2u * ia1;
 643     co2 = pCoef[ia2 * 2u];
 644     si2 = pCoef[(ia2 * 2u) + 1u];
 645
 646     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
 647     pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
 648                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1u;
 649
 650     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
 651     pSrc[2u * i1 + 1u] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
 652                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1u;
 653
 654     /* (xa - xc) - (yb - yd) */
 655     r1 = r2 - t1;
 656     /* (xa - xc) + (yb - yd) */
 657     r2 = r2 + t1;
 658
 659     /* (ya - yc) + (xb - xd) */
 660     s1 = s2 + t2;
 661     /* (ya - yc) - (xb - xd) */
 662     s2 = s2 - t2;
 663
 664     co1 = pCoef[ia1 * 2u];
 665     si1 = pCoef[(ia1 * 2u) + 1u];
 666
 667     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
 668     pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
 669                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1u;
 670
 671     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
 672     pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
 673                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1u;
 674
 675     /*  index calculation for the coefficients */
 676     ia3 = 3u * ia1;
 677     co3 = pCoef[ia3 * 2u];
 678     si3 = pCoef[(ia3 * 2u) + 1u];
 679
 680     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
 681     pSrc[2u * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
 682                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1u;
 683
 684     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
 685     pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
 686                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1u;
 687
 688     /*  Twiddle coefficients index modifier */
 689     ia1 = ia1 + twidCoefModifier;
 690
 691     /*  Updating input index */
 692     i0 = i0 + 1u;
 693
 694   } while(--j);
 695
 696   /* data is in 5.27(q27) format */
 697   /* each stage provides two down scaling of the input */
 698
 699
 700   /* Start of Middle stages process */
 701
 702   twidCoefModifier <<= 2u;
 703
 704   /*  Calculation of second stage to excluding last stage */
 705   for (k = fftLen / 4u; k > 4u; k >>= 2u)
 706   {
 707     /*  Initializations for the first stage */
 708     n1 = n2;
 709     n2 >>= 2u;
 710     ia1 = 0u;
 711
 712     for (j = 0; j <= (n2 - 1u); j++)
 713     {
 714       /*  index calculation for the coefficients */
 715       ia2 = ia1 + ia1;
 716       ia3 = ia2 + ia1;
 717       co1 = pCoef[ia1 * 2u];
 718       si1 = pCoef[(ia1 * 2u) + 1u];
 719       co2 = pCoef[ia2 * 2u];
 720       si2 = pCoef[(ia2 * 2u) + 1u];
 721       co3 = pCoef[ia3 * 2u];
 722       si3 = pCoef[(ia3 * 2u) + 1u];
 723       /*  Twiddle coefficients index modifier */
 724       ia1 = ia1 + twidCoefModifier;
 725
 726       for (i0 = j; i0 < fftLen; i0 += n1)
 727       {
 728         /*  index calculation for the input as, */
 729         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2u], pSrc[i0 + 3fftLen/4] */
 730         i1 = i0 + n2;
 731         i2 = i1 + n2;
 732         i3 = i2 + n2;
 733
 734         /*  Butterfly implementation */
 735         /* xa + xc */
 736         r1 = pSrc[2u * i0] + pSrc[2u * i2];
 737         /* xa - xc */
 738         r2 = pSrc[2u * i0] - pSrc[2u * i2];
 739
 740         /* ya + yc */
 741         s1 = pSrc[(2u * i0) + 1u] + pSrc[(2u * i2) + 1u];
 742         /* ya - yc */
 743         s2 = pSrc[(2u * i0) + 1u] - pSrc[(2u * i2) + 1u];
 744
 745         /* xb + xd */
 746         t1 = pSrc[2u * i1] + pSrc[2u * i3];
 747
 748         /* xa' = xa + xb + xc + xd */
 749         pSrc[2u * i0] = (r1 + t1) >> 2u;
 750         /* xa + xc -(xb + xd) */
 751         r1 = r1 - t1;
 752         /* yb + yd */
 753         t2 = pSrc[(2u * i1) + 1u] + pSrc[(2u * i3) + 1u];
 754         /* ya' = ya + yb + yc + yd */
 755         pSrc[(2u * i0) + 1u] = (s1 + t2) >> 2u;
 756
 757         /* (ya + yc) - (yb + yd) */
 758         s1 = s1 - t2;
 759
 760         /* (yb - yd) */
 761         t1 = pSrc[(2u * i1) + 1u] - pSrc[(2u * i3) + 1u];
 762         /* (xb - xd) */
 763         t2 = pSrc[2u * i1] - pSrc[2u * i3];
 764
 765         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
 766         pSrc[2u * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32u)) -
 767                          ((int32_t) (((q63_t) s1 * si2) >> 32u))) >> 1u;
 768
 769         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
 770         pSrc[(2u * i1) + 1u] =
 771           (((int32_t) (((q63_t) s1 * co2) >> 32u)) +
 772            ((int32_t) (((q63_t) r1 * si2) >> 32u))) >> 1u;
 773
 774         /* (xa - xc) - (yb - yd) */
 775         r1 = r2 - t1;
 776         /* (xa - xc) + (yb - yd) */
 777         r2 = r2 + t1;
 778
 779         /* (ya - yc) +  (xb - xd) */
 780         s1 = s2 + t2;
 781         /* (ya - yc) -  (xb - xd) */
 782         s2 = s2 - t2;
 783
 784         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
 785         pSrc[2u * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
 786                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1u;
 787
 788         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
 789         pSrc[(2u * i2) + 1u] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
 790                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1u;
 791
 792         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
 793         pSrc[(2u * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
 794                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1u;
 795
 796         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
 797         pSrc[(2u * i3) + 1u] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
 798                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1u;
 799       }
 800     }
 801     twidCoefModifier <<= 2u;
 802   }
 803
 804   /* End of Middle stages process */
 805
 806   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
 807   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
 808   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
 809   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
 810
 811
 812   /* Start of last stage process */
 813
 814
 815   /*  Initializations for the last stage */
 816   j = fftLen >> 2;
 817   ptr1 = &pSrc[0];
 818
 819   /*  Calculations of last stage */
 820   do
 821   {
 822 #ifndef ARM_MATH_BIG_ENDIAN
 823     /* Read xa (real), ya(imag) input */
 824     xaya = *__SIMD64(ptr1)++;
 825     xa = (q31_t) xaya;
 826     ya = (q31_t) (xaya >> 32);
 827
 828     /* Read xb (real), yb(imag) input */
 829     xbyb = *__SIMD64(ptr1)++;
 830     xb = (q31_t) xbyb;
 831     yb = (q31_t) (xbyb >> 32);
 832
 833     /* Read xc (real), yc(imag) input */
 834     xcyc = *__SIMD64(ptr1)++;
 835     xc = (q31_t) xcyc;
 836     yc = (q31_t) (xcyc >> 32);
 837
 838     /* Read xc (real), yc(imag) input */
 839     xdyd = *__SIMD64(ptr1)++;
 840     xd = (q31_t) xdyd;
 841     yd = (q31_t) (xdyd >> 32);
 842
 843 #else
 844
 845     /* Read xa (real), ya(imag) input */
 846     xaya = *__SIMD64(ptr1)++;
 847     ya = (q31_t) xaya;
 848     xa = (q31_t) (xaya >> 32);
 849
 850     /* Read xb (real), yb(imag) input */
 851     xbyb = *__SIMD64(ptr1)++;
 852     yb = (q31_t) xbyb;
 853     xb = (q31_t) (xbyb >> 32);
 854
 855     /* Read xc (real), yc(imag) input */
 856     xcyc = *__SIMD64(ptr1)++;
 857     yc = (q31_t) xcyc;
 858     xc = (q31_t) (xcyc >> 32);
 859
 860     /* Read xc (real), yc(imag) input */
 861     xdyd = *__SIMD64(ptr1)++;
 862     yd = (q31_t) xdyd;
 863     xd = (q31_t) (xdyd >> 32);
 864
 865
 866 #endif
 867
 868     /* xa' = xa + xb + xc + xd */
 869     xa_out = xa + xb + xc + xd;
 870
 871     /* ya' = ya + yb + yc + yd */
 872     ya_out = ya + yb + yc + yd;
 873
 874     /* pointer updation for writing */
 875     ptr1 = ptr1 - 8u;
 876
 877     /* writing xa' and ya' */
 878     *ptr1++ = xa_out;
 879     *ptr1++ = ya_out;
 880
 881     xc_out = (xa - xb + xc - xd);
 882     yc_out = (ya - yb + yc - yd);
 883
 884     /* writing xc' and yc' */
 885     *ptr1++ = xc_out;
 886     *ptr1++ = yc_out;
 887
 888     xb_out = (xa - yb - xc + yd);
 889     yb_out = (ya + xb - yc - xd);
 890
 891     /* writing xb' and yb' */
 892     *ptr1++ = xb_out;
 893     *ptr1++ = yb_out;
 894
 895     xd_out = (xa + yb - xc - yd);
 896     yd_out = (ya - xb - yc + xd);
 897
 898     /* writing xd' and yd' */
 899     *ptr1++ = xd_out;
 900     *ptr1++ = yd_out;
 901
 902
 903   } while(--j);
 904
 905   /* output is in 11.21(q21) format for the 1024 point */
 906   /* output is in 9.23(q23) format for the 256 point */
 907   /* output is in 7.25(q25) format for the 64 point */
 908   /* output is in 5.27(q27) format for the 16 point */
 909
 910   /* End of last stage process */
 911 }