tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/TransformFunctions/arm_cfft_radix4_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_cfft_radix4_q15.c
   9 *
  10 * Description:  This file has function definition of Radix-4 FFT & IFFT function and
  11 *                               In-place bit reversal using bit reversal table
  12 *
  13 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  14 *
  15 * Redistribution and use in source and binary forms, with or without
  16 * modification, are permitted provided that the following conditions
  17 * are met:
  18 *   - Redistributions of source code must retain the above copyright
  19 *     notice, this list of conditions and the following disclaimer.
  20 *   - Redistributions in binary form must reproduce the above copyright
  21 *     notice, this list of conditions and the following disclaimer in
  22 *     the documentation and/or other materials provided with the
  23 *     distribution.
  24 *   - Neither the name of ARM LIMITED nor the names of its contributors
  25 *     may be used to endorse or promote products derived from this
  26 *     software without specific prior written permission.
  27 *
  28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  29 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  30 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  31 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  32 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  33 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  34 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  35 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  36 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  38 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  39 * POSSIBILITY OF SUCH DAMAGE.
  40 * -------------------------------------------------------------------- */
  41
  42 #include "arm_math.h"
  43
  44
  45 void arm_radix4_butterfly_q15(            /* Q15 CFFT butterfly core; defined further down in this file */
  46   q15_t * pSrc16,                        /* in-place buffer of Q15 data */
  47   uint32_t fftLen,                       /* length of the FFT */
  48   q15_t * pCoef16,                       /* twiddle coefficient buffer */
  49   uint32_t twidCoefModifier);            /* twiddle coefficient modifier; lets different FFT sizes share one table */
  50
  51 void arm_radix4_butterfly_inverse_q15(    /* inverse (CIFFT) butterfly; called when S->ifftFlag == 1 */
  52   q15_t * pSrc16,                        /* receives pSrc from arm_cfft_radix4_q15() */
  53   uint32_t fftLen,                       /* receives S->fftLen */
  54   q15_t * pCoef16,                       /* receives S->pTwiddle */
  55   uint32_t twidCoefModifier);            /* receives S->twidCoefModifier */
  56
  57 void arm_bitreversal_q15(                 /* bit reversal step; called when S->bitReverseFlag == 1 */
  58   q15_t * pSrc,                          /* receives the same buffer the butterfly pass worked on */
  59   uint32_t fftLen,                       /* receives S->fftLen */
  60   uint16_t bitRevFactor,                 /* receives S->bitRevFactor */
  61   uint16_t * pBitRevTab);                /* receives S->pBitRevTable */
  62
  63 /**
  64  * @ingroup groupTransforms
  65  */
  66
  67 /**
  68  * @addtogroup ComplexFFT
  69  * @{
  70  */
  71
  72
  73 /**
  74  * @details
  75  * @brief Processing function for the Q15 CFFT/CIFFT.
  76  * @param[in]      *S    points to an instance of the Q15 CFFT/CIFFT structure.
  77  * @param[in, out] *pSrc points to the complex data buffer. Processing occurs in-place.
  78  * @return none.
  79  *
  80  * \par Input and output formats:
  81  * \par
  82  * Internally, the input is downscaled by 2 at every stage to avoid saturation inside the CFFT/CIFFT process.
  83  * Hence the output format differs between FFT sizes.
  84  * The input and output formats for the different FFT sizes, and the number of bits to upscale, are listed in the tables below for CFFT and CIFFT:
  85  * \par
  86  * \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
  87  * \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
  88  */
  89
  90 void arm_cfft_radix4_q15(
  91   const arm_cfft_radix4_instance_q15 * S,
  92   q15_t * pSrc)
  93 {
  94   /* Butterfly pass, in place on pSrc: the inverse variant runs only when ifftFlag is 1 */
  95   if(S->ifftFlag != 1u)
  96   {
  97     arm_radix4_butterfly_q15(
  98       pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  99   }
 100   else
 101   {
 102     arm_radix4_butterfly_inverse_q15(
 103       pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
 104   }
 105
 106   /* Bit reversal is skipped unless bitReverseFlag is exactly 1 */
 107   if(S->bitReverseFlag != 1u)
 108   {
 109     return;
 110   }
 111
 112   arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
 113 }
 114
 115 /**
 116  * @} end of ComplexFFT group
 117  */
 118
 119 /*
 120 * Radix-4 FFT algorithm used is :
 121 *
 122 * Input real and imaginary data:
 123 * x(n) = xa + j * ya
 124 * x(n+N/4 ) = xb + j * yb
 125 * x(n+N/2 ) = xc + j * yc
 126 * x(n+3N/4) = xd + j * yd
 127 *
 128 *
 129 * Output real and imaginary data:
 130 * x(4r) = xa'+ j * ya'
 131 * x(4r+1) = xb'+ j * yb'
 132 * x(4r+2) = xc'+ j * yc'
 133 * x(4r+3) = xd'+ j * yd'
 134 *
 135 *
 136 * Twiddle factors for radix-4 FFT:
 137 * Wn = co1 + j * (- si1)
 138 * W2n = co2 + j * (- si2)
 139 * W3n = co3 + j * (- si3)
 140
 141 * The real and imaginary output values for the radix-4 butterfly are
 142 * xa' = xa + xb + xc + xd
 143 * ya' = ya + yb + yc + yd
 144 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
 145 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
 146 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
 147 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
 148 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
 149 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
 150 *
 151 */
 152
 153 /**
 154  * @brief  Core function for the Q15 CFFT butterfly process.
 155  * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
 156  * @param[in]      fftLen           length of the FFT.
 157  * @param[in]      *pCoef16         points to twiddle coefficient buffer.
 158  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
 159  * @return none.
 160  */
 161
 162 void arm_radix4_butterfly_q15(
 163   q15_t * pSrc16,
 164   uint32_t fftLen,
 165   q15_t * pCoef16,
 166   uint32_t twidCoefModifier)
 167 {
 168
 169 #ifndef ARM_MATH_CM0_FAMILY
 170
 171   /* Run the below code for Cortex-M4 and Cortex-M3 */
 172
 173   q31_t R, S, T, U;
 174   q31_t C1, C2, C3, out1, out2;
 175   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
 176   q15_t in;
 177
 178   q15_t *ptr1;
 179
 180
 181
 182   q31_t xaya, xbyb, xcyc, xdyd;
 183
 184   /* Total process is divided into three stages */
 185
 186   /* process first stage, middle stages, & last stage */
 187
 188   /*  Initializations for the first stage */
 189   n2 = fftLen;
 190   n1 = n2;
 191
 192   /* n2 = fftLen/4 */
 193   n2 >>= 2u;
 194
 195   /* Index for twiddle coefficient */
 196   ic = 0u;
 197
 198   /* Index for input read and output write */
 199   i0 = 0u;
 200   j = n2;
 201
 202   /* Input is in 1.15(q15) format */
 203
 204   /*  start of first stage process */
 205   do
 206   {
 207     /*  Butterfly implementation */
 208
 209     /*  index calculation for the input as, */
 210     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
 211     i1 = i0 + n2;
 212     i2 = i1 + n2;
 213     i3 = i2 + n2;
 214
 215     /*  Reading i0, i0+fftLen/2 inputs */
 216     /* Read ya (real), xa(imag) input */
 217     T = _SIMD32_OFFSET(pSrc16 + (2u * i0));
 218     in = ((int16_t) (T & 0xFFFF)) >> 2;
 219     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
 220
 221     /* Read yc (real), xc(imag) input */
 222     S = _SIMD32_OFFSET(pSrc16 + (2u * i2));
 223     in = ((int16_t) (S & 0xFFFF)) >> 2;
 224     S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF);
 225
 226     /* R = packed((ya + yc), (xa + xc) ) */
 227     R = __QADD16(T, S);
 228
 229     /* S = packed((ya - yc), (xa - xc) ) */
 230     S = __QSUB16(T, S);
 231
 232     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
 233     /* Read yb (real), xb(imag) input */
 234     T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
 235     in = ((int16_t) (T & 0xFFFF)) >> 2;
 236     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
 237
 238     /* Read yd (real), xd(imag) input */
 239     U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
 240     in = ((int16_t) (U & 0xFFFF)) >> 2;
 241     U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF);
 242
 243     /* T = packed((yb + yd), (xb + xd) ) */
 244     T = __QADD16(T, U);
 245
 246     /*  writing the butterfly processed i0 sample */
 247     /* xa' = xa + xb + xc + xd */
 248     /* ya' = ya + yb + yc + yd */
 249     _SIMD32_OFFSET(pSrc16 + (2u * i0)) = __SHADD16(R, T);
 250
 251     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
 252     R = __QSUB16(R, T);
 253
 254     /* co2 & si2 are read from SIMD Coefficient pointer */
 255     C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
 256
 257 #ifndef ARM_MATH_BIG_ENDIAN
 258
 259     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 260     out1 = __SMUAD(C2, R) >> 16u;
 261     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 262     out2 = __SMUSDX(C2, R);
 263
 264 #else
 265
 266     /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 267     out1 = __SMUSDX(R, C2) >> 16u;
 268     /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 269     out2 = __SMUAD(C2, R);
 270
 271 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 272
 273     /*  Reading i0+fftLen/4 */
 274     /* T = packed(yb, xb) */
 275     T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
 276     in = ((int16_t) (T & 0xFFFF)) >> 2;
 277     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
 278
 279     /* writing the butterfly processed i0 + fftLen/4 sample */
 280     /* writing output(xc', yc') in little endian format */
 281     _SIMD32_OFFSET(pSrc16 + (2u * i1)) =
 282       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 283
 284     /*  Butterfly calculations */
 285     /* U = packed(yd, xd) */
 286     U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
 287     in = ((int16_t) (U & 0xFFFF)) >> 2;
 288     U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF);
 289
 290     /* T = packed(yb-yd, xb-xd) */
 291     T = __QSUB16(T, U);
 292
 293 #ifndef ARM_MATH_BIG_ENDIAN
 294
 295     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
 296     R = __QASX(S, T);
 297     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
 298     S = __QSAX(S, T);
 299
 300 #else
 301
 302     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
 303     R = __QSAX(S, T);
 304     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
 305     S = __QASX(S, T);
 306
 307 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 308
 309     /* co1 & si1 are read from SIMD Coefficient pointer */
 310     C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
 311     /*  Butterfly process for the i0+fftLen/2 sample */
 312
 313 #ifndef ARM_MATH_BIG_ENDIAN
 314
 315     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 316     out1 = __SMUAD(C1, S) >> 16u;
 317     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 318     out2 = __SMUSDX(C1, S);
 319
 320 #else
 321
 322     /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 323     out1 = __SMUSDX(S, C1) >> 16u;
 324     /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 325     out2 = __SMUAD(C1, S);
 326
 327 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 328
 329     /* writing output(xb', yb') in little endian format */
 330     _SIMD32_OFFSET(pSrc16 + (2u * i2)) =
 331       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
 332
 333
 334     /* co3 & si3 are read from SIMD Coefficient pointer */
 335     C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
 336     /*  Butterfly process for the i0+3fftLen/4 sample */
 337
 338 #ifndef ARM_MATH_BIG_ENDIAN
 339
 340     /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
 341     out1 = __SMUAD(C3, R) >> 16u;
 342     /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
 343     out2 = __SMUSDX(C3, R);
 344
 345 #else
 346
 347     /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
 348     out1 = __SMUSDX(R, C3) >> 16u;
 349     /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
 350     out2 = __SMUAD(C3, R);
 351
 352 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 353
 354     /* writing output(xd', yd') in little endian format */
 355     _SIMD32_OFFSET(pSrc16 + (2u * i3)) =
 356       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 357
 358     /*  Twiddle coefficients index modifier */
 359     ic = ic + twidCoefModifier;
 360
 361     /*  Updating input index */
 362     i0 = i0 + 1u;
 363
 364   } while(--j);
 365   /* data is in 4.11(q11) format */
 366
 367   /* end of first stage process */
 368
 369
 370   /* start of middle stage process */
 371
 372   /*  Twiddle coefficients index modifier */
 373   twidCoefModifier <<= 2u;
 374
 375   /*  Calculation of Middle stage */
 376   for (k = fftLen / 4u; k > 4u; k >>= 2u)
 377   {
 378     /*  Initializations for the middle stage */
 379     n1 = n2;
 380     n2 >>= 2u;
 381     ic = 0u;
 382
 383     for (j = 0u; j <= (n2 - 1u); j++)
 384     {
 385       /*  index calculation for the coefficients */
 386       C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
 387       C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
 388       C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
 389
 390       /*  Twiddle coefficients index modifier */
 391       ic = ic + twidCoefModifier;
 392
 393       /*  Butterfly implementation */
 394       for (i0 = j; i0 < fftLen; i0 += n1)
 395       {
 396         /*  index calculation for the input as, */
 397         /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
 398         i1 = i0 + n2;
 399         i2 = i1 + n2;
 400         i3 = i2 + n2;
 401
 402         /*  Reading i0, i0+fftLen/2 inputs */
 403         /* Read ya (real), xa(imag) input */
 404         T = _SIMD32_OFFSET(pSrc16 + (2u * i0));
 405
 406         /* Read yc (real), xc(imag) input */
 407         S = _SIMD32_OFFSET(pSrc16 + (2u * i2));
 408
 409         /* R = packed( (ya + yc), (xa + xc)) */
 410         R = __QADD16(T, S);
 411
 412         /* S = packed((ya - yc), (xa - xc)) */
 413         S = __QSUB16(T, S);
 414
 415         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
 416         /* Read yb (real), xb(imag) input */
 417         T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
 418
 419         /* Read yd (real), xd(imag) input */
 420         U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
 421
 422         /* T = packed( (yb + yd), (xb + xd)) */
 423         T = __QADD16(T, U);
 424
 425         /*  writing the butterfly processed i0 sample */
 426
 427         /* xa' = xa + xb + xc + xd */
 428         /* ya' = ya + yb + yc + yd */
 429         out1 = __SHADD16(R, T);
 430         in = ((int16_t) (out1 & 0xFFFF)) >> 1;
 431         out1 = ((out1 >> 1) & 0xFFFF0000) | (in & 0xFFFF);
 432         _SIMD32_OFFSET(pSrc16 + (2u * i0)) = out1;
 433
 434         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
 435         R = __SHSUB16(R, T);
 436
 437 #ifndef ARM_MATH_BIG_ENDIAN
 438
 439         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
 440         out1 = __SMUAD(C2, R) >> 16u;
 441
 442         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 443         out2 = __SMUSDX(C2, R);
 444
 445 #else
 446
 447         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 448         out1 = __SMUSDX(R, C2) >> 16u;
 449
 450         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
 451         out2 = __SMUAD(C2, R);
 452
 453 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 454
 455         /*  Reading i0+3fftLen/4 */
 456         /* Read yb (real), xb(imag) input */
 457         T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
 458
 459         /*  writing the butterfly processed i0 + fftLen/4 sample */
 460         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 461         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 462         _SIMD32_OFFSET(pSrc16 + (2u * i1)) =
 463           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 464
 465         /*  Butterfly calculations */
 466
 467         /* Read yd (real), xd(imag) input */
 468         U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
 469
 470         /* T = packed(yb-yd, xb-xd) */
 471         T = __QSUB16(T, U);
 472
 473 #ifndef ARM_MATH_BIG_ENDIAN
 474
 475         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
 476         R = __SHASX(S, T);
 477
 478         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
 479         S = __SHSAX(S, T);
 480
 481
 482         /*  Butterfly process for the i0+fftLen/2 sample */
 483         out1 = __SMUAD(C1, S) >> 16u;
 484         out2 = __SMUSDX(C1, S);
 485
 486 #else
 487
 488         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
 489         R = __SHSAX(S, T);
 490
 491         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
 492         S = __SHASX(S, T);
 493
 494
 495         /*  Butterfly process for the i0+fftLen/2 sample */
 496         out1 = __SMUSDX(S, C1) >> 16u;
 497         out2 = __SMUAD(C1, S);
 498
 499 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 500
 501         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 502         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 503         _SIMD32_OFFSET(pSrc16 + (2u * i2)) =
 504           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 505
 506         /*  Butterfly process for the i0+3fftLen/4 sample */
 507
 508 #ifndef ARM_MATH_BIG_ENDIAN
 509
 510         out1 = __SMUAD(C3, R) >> 16u;
 511         out2 = __SMUSDX(C3, R);
 512
 513 #else
 514
 515         out1 = __SMUSDX(R, C3) >> 16u;
 516         out2 = __SMUAD(C3, R);
 517
 518 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 519
 520         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
 521         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
 522         _SIMD32_OFFSET(pSrc16 + (2u * i3)) =
 523           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
 524       }
 525     }
 526     /*  Twiddle coefficients index modifier */
 527     twidCoefModifier <<= 2u;
 528   }
 529   /* end of middle stage process */
 530
 531
 532   /* data is in 10.6(q6) format for the 1024 point */
 533   /* data is in 8.8(q8) format for the 256 point */
 534   /* data is in 6.10(q10) format for the 64 point */
 535   /* data is in 4.12(q12) format for the 16 point */
 536
 537   /*  Initializations for the last stage */
 538   j = fftLen >> 2;
 539
 540   ptr1 = &pSrc16[0];
 541
 542   /* start of last stage process */
 543
 544   /*  Butterfly implementation */
 545   do
 546   {
 547     /* Read xa (real), ya(imag) input */
 548     xaya = *__SIMD32(ptr1)++;
 549
 550     /* Read xb (real), yb(imag) input */
 551     xbyb = *__SIMD32(ptr1)++;
 552
 553     /* Read xc (real), yc(imag) input */
 554     xcyc = *__SIMD32(ptr1)++;
 555
 556     /* Read xd (real), yd(imag) input */
 557     xdyd = *__SIMD32(ptr1)++;
 558
 559     /* R = packed((ya + yc), (xa + xc)) */
 560     R = __QADD16(xaya, xcyc);
 561
 562     /* T = packed((yb + yd), (xb + xd)) */
 563     T = __QADD16(xbyb, xdyd);
 564
 565     /* pointer updation for writing */
 566     ptr1 = ptr1 - 8u;
 567
 568
 569     /* xa' = xa + xb + xc + xd */
 570     /* ya' = ya + yb + yc + yd */
 571     *__SIMD32(ptr1)++ = __SHADD16(R, T);
 572
 573     /* T = packed((yb + yd), (xb + xd)) */
 574     T = __QADD16(xbyb, xdyd);
 575
 576     /* xc' = (xa-xb+xc-xd) */
 577     /* yc' = (ya-yb+yc-yd) */
 578     *__SIMD32(ptr1)++ = __SHSUB16(R, T);
 579
 580     /* S = packed((ya - yc), (xa - xc)) */
 581     S = __QSUB16(xaya, xcyc);
 582
 583     /* Read yd (real), xd(imag) input */
 584     /* T = packed( (yb - yd), (xb - xd))  */
 585     U = __QSUB16(xbyb, xdyd);
 586
 587 #ifndef ARM_MATH_BIG_ENDIAN
 588
 589     /* xb' = (xa+yb-xc-yd) */
 590     /* yb' = (ya-xb-yc+xd) */
 591     *__SIMD32(ptr1)++ = __SHSAX(S, U);
 592
 593
 594     /* xd' = (xa-yb-xc+yd) */
 595     /* yd' = (ya+xb-yc-xd) */
 596     *__SIMD32(ptr1)++ = __SHASX(S, U);
 597
 598 #else
 599
 600     /* xb' = (xa+yb-xc-yd) */
 601     /* yb' = (ya-xb-yc+xd) */
 602     *__SIMD32(ptr1)++ = __SHASX(S, U);
 603
 604
 605     /* xd' = (xa-yb-xc+yd) */
 606     /* yd' = (ya+xb-yc-xd) */
 607     *__SIMD32(ptr1)++ = __SHSAX(S, U);
 608
 609 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 610
 611   } while(--j);
 612
 613   /* end of last stage process */
 614
 615   /* output is in 11.5(q5) format for the 1024 point */
 616   /* output is in 9.7(q7) format for the 256 point   */
 617   /* output is in 7.9(q9) format for the 64 point  */
 618   /* output is in 5.11(q11) format for the 16 point  */
 619
 620
 621 #else
 622
 623   /* Run the below code for Cortex-M0 */
 624
 625   q15_t R0, R1, S0, S1, T0, T1, U0, U1;
 626   q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
 627   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
 628
 629   /* Total process is divided into three stages */
 630
 631   /* process first stage, middle stages, & last stage */
 632
 633   /*  Initializations for the first stage */
 634   n2 = fftLen;
 635   n1 = n2;
 636
 637   /* n2 = fftLen/4 */
 638   n2 >>= 2u;
 639
 640   /* Index for twiddle coefficient */
 641   ic = 0u;
 642
 643   /* Index for input read and output write */
 644   i0 = 0u;
 645   j = n2;
 646
 647   /* Input is in 1.15(q15) format */
 648
 649   /*  start of first stage process */
 650   do
 651   {
 652     /*  Butterfly implementation */
 653
 654     /*  index calculation for the input as, */
 655     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
 656     i1 = i0 + n2;
 657     i2 = i1 + n2;
 658     i3 = i2 + n2;
 659
 660     /*  Reading i0, i0+fftLen/2 inputs */
 661
 662     /* input is down scale by 4 to avoid overflow */
 663     /* Read ya (real), xa(imag) input */
 664     T0 = pSrc16[i0 * 2u] >> 2u;
 665     T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
 666
 667     /* input is down scale by 4 to avoid overflow */
 668     /* Read yc (real), xc(imag) input */
 669     S0 = pSrc16[i2 * 2u] >> 2u;
 670     S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
 671
 672     /* R0 = (ya + yc) */
 673     R0 = __SSAT(T0 + S0, 16u);
 674     /* R1 = (xa + xc) */
 675     R1 = __SSAT(T1 + S1, 16u);
 676
 677     /* S0 = (ya - yc) */
 678     S0 = __SSAT(T0 - S0, 16);
 679     /* S1 = (xa - xc) */
 680     S1 = __SSAT(T1 - S1, 16);
 681
 682     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
 683     /* input is down scale by 4 to avoid overflow */
 684     /* Read yb (real), xb(imag) input */
 685     T0 = pSrc16[i1 * 2u] >> 2u;
 686     T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
 687
 688     /* input is down scale by 4 to avoid overflow */
 689     /* Read yd (real), xd(imag) input */
 690     U0 = pSrc16[i3 * 2u] >> 2u;
 691     U1 = pSrc16[(i3 * 2u) + 1] >> 2u;
 692
 693     /* T0 = (yb + yd) */
 694     T0 = __SSAT(T0 + U0, 16u);
 695     /* T1 = (xb + xd) */
 696     T1 = __SSAT(T1 + U1, 16u);
 697
 698     /*  writing the butterfly processed i0 sample */
 699     /* ya' = ya + yb + yc + yd */
 700     /* xa' = xa + xb + xc + xd */
 701     pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
 702     pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
 703
 704     /* R0 = (ya + yc) - (yb + yd) */
 705     /* R1 = (xa + xc) - (xb + xd) */
 706     R0 = __SSAT(R0 - T0, 16u);
 707     R1 = __SSAT(R1 - T1, 16u);
 708
 709     /* co2 & si2 are read from Coefficient pointer */
 710     Co2 = pCoef16[2u * ic * 2u];
 711     Si2 = pCoef16[(2u * ic * 2u) + 1];
 712
 713     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 714     out1 = (short) ((Co2 * R0 + Si2 * R1) >> 16u);
 715     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 716     out2 = (short) ((-Si2 * R0 + Co2 * R1) >> 16u);
 717
 718     /*  Reading i0+fftLen/4 */
 719     /* input is down scale by 4 to avoid overflow */
 720     /* T0 = yb, T1 =  xb */
 721     T0 = pSrc16[i1 * 2u] >> 2;
 722     T1 = pSrc16[(i1 * 2u) + 1] >> 2;
 723
 724     /* writing the butterfly processed i0 + fftLen/4 sample */
 725     /* writing output(xc', yc') in little endian format */
 726     pSrc16[i1 * 2u] = out1;
 727     pSrc16[(i1 * 2u) + 1] = out2;
 728
 729     /*  Butterfly calculations */
 730     /* input is down scale by 4 to avoid overflow */
 731     /* U0 = yd, U1 = xd */
 732     U0 = pSrc16[i3 * 2u] >> 2;
 733     U1 = pSrc16[(i3 * 2u) + 1] >> 2;
 734     /* T0 = yb-yd */
 735     T0 = __SSAT(T0 - U0, 16);
 736     /* T1 = xb-xd */
 737     T1 = __SSAT(T1 - U1, 16);
 738
 739     /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
 740     R0 = (short) __SSAT((q31_t) (S0 - T1), 16);
 741     R1 = (short) __SSAT((q31_t) (S1 + T0), 16);
 742
 743     /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
 744     S0 = (short) __SSAT(((q31_t) S0 + T1), 16u);
 745     S1 = (short) __SSAT(((q31_t) S1 - T0), 16u);
 746
 747     /* co1 & si1 are read from Coefficient pointer */
 748     Co1 = pCoef16[ic * 2u];
 749     Si1 = pCoef16[(ic * 2u) + 1];
 750     /*  Butterfly process for the i0+fftLen/2 sample */
 751     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 752     out1 = (short) ((Si1 * S1 + Co1 * S0) >> 16);
 753     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 754     out2 = (short) ((-Si1 * S0 + Co1 * S1) >> 16);
 755
 756     /* writing output(xb', yb') in little endian format */
 757     pSrc16[i2 * 2u] = out1;
 758     pSrc16[(i2 * 2u) + 1] = out2;
 759
 760     /* Co3 & si3 are read from Coefficient pointer */
 761     Co3 = pCoef16[3u * (ic * 2u)];
 762     Si3 = pCoef16[(3u * (ic * 2u)) + 1];
 763     /*  Butterfly process for the i0+3fftLen/4 sample */
 764     /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
 765     out1 = (short) ((Si3 * R1 + Co3 * R0) >> 16u);
 766     /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
 767     out2 = (short) ((-Si3 * R0 + Co3 * R1) >> 16u);
 768     /* writing output(xd', yd') in little endian format */
 769     pSrc16[i3 * 2u] = out1;
 770     pSrc16[(i3 * 2u) + 1] = out2;
 771
 772     /*  Twiddle coefficients index modifier */
 773     ic = ic + twidCoefModifier;
 774
 775     /*  Updating input index */
 776     i0 = i0 + 1u;
 777
 778   } while(--j);
 779   /* data is in 4.11(q11) format */
 780
 781   /* end of first stage process */
 782
 783
 784   /* start of middle stage process */
 785
 786   /*  Twiddle coefficients index modifier */
 787   twidCoefModifier <<= 2u;
 788
 789   /*  Calculation of Middle stage */
 790   for (k = fftLen / 4u; k > 4u; k >>= 2u)
 791   {
 792     /*  Initializations for the middle stage */
 793     n1 = n2;
 794     n2 >>= 2u;
 795     ic = 0u;
 796
 797     for (j = 0u; j <= (n2 - 1u); j++)
 798     {
 799       /*  index calculation for the coefficients */
 800       Co1 = pCoef16[ic * 2u];
 801       Si1 = pCoef16[(ic * 2u) + 1u];
 802       Co2 = pCoef16[2u * (ic * 2u)];
 803       Si2 = pCoef16[(2u * (ic * 2u)) + 1u];
 804       Co3 = pCoef16[3u * (ic * 2u)];
 805       Si3 = pCoef16[(3u * (ic * 2u)) + 1u];
 806
 807       /*  Twiddle coefficients index modifier */
 808       ic = ic + twidCoefModifier;
 809
 810       /*  Butterfly implementation */
 811       for (i0 = j; i0 < fftLen; i0 += n1)
 812       {
 813         /*  index calculation for the input as, */
 814         /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
 815         i1 = i0 + n2;
 816         i2 = i1 + n2;
 817         i3 = i2 + n2;
 818
 819         /*  Reading i0, i0+fftLen/2 inputs */
 820         /* Read ya (real), xa(imag) input */
 821         T0 = pSrc16[i0 * 2u];
 822         T1 = pSrc16[(i0 * 2u) + 1u];
 823
 824         /* Read yc (real), xc(imag) input */
 825         S0 = pSrc16[i2 * 2u];
 826         S1 = pSrc16[(i2 * 2u) + 1u];
 827
 828         /* R0 = (ya + yc), R1 = (xa + xc) */
 829         R0 = __SSAT(T0 + S0, 16);
 830         R1 = __SSAT(T1 + S1, 16);
 831
 832         /* S0 = (ya - yc), S1 =(xa - xc) */
 833         S0 = __SSAT(T0 - S0, 16);
 834         S1 = __SSAT(T1 - S1, 16);
 835
 836         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
 837         /* Read yb (real), xb(imag) input */
 838         T0 = pSrc16[i1 * 2u];
 839         T1 = pSrc16[(i1 * 2u) + 1u];
 840
 841         /* Read yd (real), xd(imag) input */
 842         U0 = pSrc16[i3 * 2u];
 843         U1 = pSrc16[(i3 * 2u) + 1u];
 844
 845
 846         /* T0 = (yb + yd), T1 = (xb + xd) */
 847         T0 = __SSAT(T0 + U0, 16);
 848         T1 = __SSAT(T1 + U1, 16);
 849
 850         /*  writing the butterfly processed i0 sample */
 851
 852         /* xa' = xa + xb + xc + xd */
 853         /* ya' = ya + yb + yc + yd */
 854         out1 = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
 855         out2 = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
 856
 857         pSrc16[i0 * 2u] = out1;
 858         pSrc16[(2u * i0) + 1u] = out2;
 859
 860         /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
 861         R0 = (R0 >> 1u) - (T0 >> 1u);
 862         R1 = (R1 >> 1u) - (T1 >> 1u);
 863
 864         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
 865         out1 = (short) ((Co2 * R0 + Si2 * R1) >> 16u);
 866
 867         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 868         out2 = (short) ((-Si2 * R0 + Co2 * R1) >> 16u);
 869
 870         /*  Reading i0+3fftLen/4 */
 871         /* Read yb (real), xb(imag) input */
 872         T0 = pSrc16[i1 * 2u];
 873         T1 = pSrc16[(i1 * 2u) + 1u];
 874
 875         /*  writing the butterfly processed i0 + fftLen/4 sample */
 876         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
 877         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
 878         pSrc16[i1 * 2u] = out1;
 879         pSrc16[(i1 * 2u) + 1u] = out2;
 880
 881         /*  Butterfly calculations */
 882
 883         /* Read yd (real), xd(imag) input */
 884         U0 = pSrc16[i3 * 2u];
 885         U1 = pSrc16[(i3 * 2u) + 1u];
 886
 887         /* T0 = yb-yd, T1 = xb-xd */
 888         T0 = __SSAT(T0 - U0, 16);
 889         T1 = __SSAT(T1 - U1, 16);
 890
 891         /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
 892         R0 = (S0 >> 1u) - (T1 >> 1u);
 893         R1 = (S1 >> 1u) + (T0 >> 1u);
 894
 895         /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
 896         S0 = (S0 >> 1u) + (T1 >> 1u);
 897         S1 = (S1 >> 1u) - (T0 >> 1u);
 898
 899         /*  Butterfly process for the i0+fftLen/2 sample */
 900         out1 = (short) ((Co1 * S0 + Si1 * S1) >> 16u);
 901
 902         out2 = (short) ((-Si1 * S0 + Co1 * S1) >> 16u);
 903
 904         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
 905         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
 906         pSrc16[i2 * 2u] = out1;
 907         pSrc16[(i2 * 2u) + 1u] = out2;
 908
 909         /*  Butterfly process for the i0+3fftLen/4 sample */
 910         out1 = (short) ((Si3 * R1 + Co3 * R0) >> 16u);
 911
 912         out2 = (short) ((-Si3 * R0 + Co3 * R1) >> 16u);
 913         /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
 914         /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
 915         pSrc16[i3 * 2u] = out1;
 916         pSrc16[(i3 * 2u) + 1u] = out2;
 917       }
 918     }
 919     /*  Twiddle coefficients index modifier */
 920     twidCoefModifier <<= 2u;
 921   }
 922   /* end of middle stage process */
 923
 924
 925   /* data is in 10.6(q6) format for the 1024 point */
 926   /* data is in 8.8(q8) format for the 256 point */
 927   /* data is in 6.10(q10) format for the 64 point */
 928   /* data is in 4.12(q12) format for the 16 point */
 929
 930   /*  Initializations for the last stage */
 931   n1 = n2;
 932   n2 >>= 2u;
 933
 934   /* start of last stage process */
 935
 936   /*  Butterfly implementation */
 937   for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
 938   {
 939     /*  index calculation for the input as, */
 940     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
 941     i1 = i0 + n2;
 942     i2 = i1 + n2;
 943     i3 = i2 + n2;
 944
 945     /*  Reading i0, i0+fftLen/2 inputs */
 946     /* Read ya (real), xa(imag) input */
 947     T0 = pSrc16[i0 * 2u];
 948     T1 = pSrc16[(i0 * 2u) + 1u];
 949
 950     /* Read yc (real), xc(imag) input */
 951     S0 = pSrc16[i2 * 2u];
 952     S1 = pSrc16[(i2 * 2u) + 1u];
 953
 954     /* R0 = (ya + yc), R1 = (xa + xc) */
 955     R0 = __SSAT(T0 + S0, 16u);
 956     R1 = __SSAT(T1 + S1, 16u);
 957
 958     /* S0 = (ya - yc), S1 = (xa - xc) */
 959     S0 = __SSAT(T0 - S0, 16u);
 960     S1 = __SSAT(T1 - S1, 16u);
 961
 962     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
 963     /* Read yb (real), xb(imag) input */
 964     T0 = pSrc16[i1 * 2u];
 965     T1 = pSrc16[(i1 * 2u) + 1u];
 966     /* Read yd (real), xd(imag) input */
 967     U0 = pSrc16[i3 * 2u];
 968     U1 = pSrc16[(i3 * 2u) + 1u];
 969
 970     /* T0 = (yb + yd), T1 = (xb + xd)) */
 971     T0 = __SSAT(T0 + U0, 16u);
 972     T1 = __SSAT(T1 + U1, 16u);
 973
 974     /*  writing the butterfly processed i0 sample */
 975     /* xa' = xa + xb + xc + xd */
 976     /* ya' = ya + yb + yc + yd */
 977     pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
 978     pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
 979
 980     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
 981     R0 = (R0 >> 1u) - (T0 >> 1u);
 982     R1 = (R1 >> 1u) - (T1 >> 1u);
 983     /* Read yb (real), xb(imag) input */
 984     T0 = pSrc16[i1 * 2u];
 985     T1 = pSrc16[(i1 * 2u) + 1u];
 986
 987     /*  writing the butterfly processed i0 + fftLen/4 sample */
 988     /* xc' = (xa-xb+xc-xd) */
 989     /* yc' = (ya-yb+yc-yd) */
 990     pSrc16[i1 * 2u] = R0;
 991     pSrc16[(i1 * 2u) + 1u] = R1;
 992
 993     /* Read yd (real), xd(imag) input */
 994     U0 = pSrc16[i3 * 2u];
 995     U1 = pSrc16[(i3 * 2u) + 1u];
 996     /* T0 = (yb - yd), T1 = (xb - xd)  */
 997     T0 = __SSAT(T0 - U0, 16u);
 998     T1 = __SSAT(T1 - U1, 16u);
 999
1000     /*  writing the butterfly processed i0 + fftLen/2 sample */
1001     /* xb' = (xa+yb-xc-yd) */
1002     /* yb' = (ya-xb-yc+xd) */
1003     pSrc16[i2 * 2u] = (S0 >> 1u) + (T1 >> 1u);
1004     pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
1005
1006     /*  writing the butterfly processed i0 + 3fftLen/4 sample */
1007     /* xd' = (xa-yb-xc+yd) */
1008     /* yd' = (ya+xb-yc-xd) */
1009     pSrc16[i3 * 2u] = (S0 >> 1u) - (T1 >> 1u);
1010     pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
1011
1012   }
1013
1014   /* end of last stage process */
1015
1016   /* output is in 11.5(q5) format for the 1024 point */
1017   /* output is in 9.7(q7) format for the 256 point   */
1018   /* output is in 7.9(q9) format for the 64 point  */
1019   /* output is in 5.11(q11) format for the 16 point  */
1020
1021 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
1022
1023 }
1024
1025
1026 /**
1027  * @brief  Core function for the Q15 CIFFT butterfly process.
1028  * @param[in, out] *pSrc16          points to the in-place buffer of Q15 data type.
1029  * @param[in]      fftLen           length of the FFT.
1030  * @param[in]      *pCoef16         points to twiddle coefficient buffer.
1031  * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
1032  * @return none.
1033  */
1034
1035 /*
1036 * Radix-4 IFFT algorithm used is :
1037 *
1038 * CIFFT uses same twiddle coefficients as CFFT function
1039 *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
1040 *
1041 *
1042 * IFFT is implemented with following changes in equations from FFT
1043 *
1044 * Input real and imaginary data:
1045 * x(n) = xa + j * ya
1046 * x(n+N/4 ) = xb + j * yb
1047 * x(n+N/2 ) = xc + j * yc
1048 * x(n+3N/4) = xd + j * yd
1049 *
1050 *
1051 * Output real and imaginary data:
1052 * x(4r) = xa'+ j * ya'
1053 * x(4r+1) = xb'+ j * yb'
1054 * x(4r+2) = xc'+ j * yc'
1055 * x(4r+3) = xd'+ j * yd'
1056 *
1057 *
1058 * Twiddle factors for radix-4 IFFT:
1059 * Wn = co1 + j * (si1)
1060 * W2n = co2 + j * (si2)
1061 * W3n = co3 + j * (si3)
1062
1063 * The real and imaginary output values for the radix-4 butterfly are
1064 * xa' = xa + xb + xc + xd
1065 * ya' = ya + yb + yc + yd
1066 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
1067 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
1068 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
1069 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
1070 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
1071 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
1072 *
1073 */
1074
1075 void arm_radix4_butterfly_inverse_q15(
1076   q15_t * pSrc16,
1077   uint32_t fftLen,
1078   q15_t * pCoef16,
1079   uint32_t twidCoefModifier)
1080 {
1081   /* In-place radix-4 inverse FFT butterflies over pSrc16, which holds interleaved (real, imag) q15 pairs. */
1082 #ifndef ARM_MATH_CM0_FAMILY
1083   /* Comment notation: x = real part, y = imaginary part; a, b, c, d = inputs at i0, i1, i2, i3. */
1084   /* Run the below code for Cortex-M4 and Cortex-M3 */
1085   /* R, S, T, U: packed complex temporaries; C1..C3: packed twiddles; in: scratch for rescaling the low 16-bit lane */
1086   q31_t R, S, T, U;
1087   q31_t C1, C2, C3, out1, out2;
1088   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
1089   q15_t in;
1090   /* Sequential read/write cursor used only by the last stage */
1091   q15_t *ptr1;
1092
1093
1094   /* Packed words of the four complex inputs of a last-stage butterfly */
1095   q31_t xaya, xbyb, xcyc, xdyd;
1096
1097   /* Total process is divided into three stages */
1098
1099   /* process first stage, middle stages, & last stage */
1100
1101   /*  Initializations for the first stage */
1102   n2 = fftLen;
1103   n1 = n2;
1104
1105   /* n2 = fftLen/4 */
1106   n2 >>= 2u;
1107
1108   /* Index for twiddle coefficient */
1109   ic = 0u;
1110
1111   /* Index for input read and output write */
1112   i0 = 0u;
1113   j = n2;
1114   /* NOTE: the do/while below runs j = fftLen/4 times and assumes fftLen >= 4 (a zero count would wrap on --j) */
1115   /* Input is in 1.15(q15) format */
1116
1117   /*  start of first stage process */
1118   do
1119   {
1120     /*  Butterfly implementation */
1121
1122     /*  index calculation for the input as, */
1123     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1124     i1 = i0 + n2;
1125     i2 = i1 + n2;
1126     i3 = i2 + n2;
1127
1128     /*  Reading i0, i0+fftLen/2 inputs */
1129     /* Read xa (real), ya (imag) input; each 16-bit lane is scaled down by 4 (>> 2) */
1130     T = _SIMD32_OFFSET(pSrc16 + (2u * i0));
1131     in = ((int16_t) (T & 0xFFFF)) >> 2;
1132     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
1133
1134     /* Read xc (real), yc (imag) input, scaled down by 4 */
1135     S = _SIMD32_OFFSET(pSrc16 + (2u * i2));
1136     in = ((int16_t) (S & 0xFFFF)) >> 2;
1137     S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF);
1138
1139     /* R = packed((ya + yc), (xa + xc) ) */
1140     R = __QADD16(T, S);
1141
1142     /* S = packed((ya - yc), (xa - xc) ) */
1143     S = __QSUB16(T, S);
1144
1145     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1146     /* Read xb (real), yb (imag) input, scaled down by 4 */
1147     T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
1148     in = ((int16_t) (T & 0xFFFF)) >> 2;
1149     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
1150
1151     /* Read xd (real), yd (imag) input, scaled down by 4 */
1152     U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
1153     in = ((int16_t) (U & 0xFFFF)) >> 2;
1154     U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF);
1155
1156     /* T = packed((yb + yd), (xb + xd) ) */
1157     T = __QADD16(T, U);
1158
1159     /*  writing the butterfly processed i0 sample */
1160     /* xa' = xa + xb + xc + xd */
1161     /* ya' = ya + yb + yc + yd */
1162     _SIMD32_OFFSET(pSrc16 + (2u * i0)) = __SHADD16(R, T);
1163
1164     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
1165     R = __QSUB16(R, T);
1166
1167     /* co2 & si2 are read from SIMD Coefficient pointer */
1168     C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
1169
1170 #ifndef ARM_MATH_BIG_ENDIAN
1171
1172     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1173     out1 = __SMUSD(C2, R) >> 16u;
1174     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1175     out2 = __SMUADX(C2, R);
1176
1177 #else
1178     /* NOTE(review): on big-endian the real part sits in the high halfword, so out1/out2 swap roles - confirm on a BE build */
1179     /* out1 -> yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1180     out1 = __SMUADX(C2, R) >> 16u;
1181     /* out2 -> xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1182     out2 = __SMUSD(__QSUB16(0, C2), R);
1183
1184 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1185
1186     /*  Reading i0+fftLen/4 again, before it is overwritten below */
1187     /* T = packed(yb, xb), scaled down by 4 */
1188     T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
1189     in = ((int16_t) (T & 0xFFFF)) >> 2;
1190     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
1191
1192     /* writing the butterfly processed i0 + fftLen/4 sample */
1193     /* writing output(xc', yc'): out2 supplies the high halfword, out1 the low halfword */
1194     _SIMD32_OFFSET(pSrc16 + (2u * i1)) =
1195       (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1196
1197     /*  Butterfly calculations */
1198     /* U = packed(yd, xd), scaled down by 4 */
1199     U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
1200     in = ((int16_t) (U & 0xFFFF)) >> 2;
1201     U = ((U >> 2) & 0xFFFF0000) | (in & 0xFFFF);
1202
1203     /* T = packed(yb-yd, xb-xd) */
1204     T = __QSUB16(T, U);
1205
1206 #ifndef ARM_MATH_BIG_ENDIAN
1207
1208     /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
1209     R = __QSAX(S, T);
1210     /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
1211     S = __QASX(S, T);
1212
1213 #else
1214
1215     /* R: real = (xa-xc) + (yb-yd), imag = (ya-yc) - (xb- xd) */
1216     R = __QASX(S, T);
1217     /* S: real = (xa-xc) - (yb-yd), imag = (ya-yc) + (xb- xd) */
1218     S = __QSAX(S, T);
1219
1220 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1221
1222     /* co1 & si1 are read from SIMD Coefficient pointer */
1223     C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
1224     /*  Butterfly process for the i0+fftLen/2 sample */
1225
1226 #ifndef ARM_MATH_BIG_ENDIAN
1227
1228     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1229     out1 = __SMUSD(C1, S) >> 16u;
1230     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1231     out2 = __SMUADX(C1, S);
1232
1233 #else
1234
1235     /* out1 -> yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1236     out1 = __SMUADX(C1, S) >> 16u;
1237     /* out2 -> xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1238     out2 = __SMUSD(__QSUB16(0, C1), S);
1239
1240 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1241
1242     /* writing output(xb', yb'): out2 supplies the high halfword, out1 the low halfword */
1243     _SIMD32_OFFSET(pSrc16 + (2u * i2)) =
1244       ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
1245
1246
1247     /* co3 & si3 are read from SIMD Coefficient pointer */
1248     C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
1249     /*  Butterfly process for the i0+3fftLen/4 sample */
1250
1251 #ifndef ARM_MATH_BIG_ENDIAN
1252
1253     /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
1254     out1 = __SMUSD(C3, R) >> 16u;
1255     /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
1256     out2 = __SMUADX(C3, R);
1257
1258 #else
1259
1260     /* out1 -> yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
1261     out1 = __SMUADX(C3, R) >> 16u;
1262     /* out2 -> xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
1263     out2 = __SMUSD(__QSUB16(0, C3), R);
1264
1265 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1266
1267     /* writing output(xd', yd'): out2 supplies the high halfword, out1 the low halfword */
1268     _SIMD32_OFFSET(pSrc16 + (2u * i3)) =
1269       ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1270
1271     /*  Twiddle coefficients index modifier */
1272     ic = ic + twidCoefModifier;
1273
1274     /*  Updating input index */
1275     i0 = i0 + 1u;
1276
1277   } while(--j);
1278   /* data is in 4.11(q11) format */
1279
1280   /* end of first stage process */
1281
1282
1283   /* start of middle stage process */
1284
1285   /*  Twiddle coefficients index modifier */
1286   twidCoefModifier <<= 2u;
1287
1288   /*  Calculation of Middle stage */
1289   for (k = fftLen / 4u; k > 4u; k >>= 2u)
1290   {
1291     /*  Initializations for the middle stage */
1292     n1 = n2;
1293     n2 >>= 2u;
1294     ic = 0u;
1295
1296     for (j = 0u; j <= (n2 - 1u); j++)
1297     {
1298       /*  index calculation for the coefficients */
1299       C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
1300       C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
1301       C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
1302
1303       /*  Twiddle coefficients index modifier */
1304       ic = ic + twidCoefModifier;
1305
1306       /*  Butterfly implementation */
1307       for (i0 = j; i0 < fftLen; i0 += n1)
1308       {
1309         /*  index calculation for the input as, */
1310         /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
1311         i1 = i0 + n2;
1312         i2 = i1 + n2;
1313         i3 = i2 + n2;
1314
1315         /*  Reading i0, i2 inputs */
1316         /* Read xa (real), ya (imag) input */
1317         T = _SIMD32_OFFSET(pSrc16 + (2u * i0));
1318
1319         /* Read xc (real), yc (imag) input */
1320         S = _SIMD32_OFFSET(pSrc16 + (2u * i2));
1321
1322         /* R = packed( (ya + yc), (xa + xc)) */
1323         R = __QADD16(T, S);
1324
1325         /* S = packed((ya - yc), (xa - xc)) */
1326         S = __QSUB16(T, S);
1327
1328         /*  Reading i1, i3 inputs */
1329         /* Read xb (real), yb (imag) input */
1330         T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
1331
1332         /* Read xd (real), yd (imag) input */
1333         U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
1334
1335         /* T = packed( (yb + yd), (xb + xd)) */
1336         T = __QADD16(T, U);
1337
1338         /*  writing the butterfly processed i0 sample */
1339         /* The sum is halved by __SHADD16 and each lane is then shifted right by 1 once more */
1340         /* xa' = xa + xb + xc + xd */
1341         /* ya' = ya + yb + yc + yd */
1342         out1 = __SHADD16(R, T);
1343         in = ((int16_t) (out1 & 0xFFFF)) >> 1;
1344         out1 = ((out1 >> 1) & 0xFFFF0000) | (in & 0xFFFF);
1345         _SIMD32_OFFSET(pSrc16 + (2u * i0)) = out1;
1346
1347         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
1348         R = __SHSUB16(R, T);
1349
1350 #ifndef ARM_MATH_BIG_ENDIAN
1351
1352         /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1353         out1 = __SMUSD(C2, R) >> 16u;
1354
1355         /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1356         out2 = __SMUADX(C2, R);
1357
1358 #else
1359
1360         /* out1 -> (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1361         out1 = __SMUADX(R, C2) >> 16u;
1362
1363         /* out2 -> (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1364         out2 = __SMUSD(__QSUB16(0, C2), R);
1365
1366 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1367
1368         /*  Reading i1 again, before it is overwritten below */
1369         /* Read xb (real), yb (imag) input */
1370         T = _SIMD32_OFFSET(pSrc16 + (2u * i1));
1371
1372         /*  writing the butterfly processed i1 sample */
1373         /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1374         /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1375         _SIMD32_OFFSET(pSrc16 + (2u * i1)) =
1376           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1377
1378         /*  Butterfly calculations */
1379
1380         /* Read xd (real), yd (imag) input */
1381         U = _SIMD32_OFFSET(pSrc16 + (2u * i3));
1382
1383         /* T = packed(yb-yd, xb-xd) */
1384         T = __QSUB16(T, U);
1385
1386 #ifndef ARM_MATH_BIG_ENDIAN
1387
1388         /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)), halved */
1389         R = __SHSAX(S, T);
1390
1391         /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)), halved */
1392         S = __SHASX(S, T);
1393
1394
1395         /*  Butterfly process for the i2 sample */
1396         out1 = __SMUSD(C1, S) >> 16u;
1397         out2 = __SMUADX(C1, S);
1398
1399 #else
1400
1401         /* R: real = (xa-xc) + (yb-yd), imag = (ya-yc) - (xb- xd), halved */
1402         R = __SHASX(S, T);
1403
1404         /* S: real = (xa-xc) - (yb-yd), imag = (ya-yc) + (xb- xd), halved */
1405         S = __SHSAX(S, T);
1406
1407
1408         /*  Butterfly process for the i2 sample */
1409         out1 = __SMUADX(S, C1) >> 16u;
1410         out2 = __SMUSD(__QSUB16(0, C1), S);
1411
1412 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1413
1414         /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1415         /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1416         _SIMD32_OFFSET(pSrc16 + (2u * i2)) =
1417           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1418
1419         /*  Butterfly process for the i3 sample */
1420
1421 #ifndef ARM_MATH_BIG_ENDIAN
1422
1423         out1 = __SMUSD(C3, R) >> 16u;
1424         out2 = __SMUADX(C3, R);
1425
1426 #else
1427
1428         out1 = __SMUADX(C3, R) >> 16u;
1429         out2 = __SMUSD(__QSUB16(0, C3), R);
1430
1431 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1432
1433         /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
1434         /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
1435         _SIMD32_OFFSET(pSrc16 + (2u * i3)) =
1436           ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1437       }
1438     }
1439     /*  Twiddle coefficients index modifier */
1440     twidCoefModifier <<= 2u;
1441   }
1442   /* end of middle stage process */
1443
1444   /* data is in 10.6(q6) format for the 1024 point */
1445   /* data is in 8.8(q8) format for the 256 point */
1446   /* data is in 6.10(q10) format for the 64 point */
1447   /* data is in 4.12(q12) format for the 16 point */
1448
1449   /*  Initializations for the last stage */
1450   j = fftLen >> 2;
1451
1452   ptr1 = &pSrc16[0];
1453
1454   /* start of last stage process (no twiddle multiplications; four consecutive samples per butterfly) */
1455
1456   /*  Butterfly implementation */
1457   do
1458   {
1459     /* Read xa (real), ya(imag) input */
1460     xaya = *__SIMD32(ptr1)++;
1461
1462     /* Read xb (real), yb(imag) input */
1463     xbyb = *__SIMD32(ptr1)++;
1464
1465     /* Read xc (real), yc(imag) input */
1466     xcyc = *__SIMD32(ptr1)++;
1467
1468     /* Read xd (real), yd(imag) input */
1469     xdyd = *__SIMD32(ptr1)++;
1470
1471     /* R = packed((ya + yc), (xa + xc)) */
1472     R = __QADD16(xaya, xcyc);
1473
1474     /* T = packed((yb + yd), (xb + xd)) */
1475     T = __QADD16(xbyb, xdyd);
1476
1477     /* rewind ptr1 by 8 q15 elements so the results are written over the inputs read above */
1478     ptr1 = ptr1 - 8u;
1479
1480
1481     /* xa' = xa + xb + xc + xd */
1482     /* ya' = ya + yb + yc + yd */
1483     *__SIMD32(ptr1)++ = __SHADD16(R, T);
1484
1485     /* T = packed((yb + yd), (xb + xd)) - recomputed from the same operands as above */
1486     T = __QADD16(xbyb, xdyd);
1487
1488     /* xc' = (xa-xb+xc-xd) */
1489     /* yc' = (ya-yb+yc-yd) */
1490     *__SIMD32(ptr1)++ = __SHSUB16(R, T);
1491
1492     /* S = packed((ya - yc), (xa - xc)) */
1493     S = __QSUB16(xaya, xcyc);
1494
1495     /* xb/yb and xd/yd were already loaded into xbyb and xdyd above */
1496     /* U = packed( (yb - yd), (xb - xd))  */
1497     U = __QSUB16(xbyb, xdyd);
1498
1499 #ifndef ARM_MATH_BIG_ENDIAN
1500
1501     /* xb' = (xa-yb-xc+yd) */
1502     /* yb' = (ya+xb-yc-xd) */
1503     *__SIMD32(ptr1)++ = __SHASX(S, U);
1504
1505
1506     /* xd' = (xa+yb-xc-yd) */
1507     /* yd' = (ya-xb-yc+xd) */
1508     *__SIMD32(ptr1)++ = __SHSAX(S, U);
1509
1510 #else
1511
1512     /* xb' = (xa-yb-xc+yd) */
1513     /* yb' = (ya+xb-yc-xd) */
1514     *__SIMD32(ptr1)++ = __SHSAX(S, U);
1515
1516
1517     /* xd' = (xa+yb-xc-yd) */
1518     /* yd' = (ya-xb-yc+xd) */
1519     *__SIMD32(ptr1)++ = __SHASX(S, U);
1520
1521
1522 #endif /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1523
1524   } while(--j);
1525
1526   /* end of last stage  process */
1527
1528   /* output is in 11.5(q5) format for the 1024 point */
1529   /* output is in 9.7(q7) format for the 256 point   */
1530   /* output is in 7.9(q9) format for the 64 point  */
1531   /* output is in 5.11(q11) format for the 16 point  */
1532
1533
1534 #else
1535
1536   /* Run the below code for Cortex-M0 */
1537   /* Scalar version: index 0 variables (R0, S0, T0, U0) hold real parts, index 1 variables hold imaginary parts */
1538   q15_t R0, R1, S0, S1, T0, T1, U0, U1;
1539   q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
1540   uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
1541
1542   /* Total process is divided into three stages */
1543
1544   /* process first stage, middle stages, & last stage */
1545
1546   /*  Initializations for the first stage */
1547   n2 = fftLen;
1548   n1 = n2;
1549
1550   /* n2 = fftLen/4 */
1551   n2 >>= 2u;
1552
1553   /* Index for twiddle coefficient */
1554   ic = 0u;
1555
1556   /* Index for input read and output write */
1557   i0 = 0u;
1558
1559   j = n2;
1560
1561   /* Input is in 1.15(q15) format */
1562
1563   /*  Start of first stage process */
1564   do
1565   {
1566     /*  Butterfly implementation */
1567
1568     /*  index calculation for the input as, */
1569     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1570     i1 = i0 + n2;
1571     i2 = i1 + n2;
1572     i3 = i2 + n2;
1573
1574     /*  Reading i0, i0+fftLen/2 inputs */
1575     /* input is down scale by 4 to avoid overflow */
1576     /* Read xa (real), ya (imag) input */
1577     T0 = pSrc16[i0 * 2u] >> 2u;
1578     T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
1579     /* input is down scale by 4 to avoid overflow */
1580     /* Read xc (real), yc (imag) input */
1581     S0 = pSrc16[i2 * 2u] >> 2u;
1582     S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
1583
1584     /* R0 = (xa + xc), R1 = (ya + yc) */
1585     R0 = __SSAT(T0 + S0, 16u);
1586     R1 = __SSAT(T1 + S1, 16u);
1587     /* S0 = (xa - xc), S1 = (ya - yc) */
1588     S0 = __SSAT(T0 - S0, 16u);
1589     S1 = __SSAT(T1 - S1, 16u);
1590
1591     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1592     /* input is down scale by 4 to avoid overflow */
1593     /* Read xb (real), yb (imag) input */
1594     T0 = pSrc16[i1 * 2u] >> 2u;
1595     T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
1596     /* Read xd (real), yd (imag) input */
1597     /* input is down scale by 4 to avoid overflow */
1598     U0 = pSrc16[i3 * 2u] >> 2u;
1599     U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
1600
1601     /* T0 = (xb + xd), T1 = (yb + yd) */
1602     T0 = __SSAT(T0 + U0, 16u);
1603     T1 = __SSAT(T1 + U1, 16u);
1604
1605     /*  writing the butterfly processed i0 sample */
1606     /* xa' = xa + xb + xc + xd */
1607     /* ya' = ya + yb + yc + yd */
1608     pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
1609     pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
1610
1611     /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
1612     R0 = __SSAT(R0 - T0, 16u);
1613     R1 = __SSAT(R1 - T1, 16u);
1614     /* co2 & si2 are read from Coefficient pointer */
1615     Co2 = pCoef16[2u * ic * 2u];
1616     Si2 = pCoef16[(2u * ic * 2u) + 1u];
1617     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1618     out1 = (short) ((Co2 * R0 - Si2 * R1) >> 16u);
1619     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1620     out2 = (short) ((Si2 * R0 + Co2 * R1) >> 16u);
1621
1622     /*  Reading i0+fftLen/4 again, before it is overwritten below */
1623     /* input is down scale by 4 to avoid overflow */
1624     /* T0 = xb, T1 = yb */
1625     T0 = pSrc16[i1 * 2u] >> 2u;
1626     T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
1627
1628     /* writing the butterfly processed i0 + fftLen/4 sample */
1629     /* writing output(xc', yc') as consecutive real, imag elements */
1630     pSrc16[i1 * 2u] = out1;
1631     pSrc16[(i1 * 2u) + 1u] = out2;
1632
1633     /*  Butterfly calculations */
1634     /* input is down scale by 4 to avoid overflow */
1635     /* U0 = xd, U1 = yd */
1636     U0 = pSrc16[i3 * 2u] >> 2u;
1637     U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
1638
1639     /* T0 = xb-xd, T1 = yb-yd */
1640     T0 = __SSAT(T0 - U0, 16u);
1641     T1 = __SSAT(T1 - U1, 16u);
1642     /* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb- xd) */
1643     R0 = (short) __SSAT((q31_t) (S0 + T1), 16);
1644     R1 = (short) __SSAT((q31_t) (S1 - T0), 16);
1645     /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb- xd) */
1646     S0 = (short) __SSAT((q31_t) (S0 - T1), 16);
1647     S1 = (short) __SSAT((q31_t) (S1 + T0), 16);
1648
1649     /* co1 & si1 are read from Coefficient pointer */
1650     Co1 = pCoef16[ic * 2u];
1651     Si1 = pCoef16[(ic * 2u) + 1u];
1652     /*  Butterfly process for the i0+fftLen/2 sample */
1653     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1654     out1 = (short) ((Co1 * S0 - Si1 * S1) >> 16u);
1655     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1656     out2 = (short) ((Si1 * S0 + Co1 * S1) >> 16u);
1657     /* writing output(xb', yb') as consecutive real, imag elements */
1658     pSrc16[i2 * 2u] = out1;
1659     pSrc16[(i2 * 2u) + 1u] = out2;
1660
1661     /* Co3 & si3 are read from Coefficient pointer */
1662     Co3 = pCoef16[3u * ic * 2u];
1663     Si3 = pCoef16[(3u * ic * 2u) + 1u];
1664     /*  Butterfly process for the i0+3fftLen/4 sample */
1665     /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1666     out1 = (short) ((Co3 * R0 - Si3 * R1) >> 16u);
1667     /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1668     out2 = (short) ((Si3 * R0 + Co3 * R1) >> 16u);
1669     /* writing output(xd', yd') as consecutive real, imag elements */
1670     pSrc16[i3 * 2u] = out1;
1671     pSrc16[(i3 * 2u) + 1u] = out2;
1672
1673     /*  Twiddle coefficients index modifier */
1674     ic = ic + twidCoefModifier;
1675
1676     /*  Updating input index */
1677     i0 = i0 + 1u;
1678
1679   } while(--j);
1680
1681   /*  End of first stage process */
1682
1683   /* data is in 4.11(q11) format */
1684
1685
1686   /*  Start of Middle stage process */
1687
1688   /*  Twiddle coefficients index modifier */
1689   twidCoefModifier <<= 2u;
1690
1691   /*  Calculation of Middle stage */
1692   for (k = fftLen / 4u; k > 4u; k >>= 2u)
1693   {
1694     /*  Initializations for the middle stage */
1695     n1 = n2;
1696     n2 >>= 2u;
1697     ic = 0u;
1698
1699     for (j = 0u; j <= (n2 - 1u); j++)
1700     {
1701       /*  index calculation for the coefficients */
1702       Co1 = pCoef16[ic * 2u];
1703       Si1 = pCoef16[(ic * 2u) + 1u];
1704       Co2 = pCoef16[2u * ic * 2u];
1705       Si2 = pCoef16[2u * ic * 2u + 1u];
1706       Co3 = pCoef16[3u * ic * 2u];
1707       Si3 = pCoef16[(3u * ic * 2u) + 1u];
1708
1709       /*  Twiddle coefficients index modifier */
1710       ic = ic + twidCoefModifier;
1711
1712       /*  Butterfly implementation */
1713       for (i0 = j; i0 < fftLen; i0 += n1)
1714       {
1715         /*  index calculation for the input as, */
1716         /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
1717         i1 = i0 + n2;
1718         i2 = i1 + n2;
1719         i3 = i2 + n2;
1720
1721         /*  Reading i0, i2 inputs */
1722         /* Read xa (real), ya (imag) input */
1723         T0 = pSrc16[i0 * 2u];
1724         T1 = pSrc16[(i0 * 2u) + 1u];
1725
1726         /* Read xc (real), yc (imag) input */
1727         S0 = pSrc16[i2 * 2u];
1728         S1 = pSrc16[(i2 * 2u) + 1u];
1729
1730
1731         /* R0 = (xa + xc), R1 = (ya + yc) */
1732         R0 = __SSAT(T0 + S0, 16u);
1733         R1 = __SSAT(T1 + S1, 16u);
1734         /* S0 = (xa - xc), S1 = (ya - yc) */
1735         S0 = __SSAT(T0 - S0, 16u);
1736         S1 = __SSAT(T1 - S1, 16u);
1737
1738         /*  Reading i1, i3 inputs */
1739         /* Read xb (real), yb (imag) input */
1740         T0 = pSrc16[i1 * 2u];
1741         T1 = pSrc16[(i1 * 2u) + 1u];
1742
1743         /* Read xd (real), yd (imag) input */
1744         U0 = pSrc16[i3 * 2u];
1745         U1 = pSrc16[(i3 * 2u) + 1u];
1746
1747         /* T0 = (xb + xd), T1 = (yb + yd) */
1748         T0 = __SSAT(T0 + U0, 16u);
1749         T1 = __SSAT(T1 + U1, 16u);
1750
1751         /*  writing the butterfly processed i0 sample (halved twice) */
1752         /* xa' = xa + xb + xc + xd */
1753         /* ya' = ya + yb + yc + yd */
1754         pSrc16[i0 * 2u] = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
1755         pSrc16[(i0 * 2u) + 1u] = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
1756
1757         /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each term halved */
1758         R0 = (R0 >> 1u) - (T0 >> 1u);
1759         R1 = (R1 >> 1u) - (T1 >> 1u);
1760
1761         /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1762         out1 = (short) ((Co2 * R0 - Si2 * R1) >> 16);
1763         /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1764         out2 = (short) ((Si2 * R0 + Co2 * R1) >> 16);
1765
1766         /*  Reading i1 again, before it is overwritten below */
1767         /* Read xb (real), yb (imag) input */
1768         T0 = pSrc16[i1 * 2u];
1769         T1 = pSrc16[(i1 * 2u) + 1u];
1770
1771         /*  writing the butterfly processed i1 sample */
1772         /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1773         /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1774         pSrc16[i1 * 2u] = out1;
1775         pSrc16[(i1 * 2u) + 1u] = out2;
1776
1777         /*  Butterfly calculations */
1778         /* Read xd (real), yd (imag) input */
1779         U0 = pSrc16[i3 * 2u];
1780         U1 = pSrc16[(i3 * 2u) + 1u];
1781
1782         /* T0 = xb-xd, T1 = yb-yd */
1783         T0 = __SSAT(T0 - U0, 16u);
1784         T1 = __SSAT(T1 - U1, 16u);
1785
1786         /* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb- xd), each term halved */
1787         R0 = (S0 >> 1u) + (T1 >> 1u);
1788         R1 = (S1 >> 1u) - (T0 >> 1u);
1789
1790         /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb- xd), each term halved */
1791         S0 = (S0 >> 1u) - (T1 >> 1u);
1792         S1 = (S1 >> 1u) + (T0 >> 1u);
1793
1794         /*  Butterfly process for the i2 sample */
1795         out1 = (short) ((Co1 * S0 - Si1 * S1) >> 16u);
1796         out2 = (short) ((Si1 * S0 + Co1 * S1) >> 16u);
1797         /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1798         /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1799         pSrc16[i2 * 2u] = out1;
1800         pSrc16[(i2 * 2u) + 1u] = out2;
1801
1802         /*  Butterfly process for the i3 sample */
1803         out1 = (short) ((Co3 * R0 - Si3 * R1) >> 16u);
1804
1805         out2 = (short) ((Si3 * R0 + Co3 * R1) >> 16u);
1806         /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1807         /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1808         pSrc16[i3 * 2u] = out1;
1809         pSrc16[(i3 * 2u) + 1u] = out2;
1810
1811
1812       }
1813     }
1814     /*  Twiddle coefficients index modifier */
1815     twidCoefModifier <<= 2u;
1816   }
1817   /*  End of Middle stages process */
1818
1819
1820   /* data is in 10.6(q6) format for the 1024 point */
1821   /* data is in 8.8(q8) format for the 256 point   */
1822   /* data is in 6.10(q10) format for the 64 point  */
1823   /* data is in 4.12(q12) format for the 16 point  */
1824
1825   /* start of last stage process (no twiddle multiplications) */
1826
1827
1828   /*  Initializations for the last stage */
1829   n1 = n2;
1830   n2 >>= 2u;
1831
1832   /*  Butterfly implementation */
1833   for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
1834   {
1835     /*  index calculation for the input as, */
1836     /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
1837     i1 = i0 + n2;
1838     i2 = i1 + n2;
1839     i3 = i2 + n2;
1840
1841     /*  Reading i0, i2 inputs */
1842     /* Read xa (real), ya (imag) input */
1843     T0 = pSrc16[i0 * 2u];
1844     T1 = pSrc16[(i0 * 2u) + 1u];
1845     /* Read xc (real), yc (imag) input */
1846     S0 = pSrc16[i2 * 2u];
1847     S1 = pSrc16[(i2 * 2u) + 1u];
1848
1849     /* R0 = (xa + xc), R1 = (ya + yc) */
1850     R0 = __SSAT(T0 + S0, 16u);
1851     R1 = __SSAT(T1 + S1, 16u);
1852     /* S0 = (xa - xc), S1 = (ya - yc) */
1853     S0 = __SSAT(T0 - S0, 16u);
1854     S1 = __SSAT(T1 - S1, 16u);
1855
1856     /*  Reading i1, i3 inputs */
1857     /* Read xb (real), yb (imag) input */
1858     T0 = pSrc16[i1 * 2u];
1859     T1 = pSrc16[(i1 * 2u) + 1u];
1860     /* Read xd (real), yd (imag) input */
1861     U0 = pSrc16[i3 * 2u];
1862     U1 = pSrc16[(i3 * 2u) + 1u];
1863
1864     /* T0 = (xb + xd), T1 = (yb + yd) */
1865     T0 = __SSAT(T0 + U0, 16u);
1866     T1 = __SSAT(T1 + U1, 16u);
1867
1868     /*  writing the butterfly processed i0 sample */
1869     /* xa' = xa + xb + xc + xd */
1870     /* ya' = ya + yb + yc + yd */
1871     pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
1872     pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
1873
1874     /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each term halved */
1875     R0 = (R0 >> 1u) - (T0 >> 1u);
1876     R1 = (R1 >> 1u) - (T1 >> 1u);
1877
1878     /* Read xb (real), yb (imag) input again, before i1 is overwritten below */
1879     T0 = pSrc16[i1 * 2u];
1880     T1 = pSrc16[(i1 * 2u) + 1u];
1881
1882     /*  writing the butterfly processed i1 sample */
1883     /* xc' = (xa-xb+xc-xd) */
1884     /* yc' = (ya-yb+yc-yd) */
1885     pSrc16[i1 * 2u] = R0;
1886     pSrc16[(i1 * 2u) + 1u] = R1;
1887
1888     /* Read xd (real), yd (imag) input */
1889     U0 = pSrc16[i3 * 2u];
1890     U1 = pSrc16[(i3 * 2u) + 1u];
1891     /* T0 = (xb - xd), T1 = (yb - yd) */
1892     T0 = __SSAT(T0 - U0, 16u);
1893     T1 = __SSAT(T1 - U1, 16u);
1894
1895     /*  writing the butterfly processed i2 sample */
1896     /* xb' = (xa-yb-xc+yd) */
1897     /* yb' = (ya+xb-yc-xd) */
1898     pSrc16[i2 * 2u] = (S0 >> 1u) - (T1 >> 1u);
1899     pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
1900
1901
1902     /*  writing the butterfly processed i3 sample */
1903     /* xd' = (xa+yb-xc-yd) */
1904     /* yd' = (ya-xb-yc+xd) */
1905     pSrc16[i3 * 2u] = (S0 >> 1u) + (T1 >> 1u);
1906     pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
1907   }
1908   /* end of last stage  process */
1909
1910   /* output is in 11.5(q5) format for the 1024 point */
1911   /* output is in 9.7(q7) format for the 256 point   */
1912   /* output is in 7.9(q9) format for the 64 point  */
1913   /* output is in 5.11(q11) format for the 16 point  */
1914
1915 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
1916
1917 }