tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/TransformFunctions/arm_dct4_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_dct4_q15.c
   9 *
  10 * Description:  Processing function of DCT4 & IDCT4 Q15.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @addtogroup DCT4_IDCT4
  45  * @{
  46  */
  47
  48 /**
  49  * @brief Processing function for the Q15 DCT4/IDCT4.
  50  * @param[in]       *S             points to an instance of the Q15 DCT4 structure.
  51  * @param[in]       *pState        points to state buffer.
  52  * @param[in,out]   *pInlineBuffer points to the in-place input and output buffer.
  53  * @return none.
  54  *
  55  * \par Input an output formats:
  56  * Internally inputs are downscaled in the RFFT process function to avoid overflows.
  57  * Number of bits downscaled, depends on the size of the transform.
  58  * The input and output formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
  59  *
  60  * \image html dct4FormatsQ15Table.gif
  61  */
  62
  63 void arm_dct4_q15(
  64   const arm_dct4_instance_q15 * S,
  65   q15_t * pState,
  66   q15_t * pInlineBuffer)
  67 {
  68   uint32_t i;                                    /* Loop counter */
  69   q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
  70   q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
  71   q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
  72   q15_t in;                                      /* Temporary variable */
  73
  74
  75   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
  76    * along with some pre-processing and post-processing.
  77    * Computational procedure is explained as follows:
  78    * (a) Pre-processing involves multiplying input with cos factor,
  79    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
  80    *              where,
  81    *                 r(n) -- output of preprocessing
  82    *                 u(n) -- input to preprocessing(actual Source buffer)
  83    * (b) Calculation of DCT2 using FFT is divided into three steps:
  84    *                  Step1: Re-ordering of even and odd elements of input.
  85    *                  Step2: Calculating FFT of the re-ordered input.
  86    *                  Step3: Taking the real part of the product of FFT output and weights.
  87    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
  88    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
  89    *                        where,
  90    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
  91    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
  92    */
  93
  94         /*-------- Pre-processing ------------*/
  95   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
  96   arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N);
  97   arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N);
  98
  99   /* ----------------------------------------------------------------
 100    * Step1: Re-ordering of even and odd elements as
 101    *             pState[i] =  pInlineBuffer[2*i] and
 102    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
 103    ---------------------------------------------------------------------*/
 104
 105   /* pS1 initialized to pState */
 106   pS1 = pState;
 107
 108   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
 109   pS2 = pState + (S->N - 1u);
 110
 111   /* pbuff initialized to input buffer */
 112   pbuff = pInlineBuffer;
 113
 114
 115 #ifndef ARM_MATH_CM0_FAMILY
 116
 117   /* Run the below code for Cortex-M4 and Cortex-M3 */
 118
 119   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
 120   i = (uint32_t) S->Nby2 >> 2u;
 121
 122   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 123    ** a second loop below computes the remaining 1 to 3 samples. */
 124   do
 125   {
 126     /* Re-ordering of even and odd elements */
 127     /* pState[i] =  pInlineBuffer[2*i] */
 128     *pS1++ = *pbuff++;
 129     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
 130     *pS2-- = *pbuff++;
 131
 132     *pS1++ = *pbuff++;
 133     *pS2-- = *pbuff++;
 134
 135     *pS1++ = *pbuff++;
 136     *pS2-- = *pbuff++;
 137
 138     *pS1++ = *pbuff++;
 139     *pS2-- = *pbuff++;
 140
 141     /* Decrement the loop counter */
 142     i--;
 143   } while(i > 0u);
 144
 145   /* pbuff initialized to input buffer */
 146   pbuff = pInlineBuffer;
 147
 148   /* pS1 initialized to pState */
 149   pS1 = pState;
 150
 151   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
 152   i = (uint32_t) S->N >> 2u;
 153
 154   /* Processing with loop unrolling 4 times as N is always multiple of 4.
 155    * Compute 4 outputs at a time */
 156   do
 157   {
 158     /* Writing the re-ordered output back to inplace input buffer */
 159     *pbuff++ = *pS1++;
 160     *pbuff++ = *pS1++;
 161     *pbuff++ = *pS1++;
 162     *pbuff++ = *pS1++;
 163
 164     /* Decrement the loop counter */
 165     i--;
 166   } while(i > 0u);
 167
 168
 169   /* ---------------------------------------------------------
 170    *     Step2: Calculate RFFT for N-point input
 171    * ---------------------------------------------------------- */
 172   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
 173   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
 174
 175  /*----------------------------------------------------------------------
 176   *  Step3: Multiply the FFT output with the weights.
 177   *----------------------------------------------------------------------*/
 178   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
 179
 180   /* The output of complex multiplication is in 3.13 format.
 181    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
 182   arm_shift_q15(pState, 2, pState, S->N * 2);
 183
 184   /* ----------- Post-processing ---------- */
 185   /* DCT-IV can be obtained from DCT-II by the equation,
 186    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 187    *       Hence, Y4(0) = Y2(0)/2  */
 188   /* Getting only real part from the output and Converting to DCT-IV */
 189
 190   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
 191   i = ((uint32_t) S->N - 1u) >> 2u;
 192
 193   /* pbuff initialized to input buffer. */
 194   pbuff = pInlineBuffer;
 195
 196   /* pS1 initialized to pState */
 197   pS1 = pState;
 198
 199   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
 200   in = *pS1++ >> 1u;
 201   /* input buffer acts as inplace, so output values are stored in the input itself. */
 202   *pbuff++ = in;
 203
 204   /* pState pointer is incremented twice as the real values are located alternatively in the array */
 205   pS1++;
 206
 207   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 208    ** a second loop below computes the remaining 1 to 3 samples. */
 209   do
 210   {
 211     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 212     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 213     in = *pS1++ - in;
 214     *pbuff++ = in;
 215     /* points to the next real value */
 216     pS1++;
 217
 218     in = *pS1++ - in;
 219     *pbuff++ = in;
 220     pS1++;
 221
 222     in = *pS1++ - in;
 223     *pbuff++ = in;
 224     pS1++;
 225
 226     in = *pS1++ - in;
 227     *pbuff++ = in;
 228     pS1++;
 229
 230     /* Decrement the loop counter */
 231     i--;
 232   } while(i > 0u);
 233
 234   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 235    ** No loop unrolling is used. */
 236   i = ((uint32_t) S->N - 1u) % 0x4u;
 237
 238   while(i > 0u)
 239   {
 240     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 241     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 242     in = *pS1++ - in;
 243     *pbuff++ = in;
 244     /* points to the next real value */
 245     pS1++;
 246
 247     /* Decrement the loop counter */
 248     i--;
 249   }
 250
 251
 252    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
 253
 254   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
 255   i = (uint32_t) S->N >> 2u;
 256
 257   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
 258   pbuff = pInlineBuffer;
 259
 260   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
 261   do
 262   {
 263     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
 264     in = *pbuff;
 265     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 266
 267     in = *pbuff;
 268     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 269
 270     in = *pbuff;
 271     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 272
 273     in = *pbuff;
 274     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 275
 276     /* Decrement the loop counter */
 277     i--;
 278   } while(i > 0u);
 279
 280
 281 #else
 282
 283   /* Run the below code for Cortex-M0 */
 284
 285   /* Initializing the loop counter to N/2 */
 286   i = (uint32_t) S->Nby2;
 287
 288   do
 289   {
 290     /* Re-ordering of even and odd elements */
 291     /* pState[i] =  pInlineBuffer[2*i] */
 292     *pS1++ = *pbuff++;
 293     /* pState[N-i-1] = pInlineBuffer[2*i+1] */
 294     *pS2-- = *pbuff++;
 295
 296     /* Decrement the loop counter */
 297     i--;
 298   } while(i > 0u);
 299
 300   /* pbuff initialized to input buffer */
 301   pbuff = pInlineBuffer;
 302
 303   /* pS1 initialized to pState */
 304   pS1 = pState;
 305
 306   /* Initializing the loop counter */
 307   i = (uint32_t) S->N;
 308
 309   do
 310   {
 311     /* Writing the re-ordered output back to inplace input buffer */
 312     *pbuff++ = *pS1++;
 313
 314     /* Decrement the loop counter */
 315     i--;
 316   } while(i > 0u);
 317
 318
 319   /* ---------------------------------------------------------
 320    *     Step2: Calculate RFFT for N-point input
 321    * ---------------------------------------------------------- */
 322   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
 323   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
 324
 325  /*----------------------------------------------------------------------
 326   *  Step3: Multiply the FFT output with the weights.
 327   *----------------------------------------------------------------------*/
 328   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
 329
 330   /* The output of complex multiplication is in 3.13 format.
 331    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
 332   arm_shift_q15(pState, 2, pState, S->N * 2);
 333
 334   /* ----------- Post-processing ---------- */
 335   /* DCT-IV can be obtained from DCT-II by the equation,
 336    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 337    *       Hence, Y4(0) = Y2(0)/2  */
 338   /* Getting only real part from the output and Converting to DCT-IV */
 339
 340   /* Initializing the loop counter */
 341   i = ((uint32_t) S->N - 1u);
 342
 343   /* pbuff initialized to input buffer. */
 344   pbuff = pInlineBuffer;
 345
 346   /* pS1 initialized to pState */
 347   pS1 = pState;
 348
 349   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
 350   in = *pS1++ >> 1u;
 351   /* input buffer acts as inplace, so output values are stored in the input itself. */
 352   *pbuff++ = in;
 353
 354   /* pState pointer is incremented twice as the real values are located alternatively in the array */
 355   pS1++;
 356
 357   do
 358   {
 359     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
 360     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
 361     in = *pS1++ - in;
 362     *pbuff++ = in;
 363     /* points to the next real value */
 364     pS1++;
 365
 366     /* Decrement the loop counter */
 367     i--;
 368   } while(i > 0u);
 369
 370    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
 371
 372   /* Initializing the loop counter */
 373   i = (uint32_t) S->N;
 374
 375   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
 376   pbuff = pInlineBuffer;
 377
 378   do
 379   {
 380     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
 381     in = *pbuff;
 382     *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
 383
 384     /* Decrement the loop counter */
 385     i--;
 386   } while(i > 0u);
 387
 388 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
 389
 390 }
 391
 392 /**
 393    * @} end of DCT4_IDCT4 group
 394    */