tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_fir_decimate_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_fir_decimate_fast_q31.c
   9 *
  10 * Description:  Fast Q31 FIR Decimator.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup FIR_decimate
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
  54  * @param[in] *S points to an instance of the Q31 FIR decimator structure.
  55  * @param[in] *pSrc points to the block of input data.
  56  * @param[out] *pDst points to the block of output data
  57  * @param[in] blockSize number of input samples to process per call.
  58  * @return none
  59  *
  60  * <b>Scaling and Overflow Behavior:</b>
  61  *
  62  * \par
  63  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
  64  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
  65  * These intermediate results are added to a 2.30 accumulator.
  66  * Finally, the accumulator is saturated and converted to a 1.31 result.
  67  * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
  68  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
  69  *
  70  * \par
  71  * Refer to the function <code>arm_fir_decimate_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
  72  * Both the slow and the fast versions use the same instance structure.
  73  * Use the function <code>arm_fir_decimate_init_q31()</code> to initialize the filter structure.
  74  */
  75
  76 void arm_fir_decimate_fast_q31(
  77   arm_fir_decimate_instance_q31 * S,
  78   q31_t * pSrc,
  79   q31_t * pDst,
  80   uint32_t blockSize)
  81 {
  82   q31_t *pState = S->pState;                     /* State pointer */
  83   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  84   q31_t *pStateCurnt;                            /* Points to the current sample of the state */
  85   q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
  86   q31_t *px;                                     /* Temporary pointers for state buffer */
  87   q31_t *pb;                                     /* Temporary pointers for coefficient buffer */
  88   q31_t sum0;                                    /* Accumulator */
  89   uint32_t numTaps = S->numTaps;                 /* Number of taps */
  90   uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
  91   uint32_t blkCntN2;
  92   q31_t x1;
  93   q31_t acc0, acc1;
  94   q31_t *px0, *px1;
  95
  96   /* S->pState buffer contains previous frame (numTaps - 1) samples */
  97   /* pStateCurnt points to the location where the new input data should be written */
  98   pStateCurnt = S->pState + (numTaps - 1u);
  99
 100   /* Total number of output samples to be computed */
 101
 102   blkCnt = outBlockSize / 2;
 103   blkCntN2 = outBlockSize - (2 * blkCnt);
 104
 105   while(blkCnt > 0u)
 106   {
 107     /* Copy decimation factor number of new input samples into the state buffer */
 108     i = 2 * S->M;
 109
 110     do
 111     {
 112       *pStateCurnt++ = *pSrc++;
 113
 114     } while(--i);
 115
 116     /* Set accumulator to zero */
 117     acc0 = 0;
 118     acc1 = 0;
 119
 120     /* Initialize state pointer */
 121     px0 = pState;
 122     px1 = pState + S->M;
 123
 124     /* Initialize coeff pointer */
 125     pb = pCoeffs;
 126
 127     /* Loop unrolling.  Process 4 taps at a time. */
 128     tapCnt = numTaps >> 2;
 129
 130     /* Loop over the number of taps.  Unroll by a factor of 4.
 131      ** Repeat until we've computed numTaps-4 coefficients. */
 132     while(tapCnt > 0u)
 133     {
 134       /* Read the b[numTaps-1] coefficient */
 135       c0 = *(pb);
 136
 137       /* Read x[n-numTaps-1] for sample 0 sample 1 */
 138       x0 = *(px0);
 139       x1 = *(px1);
 140
 141       /* Perform the multiply-accumulate */
 142       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 143       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 144
 145       /* Read the b[numTaps-2] coefficient */
 146       c0 = *(pb + 1u);
 147
 148       /* Read x[n-numTaps-2]  for sample 0 sample 1  */
 149       x0 = *(px0 + 1u);
 150       x1 = *(px1 + 1u);
 151
 152       /* Perform the multiply-accumulate */
 153       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 154       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 155
 156       /* Read the b[numTaps-3] coefficient */
 157       c0 = *(pb + 2u);
 158
 159       /* Read x[n-numTaps-3]  for sample 0 sample 1 */
 160       x0 = *(px0 + 2u);
 161       x1 = *(px1 + 2u);
 162       pb += 4u;
 163
 164       /* Perform the multiply-accumulate */
 165       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 166       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 167
 168       /* Read the b[numTaps-4] coefficient */
 169       c0 = *(pb - 1u);
 170
 171       /* Read x[n-numTaps-4] for sample 0 sample 1 */
 172       x0 = *(px0 + 3u);
 173       x1 = *(px1 + 3u);
 174
 175
 176       /* Perform the multiply-accumulate */
 177       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 178       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 179
 180       /* update state pointers */
 181       px0 += 4u;
 182       px1 += 4u;
 183
 184       /* Decrement the loop counter */
 185       tapCnt--;
 186     }
 187
 188     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
 189     tapCnt = numTaps % 0x4u;
 190
 191     while(tapCnt > 0u)
 192     {
 193       /* Read coefficients */
 194       c0 = *(pb++);
 195
 196       /* Fetch 1 state variable */
 197       x0 = *(px0++);
 198       x1 = *(px1++);
 199
 200       /* Perform the multiply-accumulate */
 201       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 202       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 203
 204       /* Decrement the loop counter */
 205       tapCnt--;
 206     }
 207
 208     /* Advance the state pointer by the decimation factor
 209      * to process the next group of decimation factor number samples */
 210     pState = pState + S->M * 2;
 211
 212     /* The result is in the accumulator, store in the destination buffer. */
 213     *pDst++ = (q31_t) (acc0 << 1);
 214     *pDst++ = (q31_t) (acc1 << 1);
 215
 216     /* Decrement the loop counter */
 217     blkCnt--;
 218   }
 219
 220   while(blkCntN2 > 0u)
 221   {
 222     /* Copy decimation factor number of new input samples into the state buffer */
 223     i = S->M;
 224
 225     do
 226     {
 227       *pStateCurnt++ = *pSrc++;
 228
 229     } while(--i);
 230
 231     /* Set accumulator to zero */
 232     sum0 = 0;
 233
 234     /* Initialize state pointer */
 235     px = pState;
 236
 237     /* Initialize coeff pointer */
 238     pb = pCoeffs;
 239
 240     /* Loop unrolling.  Process 4 taps at a time. */
 241     tapCnt = numTaps >> 2;
 242
 243     /* Loop over the number of taps.  Unroll by a factor of 4.
 244      ** Repeat until we've computed numTaps-4 coefficients. */
 245     while(tapCnt > 0u)
 246     {
 247       /* Read the b[numTaps-1] coefficient */
 248       c0 = *(pb++);
 249
 250       /* Read x[n-numTaps-1] sample */
 251       x0 = *(px++);
 252
 253       /* Perform the multiply-accumulate */
 254       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
 255
 256       /* Read the b[numTaps-2] coefficient */
 257       c0 = *(pb++);
 258
 259       /* Read x[n-numTaps-2] sample */
 260       x0 = *(px++);
 261
 262       /* Perform the multiply-accumulate */
 263       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
 264
 265       /* Read the b[numTaps-3] coefficient */
 266       c0 = *(pb++);
 267
 268       /* Read x[n-numTaps-3] sample */
 269       x0 = *(px++);
 270
 271       /* Perform the multiply-accumulate */
 272       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
 273
 274       /* Read the b[numTaps-4] coefficient */
 275       c0 = *(pb++);
 276
 277       /* Read x[n-numTaps-4] sample */
 278       x0 = *(px++);
 279
 280       /* Perform the multiply-accumulate */
 281       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
 282
 283       /* Decrement the loop counter */
 284       tapCnt--;
 285     }
 286
 287     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
 288     tapCnt = numTaps % 0x4u;
 289
 290     while(tapCnt > 0u)
 291     {
 292       /* Read coefficients */
 293       c0 = *(pb++);
 294
 295       /* Fetch 1 state variable */
 296       x0 = *(px++);
 297
 298       /* Perform the multiply-accumulate */
 299       sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
 300
 301       /* Decrement the loop counter */
 302       tapCnt--;
 303     }
 304
 305     /* Advance the state pointer by the decimation factor
 306      * to process the next group of decimation factor number samples */
 307     pState = pState + S->M;
 308
 309     /* The result is in the accumulator, store in the destination buffer. */
 310     *pDst++ = (q31_t) (sum0 << 1);
 311
 312     /* Decrement the loop counter */
 313     blkCntN2--;
 314   }
 315
 316   /* Processing is complete.
 317    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
 318    ** This prepares the state buffer for the next function call. */
 319
 320   /* Points to the start of the state buffer */
 321   pStateCurnt = S->pState;
 322
 323   i = (numTaps - 1u) >> 2u;
 324
 325   /* copy data */
 326   while(i > 0u)
 327   {
 328     *pStateCurnt++ = *pState++;
 329     *pStateCurnt++ = *pState++;
 330     *pStateCurnt++ = *pState++;
 331     *pStateCurnt++ = *pState++;
 332
 333     /* Decrement the loop counter */
 334     i--;
 335   }
 336
 337   i = (numTaps - 1u) % 0x04u;
 338
 339   /* copy data */
 340   while(i > 0u)
 341   {
 342     *pStateCurnt++ = *pState++;
 343
 344     /* Decrement the loop counter */
 345     i--;
 346   }
 347 }
 348
 349 /**
 350  * @} end of FIR_decimate group
 351  */