tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_fir_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_fir_fast_q31.c
   9 *
  10 * Description:  Processing function for the Q31 Fast FIR filter.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup FIR
  49  * @{
  50  */
  51
  52 /**
  53  * @param[in] *S points to an instance of the Q31 structure.
  54  * @param[in] *pSrc points to the block of input data.
  55  * @param[out] *pDst points to the block output data.
  56  * @param[in] blockSize number of samples to process per call.
  57  * @return none.
  58  *
  59  * <b>Scaling and Overflow Behavior:</b>
  60  *
  61  * \par
  62  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
  63  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
  64  * These intermediate results are added to a 2.30 accumulator.
  65  * Finally, the accumulator is saturated and converted to a 1.31 result.
  66  * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
  67  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
  68  *
  69  * \par
  70  * Refer to the function <code>arm_fir_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.  Both the slow and the fast versions use the same instance structure.
  71  * Use the function <code>arm_fir_init_q31()</code> to initialize the filter structure.
  72  */
  73
  74 IAR_ONLY_LOW_OPTIMIZATION_ENTER
  75 void arm_fir_fast_q31(
  76   const arm_fir_instance_q31 * S,
  77   q31_t * pSrc,
  78   q31_t * pDst,
  79   uint32_t blockSize)
  80 {
  81   q31_t *pState = S->pState;                     /* State pointer */
  82   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  83   q31_t *pStateCurnt;                            /* Points to the current sample of the state */
  84   q31_t x0, x1, x2, x3;                          /* Temporary variables to hold state */
  85   q31_t c0;                                      /* Temporary variable to hold coefficient value */
  86   q31_t *px;                                     /* Temporary pointer for state */
  87   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */
  88   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
  89   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  90   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
  91
  92   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
  93   /* pStateCurnt points to the location where the new input data should be written */
  94   pStateCurnt = &(S->pState[(numTaps - 1u)]);
  95
  96   /* Apply loop unrolling and compute 4 output values simultaneously.
  97    * The variables acc0 ... acc3 hold output values that are being computed:
  98    *
  99    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
 100    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
 101    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
 102    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
 103    */
 104   blkCnt = blockSize >> 2;
 105
 106   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
 107    ** a second loop below computes the remaining 1 to 3 samples. */
 108   while(blkCnt > 0u)
 109   {
 110     /* Copy four new input samples into the state buffer */
 111     *pStateCurnt++ = *pSrc++;
 112     *pStateCurnt++ = *pSrc++;
 113     *pStateCurnt++ = *pSrc++;
 114     *pStateCurnt++ = *pSrc++;
 115
 116     /* Set all accumulators to zero */
 117     acc0 = 0;
 118     acc1 = 0;
 119     acc2 = 0;
 120     acc3 = 0;
 121
 122     /* Initialize state pointer */
 123     px = pState;
 124
 125     /* Initialize coefficient pointer */
 126     pb = pCoeffs;
 127
 128     /* Read the first three samples from the state buffer:
 129      *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
 130     x0 = *(px++);
 131     x1 = *(px++);
 132     x2 = *(px++);
 133
 134     /* Loop unrolling.  Process 4 taps at a time. */
 135     tapCnt = numTaps >> 2;
 136     i = tapCnt;
 137
 138     while(i > 0u)
 139     {
 140       /* Read the b[numTaps] coefficient */
 141       c0 = *(pb++);
 142
 143       /* Read x[n-numTaps-3] sample */
 144       x3 = *(px++);
 145
 146       /* acc0 +=  b[numTaps] * x[n-numTaps] */
 147       multAcc_32x32_keep32_R(acc0, x0, c0);
 148
 149       /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
 150       multAcc_32x32_keep32_R(acc1, x1, c0);
 151
 152       /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
 153       multAcc_32x32_keep32_R(acc2, x2, c0);
 154
 155       /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
 156       multAcc_32x32_keep32_R(acc3, x3, c0);
 157
 158       /* Read the b[numTaps-1] coefficient */
 159       c0 = *(pb++);
 160
 161       /* Read x[n-numTaps-4] sample */
 162       x0 = *(px++);
 163
 164       /* Perform the multiply-accumulates */
 165       multAcc_32x32_keep32_R(acc0, x1, c0);
 166       multAcc_32x32_keep32_R(acc1, x2, c0);
 167       multAcc_32x32_keep32_R(acc2, x3, c0);
 168       multAcc_32x32_keep32_R(acc3, x0, c0);
 169
 170       /* Read the b[numTaps-2] coefficient */
 171       c0 = *(pb++);
 172
 173       /* Read x[n-numTaps-5] sample */
 174       x1 = *(px++);
 175
 176       /* Perform the multiply-accumulates */
 177       multAcc_32x32_keep32_R(acc0, x2, c0);
 178       multAcc_32x32_keep32_R(acc1, x3, c0);
 179       multAcc_32x32_keep32_R(acc2, x0, c0);
 180       multAcc_32x32_keep32_R(acc3, x1, c0);
 181
 182       /* Read the b[numTaps-3] coefficients */
 183       c0 = *(pb++);
 184
 185       /* Read x[n-numTaps-6] sample */
 186       x2 = *(px++);
 187
 188       /* Perform the multiply-accumulates */
 189       multAcc_32x32_keep32_R(acc0, x3, c0);
 190       multAcc_32x32_keep32_R(acc1, x0, c0);
 191       multAcc_32x32_keep32_R(acc2, x1, c0);
 192       multAcc_32x32_keep32_R(acc3, x2, c0);
 193       i--;
 194     }
 195
 196     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
 197
 198     i = numTaps - (tapCnt * 4u);
 199     while(i > 0u)
 200     {
 201       /* Read coefficients */
 202       c0 = *(pb++);
 203
 204       /* Fetch 1 state variable */
 205       x3 = *(px++);
 206
 207       /* Perform the multiply-accumulates */
 208       multAcc_32x32_keep32_R(acc0, x0, c0);
 209       multAcc_32x32_keep32_R(acc1, x1, c0);
 210       multAcc_32x32_keep32_R(acc2, x2, c0);
 211       multAcc_32x32_keep32_R(acc3, x3, c0);
 212
 213       /* Reuse the present sample states for next sample */
 214       x0 = x1;
 215       x1 = x2;
 216       x2 = x3;
 217
 218       /* Decrement the loop counter */
 219       i--;
 220     }
 221
 222     /* Advance the state pointer by 4 to process the next group of 4 samples */
 223     pState = pState + 4;
 224
 225     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.31
 226      ** Then store the 4 outputs in the destination buffer. */
 227     *pDst++ = (q31_t) (acc0 << 1);
 228     *pDst++ = (q31_t) (acc1 << 1);
 229     *pDst++ = (q31_t) (acc2 << 1);
 230     *pDst++ = (q31_t) (acc3 << 1);
 231
 232     /* Decrement the samples loop counter */
 233     blkCnt--;
 234   }
 235
 236
 237   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
 238    ** No loop unrolling is used. */
 239   blkCnt = blockSize % 4u;
 240
 241   while(blkCnt > 0u)
 242   {
 243     /* Copy one sample at a time into state buffer */
 244     *pStateCurnt++ = *pSrc++;
 245
 246     /* Set the accumulator to zero */
 247     acc0 = 0;
 248
 249     /* Initialize state pointer */
 250     px = pState;
 251
 252     /* Initialize Coefficient pointer */
 253     pb = (pCoeffs);
 254
 255     i = numTaps;
 256
 257     /* Perform the multiply-accumulates */
 258     do
 259     {
 260       multAcc_32x32_keep32_R(acc0, (*px++), (*(pb++)));
 261       i--;
 262     } while(i > 0u);
 263
 264     /* The result is in 2.30 format.  Convert to 1.31
 265      ** Then store the output in the destination buffer. */
 266     *pDst++ = (q31_t) (acc0 << 1);
 267
 268     /* Advance state pointer by 1 for the next sample */
 269     pState = pState + 1;
 270
 271     /* Decrement the samples loop counter */
 272     blkCnt--;
 273   }
 274
 275   /* Processing is complete.
 276    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
 277    ** This prepares the state buffer for the next function call. */
 278
 279   /* Points to the start of the state buffer */
 280   pStateCurnt = S->pState;
 281
 282   /* Calculate remaining number of copies */
 283   tapCnt = (numTaps - 1u);
 284
 285   /* Copy the remaining q31_t data */
 286   while(tapCnt > 0u)
 287   {
 288     *pStateCurnt++ = *pState++;
 289
 290     /* Decrement the loop counter */
 291     tapCnt--;
 292   }
 293
 294
 295 }
 296 IAR_ONLY_LOW_OPTIMIZATION_EXIT
 297 /**
 298  * @} end of FIR group
 299  */