tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_fir_decimate_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:            arm_fir_decimate_fast_q15.c
   9 *
  10 * Description:  Fast Q15 FIR Decimator.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup FIR_decimate
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
  54  * @param[in] *S points to an instance of the Q15 FIR decimator structure.
  55  * @param[in] *pSrc points to the block of input data.
  56  * @param[out] *pDst points to the block of output data
  57  * @param[in] blockSize number of input samples to process per call.
  58  * @return none
  59  *
  60  * \par Restrictions
   61  *  If the silicon does not support unaligned memory access, define the macro UNALIGNED_SUPPORT_DISABLE.
   62  *  In that case the input, output and state buffers should be 32-bit aligned.
  63  *
  64  * <b>Scaling and Overflow Behavior:</b>
  65  * \par
  66  * This fast version uses a 32-bit accumulator with 2.30 format.
  67  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  68  * Thus, if the accumulator result overflows it wraps around and distorts the result.
  69  * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
  70  * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
  71  *
  72  * \par
  73  * Refer to the function <code>arm_fir_decimate_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
  74  * Both the slow and the fast versions use the same instance structure.
  75  * Use the function <code>arm_fir_decimate_init_q15()</code> to initialize the filter structure.
  76  */
  77
  78 #ifndef UNALIGNED_SUPPORT_DISABLE
  79
  80 void arm_fir_decimate_fast_q15(
  81   const arm_fir_decimate_instance_q15 * S,
  82   q15_t * pSrc,
  83   q15_t * pDst,
  84   uint32_t blockSize)
  85 {
  86   q15_t *pState = S->pState;                     /* State pointer */
  87   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  88   q15_t *pStateCurnt;                            /* Points to the current sample of the state */
  89   q15_t *px;                                     /* Temporary pointer for state buffer */
  90   q15_t *pb;                                     /* Temporary pointer coefficient buffer */
  91   q31_t x0, x1, c0, c1;                          /* Temporary variables to hold state and coefficient values */
  92   q31_t sum0;                                    /* Accumulators */
  93   q31_t acc0, acc1;
  94   q15_t *px0, *px1;
  95   uint32_t blkCntN3;
  96   uint32_t numTaps = S->numTaps;                 /* Number of taps */
  97   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
  98
  99
 100   /* S->pState buffer contains previous frame (numTaps - 1) samples */
 101   /* pStateCurnt points to the location where the new input data should be written */
 102   pStateCurnt = S->pState + (numTaps - 1u);
 103
 104
 105   /* Total number of output samples to be computed */
 106   blkCnt = outBlockSize / 2;
 107   blkCntN3 = outBlockSize - (2 * blkCnt);
 108
 109
 110   while(blkCnt > 0u)
 111   {
 112     /* Copy decimation factor number of new input samples into the state buffer */
 113     i = 2 * S->M;
 114
 115     do
 116     {
 117       *pStateCurnt++ = *pSrc++;
 118
 119     } while(--i);
 120
 121     /* Set accumulator to zero */
 122     acc0 = 0;
 123     acc1 = 0;
 124
 125     /* Initialize state pointer */
 126     px0 = pState;
 127
 128     px1 = pState + S->M;
 129
 130
 131     /* Initialize coeff pointer */
 132     pb = pCoeffs;
 133
 134     /* Loop unrolling.  Process 4 taps at a time. */
 135     tapCnt = numTaps >> 2;
 136
 137     /* Loop over the number of taps.  Unroll by a factor of 4.
 138      ** Repeat until we've computed numTaps-4 coefficients. */
 139     while(tapCnt > 0u)
 140     {
 141       /* Read the Read b[numTaps-1] and b[numTaps-2]  coefficients */
 142       c0 = *__SIMD32(pb)++;
 143
 144       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
 145       x0 = *__SIMD32(px0)++;
 146
 147       x1 = *__SIMD32(px1)++;
 148
 149       /* Perform the multiply-accumulate */
 150       acc0 = __SMLAD(x0, c0, acc0);
 151
 152       acc1 = __SMLAD(x1, c0, acc1);
 153
 154       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
 155       c0 = *__SIMD32(pb)++;
 156
 157       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
 158       x0 = *__SIMD32(px0)++;
 159
 160       x1 = *__SIMD32(px1)++;
 161
 162       /* Perform the multiply-accumulate */
 163       acc0 = __SMLAD(x0, c0, acc0);
 164
 165       acc1 = __SMLAD(x1, c0, acc1);
 166
 167       /* Decrement the loop counter */
 168       tapCnt--;
 169     }
 170
 171     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
 172     tapCnt = numTaps % 0x4u;
 173
 174     while(tapCnt > 0u)
 175     {
 176       /* Read coefficients */
 177       c0 = *pb++;
 178
 179       /* Fetch 1 state variable */
 180       x0 = *px0++;
 181
 182       x1 = *px1++;
 183
 184       /* Perform the multiply-accumulate */
 185       acc0 = __SMLAD(x0, c0, acc0);
 186       acc1 = __SMLAD(x1, c0, acc1);
 187
 188       /* Decrement the loop counter */
 189       tapCnt--;
 190     }
 191
 192     /* Advance the state pointer by the decimation factor
 193      * to process the next group of decimation factor number samples */
 194     pState = pState + S->M * 2;
 195
 196     /* Store filter output, smlad returns the values in 2.14 format */
 197     /* so downsacle by 15 to get output in 1.15 */
 198     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
 199     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
 200
 201     /* Decrement the loop counter */
 202     blkCnt--;
 203   }
 204
 205
 206
 207   while(blkCntN3 > 0u)
 208   {
 209     /* Copy decimation factor number of new input samples into the state buffer */
 210     i = S->M;
 211
 212     do
 213     {
 214       *pStateCurnt++ = *pSrc++;
 215
 216     } while(--i);
 217
 218     /*Set sum to zero */
 219     sum0 = 0;
 220
 221     /* Initialize state pointer */
 222     px = pState;
 223
 224     /* Initialize coeff pointer */
 225     pb = pCoeffs;
 226
 227     /* Loop unrolling.  Process 4 taps at a time. */
 228     tapCnt = numTaps >> 2;
 229
 230     /* Loop over the number of taps.  Unroll by a factor of 4.
 231      ** Repeat until we've computed numTaps-4 coefficients. */
 232     while(tapCnt > 0u)
 233     {
 234       /* Read the Read b[numTaps-1] and b[numTaps-2]  coefficients */
 235       c0 = *__SIMD32(pb)++;
 236
 237       /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
 238       x0 = *__SIMD32(px)++;
 239
 240       /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
 241       c1 = *__SIMD32(pb)++;
 242
 243       /* Perform the multiply-accumulate */
 244       sum0 = __SMLAD(x0, c0, sum0);
 245
 246       /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
 247       x0 = *__SIMD32(px)++;
 248
 249       /* Perform the multiply-accumulate */
 250       sum0 = __SMLAD(x0, c1, sum0);
 251
 252       /* Decrement the loop counter */
 253       tapCnt--;
 254     }
 255
 256     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
 257     tapCnt = numTaps % 0x4u;
 258
 259     while(tapCnt > 0u)
 260     {
 261       /* Read coefficients */
 262       c0 = *pb++;
 263
 264       /* Fetch 1 state variable */
 265       x0 = *px++;
 266
 267       /* Perform the multiply-accumulate */
 268       sum0 = __SMLAD(x0, c0, sum0);
 269
 270       /* Decrement the loop counter */
 271       tapCnt--;
 272     }
 273
 274     /* Advance the state pointer by the decimation factor
 275      * to process the next group of decimation factor number samples */
 276     pState = pState + S->M;
 277
 278     /* Store filter output, smlad returns the values in 2.14 format */
 279     /* so downsacle by 15 to get output in 1.15 */
 280     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
 281
 282     /* Decrement the loop counter */
 283     blkCntN3--;
 284   }
 285
 286   /* Processing is complete.
 287    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
 288    ** This prepares the state buffer for the next function call. */
 289
 290   /* Points to the start of the state buffer */
 291   pStateCurnt = S->pState;
 292
 293   i = (numTaps - 1u) >> 2u;
 294
 295   /* copy data */
 296   while(i > 0u)
 297   {
 298     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
 299     *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
 300
 301     /* Decrement the loop counter */
 302     i--;
 303   }
 304
 305   i = (numTaps - 1u) % 0x04u;
 306
 307   /* copy data */
 308   while(i > 0u)
 309   {
 310     *pStateCurnt++ = *pState++;
 311
 312     /* Decrement the loop counter */
 313     i--;
 314   }
 315 }
 316
 317 #else
 318
 319
  320 void arm_fir_decimate_fast_q15(
  321   const arm_fir_decimate_instance_q15 * S,
  322   q15_t * pSrc,
  323   q15_t * pDst,
  324   uint32_t blockSize)
  325 {
  326   q15_t *pState = S->pState;                     /* Start of the sample window for the next output */
  327   q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  328   q15_t *pStateCurnt;                            /* Where the next new input sample is written in the state buffer */
  329   q15_t *px;                                     /* State read pointer used by the single-output loop */
  330   q15_t *pb;                                     /* Coefficient read pointer */
  331   q15_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
  332   q31_t sum0;                                    /* Accumulator of the single-output loop */
  333   q31_t acc0, acc1;                              /* Accumulators of the two outputs computed per pass of the paired loop */
  334   q15_t *px0, *px1;                              /* State read pointers for acc0 and acc1 */
  335   uint32_t blkCntN3;                             /* Outputs left over for the single-output loop (0 or 1) */
  336   uint32_t numTaps = S->numTaps;                 /* Number of taps */
  337   uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters; outBlockSize is the number of outputs */
  338
  339
  340   /* S->pState buffer contains previous frame (numTaps - 1) samples */
  341   /* pStateCurnt points to the location where the new input data should be written */
  342   pStateCurnt = S->pState + (numTaps - 1u);
  343
  344
  345   /* Number of output pairs, and the single leftover output when outBlockSize is odd */
  346   blkCnt = outBlockSize / 2;
  347   blkCntN3 = outBlockSize - (2 * blkCnt);
  348
  349   while(blkCnt > 0u)
  350   {
  351     /* Copy 2 * M new input samples (enough for two outputs) into the state buffer */
  352     i = 2 * S->M;
  353
  354     do
  355     {
  356       *pStateCurnt++ = *pSrc++;
  357
  358     } while(--i);
  359
  360     /* Set accumulators to zero */
  361     acc0 = 0;
  362     acc1 = 0;
  363
  364     /* Initialize state pointers; the second output starts M samples later */
  365     px0 = pState;
  366
  367     px1 = pState + S->M;
  368
  369
  370     /* Initialize coeff pointer */
  371     pb = pCoeffs;
  372
  373     /* Loop unrolling.  Process 4 taps at a time. */
  374     tapCnt = numTaps >> 2;
  375
  376     /* Loop over the number of taps.  Unroll by a factor of 4.
  377      ** The remaining numTaps % 4 taps are handled after this loop. */
  378     while(tapCnt > 0u)
  379     {
  380       /* Read the first coefficient of this group of four */
  381       c0 = *pb++;
  382
  383       /* Read the matching sample for output 0 and for output 1 */
  384       x0 = *px0++;
  385       x1 = *px1++;
  386
  387       /* Perform the multiply-accumulate */
  388       acc0 += x0 * c0;
  389       acc1 += x1 * c0;
  390
  391       /* Read the second coefficient */
  392       c0 = *pb++;
  393
  394       /* Read the matching sample for output 0 and for output 1 */
  395       x0 = *px0++;
  396       x1 = *px1++;
  397
  398       /* Perform the multiply-accumulate */
  399       acc0 += x0 * c0;
  400       acc1 += x1 * c0;
  401
  402       /* Read the third coefficient */
  403       c0 = *pb++;
  404
  405       /* Read the matching sample for output 0 and for output 1 */
  406       x0 = *px0++;
  407       x1 = *px1++;
  408
  409       /* Perform the multiply-accumulate */
  410       acc0 += x0 * c0;
  411       acc1 += x1 * c0;
  412
  413       /* Read the fourth coefficient */
  414       c0 = *pb++;
  415
  416       /* Read the matching sample for output 0 and for output 1 */
  417       x0 = *px0++;
  418       x1 = *px1++;
  419
  420       /* Perform the multiply-accumulate */
  421       acc0 += x0 * c0;
  422       acc1 += x1 * c0;
  423
  424       /* Decrement the loop counter */
  425       tapCnt--;
  426     }
  427
  428     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
  429     tapCnt = numTaps % 0x4u;
  430
  431     while(tapCnt > 0u)
  432     {
  433       /* Read one coefficient */
  434       c0 = *pb++;
  435
  436       /* Fetch 1 state variable per output */
  437       x0 = *px0++;
  438       x1 = *px1++;
  439
  440       /* Perform the multiply-accumulate */
  441       acc0 += x0 * c0;
  442       acc1 += x1 * c0;
  443
  444       /* Decrement the loop counter */
  445       tapCnt--;
  446     }
  447
  448     /* Advance the state pointer by twice the decimation factor
  449      * because two outputs were produced in this pass */
  450     pState = pState + S->M * 2;
  451
  452     /* Store filter outputs: this variant accumulates plain q15 x q15 products (no SMLAD), */
  453     /* so shift right by 15 and saturate to 16 bits to get the 1.15 results */
  454
  455     *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
  456     *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
  457
  458
  459     /* Decrement the loop counter */
  460     blkCnt--;
  461   }
  462   /* Single-output loop: runs once when outBlockSize is odd, otherwise not at all */
  463   while(blkCntN3 > 0u)
  464   {
  465     /* Copy decimation factor number of new input samples into the state buffer */
  466     i = S->M;
  467
  468     do
  469     {
  470       *pStateCurnt++ = *pSrc++;
  471
  472     } while(--i);
  473
  474     /*Set sum to zero */
  475     sum0 = 0;
  476
  477     /* Initialize state pointer */
  478     px = pState;
  479
  480     /* Initialize coeff pointer */
  481     pb = pCoeffs;
  482
  483     /* Loop unrolling.  Process 4 taps at a time. */
  484     tapCnt = numTaps >> 2;
  485
  486     /* Loop over the number of taps.  Unroll by a factor of 4.
  487      ** The remaining numTaps % 4 taps are handled after this loop. */
  488     while(tapCnt > 0u)
  489     {
  490       /* Read the first coefficient of this group of four */
  491       c0 = *pb++;
  492
  493       /* Read the matching sample */
  494       x0 = *px++;
  495
  496       /* Perform the multiply-accumulate */
  497       sum0 += x0 * c0;
  498
  499       /* Read the second coefficient */
  500       c0 = *pb++;
  501
  502       /* Read the matching sample */
  503       x0 = *px++;
  504
  505       /* Perform the multiply-accumulate */
  506       sum0 += x0 * c0;
  507
  508       /* Read the third coefficient */
  509       c0 = *pb++;
  510
  511       /* Read the matching sample */
  512       x0 = *px++;
  513
  514       /* Perform the multiply-accumulate */
  515       sum0 += x0 * c0;
  516
  517       /* Read the fourth coefficient */
  518       c0 = *pb++;
  519
  520       /* Read the matching sample */
  521       x0 = *px++;
  522
  523       /* Perform the multiply-accumulate */
  524       sum0 += x0 * c0;
  525
  526       /* Decrement the loop counter */
  527       tapCnt--;
  528     }
  529
  530     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
  531     tapCnt = numTaps % 0x4u;
  532
  533     while(tapCnt > 0u)
  534     {
  535       /* Read one coefficient */
  536       c0 = *pb++;
  537
  538       /* Fetch 1 state variable */
  539       x0 = *px++;
  540
  541       /* Perform the multiply-accumulate */
  542       sum0 += x0 * c0;
  543
  544       /* Decrement the loop counter */
  545       tapCnt--;
  546     }
  547
  548     /* Advance the state pointer by the decimation factor
  549      * to process the next group of decimation factor number samples */
  550     pState = pState + S->M;
  551
  552     /* Store filter output: this variant accumulates plain q15 x q15 products (no SMLAD), */
  553     /* so shift right by 15 and saturate to 16 bits to get the 1.15 result */
  554     *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
  555
  556     /* Decrement the loop counter */
  557     blkCntN3--;
  558   }
  559
  560   /* Processing is complete.
  561    ** Now copy the last numTaps - 1 samples to the start of the state buffer.
  562    ** This prepares the state buffer for the next function call. */
  563
  564   /* Points to the start of the state buffer */
  565   pStateCurnt = S->pState;
  566
  567   i = (numTaps - 1u) >> 2u;
  568
  569   /* copy data, four samples per pass */
  570   while(i > 0u)
  571   {
  572     *pStateCurnt++ = *pState++;
  573     *pStateCurnt++ = *pState++;
  574     *pStateCurnt++ = *pState++;
  575     *pStateCurnt++ = *pState++;
  576
  577     /* Decrement the loop counter */
  578     i--;
  579   }
  580
  581   i = (numTaps - 1u) % 0x04u;
  582
  583   /* copy the remaining (numTaps - 1) % 4 samples one at a time */
  584   while(i > 0u)
  585   {
  586     *pStateCurnt++ = *pState++;
  587
  588     /* Decrement the loop counter */
  589     i--;
  590   }
  591 }
 592
 593
 594 #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
 595
 596 /**
 597  * @} end of FIR_decimate group
 598  */