tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_fast_q31.c
   9 *
  10 * Description:  Fast Q31 Correlation.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup Corr
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
  54  * @param[in] *pSrcA points to the first input sequence.
  55  * @param[in] srcALen length of the first input sequence.
  56  * @param[in] *pSrcB points to the second input sequence.
  57  * @param[in] srcBLen length of the second input sequence.
  58  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  59  * @return none.
  60  *
  61  * @details
  62  * <b>Scaling and Overflow Behavior:</b>
  63  *
  64  * \par
  65  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
  66  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
  67  * These intermediate results are accumulated in a 32-bit register in 2.30 format.
  68  * Finally, the accumulator is saturated and converted to a 1.31 result.
  69  *
  70  * \par
  71  * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
  72  * In order to avoid overflows completely the input signals must be scaled down.
  73  * The input signals should be scaled down to avoid intermediate overflows.
  74  * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
  75  * maximum of min(srcALen, srcBLen) number of additions is carried internally.
  76  *
  77  * \par
  78  * See <code>arm_correlate_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
  79  */
  80
  81 void arm_correlate_fast_q31(
  82   q31_t * pSrcA,
  83   uint32_t srcALen,
  84   q31_t * pSrcB,
  85   uint32_t srcBLen,
  86   q31_t * pDst)
  87 {
  88   q31_t *pIn1;                                   /* inputA pointer               */
  89   q31_t *pIn2;                                   /* inputB pointer               */
  90   q31_t *pOut = pDst;                            /* output pointer               */
  91   q31_t *px;                                     /* Intermediate inputA pointer  */
  92   q31_t *py;                                     /* Intermediate inputB pointer  */
  93   q31_t *pSrc1;                                  /* Intermediate pointers        */
  94   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  95   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
  96   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
  97   int32_t inc = 1;                               /* Destination address modifier */
  98
  99
 100   /* The algorithm implementation is based on the lengths of the inputs. */
 101   /* srcB is always made to slide across srcA. */
 102   /* So srcBLen is always considered as shorter or equal to srcALen */
 103   if(srcALen >= srcBLen)
 104   {
 105     /* Initialization of inputA pointer */
 106     pIn1 = (pSrcA);
 107
 108     /* Initialization of inputB pointer */
 109     pIn2 = (pSrcB);
 110
 111     /* Number of output samples is calculated */
 112     outBlockSize = (2u * srcALen) - 1u;
 113
 114     /* When srcALen > srcBLen, zero padding is done to srcB
 115      * to make their lengths equal.
 116      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 117      * number of output samples are made zero */
 118     j = outBlockSize - (srcALen + (srcBLen - 1u));
 119
 120     /* Updating the pointer position to non zero value */
 121     pOut += j;
 122
 123   }
 124   else
 125   {
 126     /* Initialization of inputA pointer */
 127     pIn1 = (pSrcB);
 128
 129     /* Initialization of inputB pointer */
 130     pIn2 = (pSrcA);
 131
 132     /* srcBLen is always considered as shorter or equal to srcALen */
 133     j = srcBLen;
 134     srcBLen = srcALen;
 135     srcALen = j;
 136
 137     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 138     /* Hence set the destination pointer to point to the last output sample */
 139     pOut = pDst + ((srcALen + srcBLen) - 2u);
 140
 141     /* Destination address modifier is set to -1 */
 142     inc = -1;
 143
 144   }
 145
 146   /* The function is internally
 147    * divided into three parts according to the number of multiplications that has to be
 148    * taken place between inputA samples and inputB samples. In the first part of the
 149    * algorithm, the multiplications increase by one for every iteration.
 150    * In the second part of the algorithm, srcBLen number of multiplications are done.
 151    * In the third part of the algorithm, the multiplications decrease by one
 152    * for every iteration.*/
 153   /* The algorithm is implemented in three stages.
 154    * The loop counters of each stage is initiated here. */
 155   blockSize1 = srcBLen - 1u;
 156   blockSize2 = srcALen - (srcBLen - 1u);
 157   blockSize3 = blockSize1;
 158
 159   /* --------------------------
 160    * Initializations of stage1
 161    * -------------------------*/
 162
 163   /* sum = x[0] * y[srcBlen - 1]
 164    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 165    * ....
 166    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 167    */
 168
 169   /* In this stage the MAC operations are increased by 1 for every iteration.
 170      The count variable holds the number of MAC operations performed */
 171   count = 1u;
 172
 173   /* Working pointer of inputA */
 174   px = pIn1;
 175
 176   /* Working pointer of inputB */
 177   pSrc1 = pIn2 + (srcBLen - 1u);
 178   py = pSrc1;
 179
 180   /* ------------------------
 181    * Stage1 process
 182    * ----------------------*/
 183
 184   /* The first stage starts here */
 185   while(blockSize1 > 0u)
 186   {
 187     /* Accumulator is made zero for every iteration */
 188     sum = 0;
 189
 190     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 191     k = count >> 2;
 192
 193     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 194      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 195     while(k > 0u)
 196     {
 197       /* x[0] * y[srcBLen - 4] */
 198       sum = (q31_t) ((((q63_t) sum << 32) +
 199                       ((q63_t) * px++ * (*py++))) >> 32);
 200       /* x[1] * y[srcBLen - 3] */
 201       sum = (q31_t) ((((q63_t) sum << 32) +
 202                       ((q63_t) * px++ * (*py++))) >> 32);
 203       /* x[2] * y[srcBLen - 2] */
 204       sum = (q31_t) ((((q63_t) sum << 32) +
 205                       ((q63_t) * px++ * (*py++))) >> 32);
 206       /* x[3] * y[srcBLen - 1] */
 207       sum = (q31_t) ((((q63_t) sum << 32) +
 208                       ((q63_t) * px++ * (*py++))) >> 32);
 209
 210       /* Decrement the loop counter */
 211       k--;
 212     }
 213
 214     /* If the count is not a multiple of 4, compute any remaining MACs here.
 215      ** No loop unrolling is used. */
 216     k = count % 0x4u;
 217
 218     while(k > 0u)
 219     {
 220       /* Perform the multiply-accumulates */
 221       /* x[0] * y[srcBLen - 1] */
 222       sum = (q31_t) ((((q63_t) sum << 32) +
 223                       ((q63_t) * px++ * (*py++))) >> 32);
 224
 225       /* Decrement the loop counter */
 226       k--;
 227     }
 228
 229     /* Store the result in the accumulator in the destination buffer. */
 230     *pOut = sum << 1;
 231     /* Destination pointer is updated according to the address modifier, inc */
 232     pOut += inc;
 233
 234     /* Update the inputA and inputB pointers for next MAC calculation */
 235     py = pSrc1 - count;
 236     px = pIn1;
 237
 238     /* Increment the MAC count */
 239     count++;
 240
 241     /* Decrement the loop counter */
 242     blockSize1--;
 243   }
 244
 245   /* --------------------------
 246    * Initializations of stage2
 247    * ------------------------*/
 248
 249   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 250    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 251    * ....
 252    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 253    */
 254
 255   /* Working pointer of inputA */
 256   px = pIn1;
 257
 258   /* Working pointer of inputB */
 259   py = pIn2;
 260
 261   /* count is index by which the pointer pIn1 to be incremented */
 262   count = 0u;
 263
 264   /* -------------------
 265    * Stage2 process
 266    * ------------------*/
 267
 268   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 269    * So, to loop unroll over blockSize2,
 270    * srcBLen should be greater than or equal to 4 */
 271   if(srcBLen >= 4u)
 272   {
 273     /* Loop unroll over blockSize2, by 4 */
 274     blkCnt = blockSize2 >> 2u;
 275
 276     while(blkCnt > 0u)
 277     {
 278       /* Set all accumulators to zero */
 279       acc0 = 0;
 280       acc1 = 0;
 281       acc2 = 0;
 282       acc3 = 0;
 283
 284       /* read x[0], x[1], x[2] samples */
 285       x0 = *(px++);
 286       x1 = *(px++);
 287       x2 = *(px++);
 288
 289       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 290       k = srcBLen >> 2u;
 291
 292       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 293        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 294       do
 295       {
 296         /* Read y[0] sample */
 297         c0 = *(py++);
 298
 299         /* Read x[3] sample */
 300         x3 = *(px++);
 301
 302         /* Perform the multiply-accumulate */
 303         /* acc0 +=  x[0] * y[0] */
 304         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 305         /* acc1 +=  x[1] * y[0] */
 306         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 307         /* acc2 +=  x[2] * y[0] */
 308         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 309         /* acc3 +=  x[3] * y[0] */
 310         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 311
 312         /* Read y[1] sample */
 313         c0 = *(py++);
 314
 315         /* Read x[4] sample */
 316         x0 = *(px++);
 317
 318         /* Perform the multiply-accumulates */
 319         /* acc0 +=  x[1] * y[1] */
 320         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
 321         /* acc1 +=  x[2] * y[1] */
 322         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
 323         /* acc2 +=  x[3] * y[1] */
 324         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
 325         /* acc3 +=  x[4] * y[1] */
 326         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
 327
 328         /* Read y[2] sample */
 329         c0 = *(py++);
 330
 331         /* Read x[5] sample */
 332         x1 = *(px++);
 333
 334         /* Perform the multiply-accumulates */
 335         /* acc0 +=  x[2] * y[2] */
 336         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
 337         /* acc1 +=  x[3] * y[2] */
 338         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
 339         /* acc2 +=  x[4] * y[2] */
 340         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
 341         /* acc3 +=  x[5] * y[2] */
 342         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
 343
 344         /* Read y[3] sample */
 345         c0 = *(py++);
 346
 347         /* Read x[6] sample */
 348         x2 = *(px++);
 349
 350         /* Perform the multiply-accumulates */
 351         /* acc0 +=  x[3] * y[3] */
 352         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
 353         /* acc1 +=  x[4] * y[3] */
 354         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
 355         /* acc2 +=  x[5] * y[3] */
 356         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
 357         /* acc3 +=  x[6] * y[3] */
 358         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
 359
 360
 361       } while(--k);
 362
 363       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 364        ** No loop unrolling is used. */
 365       k = srcBLen % 0x4u;
 366
 367       while(k > 0u)
 368       {
 369         /* Read y[4] sample */
 370         c0 = *(py++);
 371
 372         /* Read x[7] sample */
 373         x3 = *(px++);
 374
 375         /* Perform the multiply-accumulates */
 376         /* acc0 +=  x[4] * y[4] */
 377         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 378         /* acc1 +=  x[5] * y[4] */
 379         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 380         /* acc2 +=  x[6] * y[4] */
 381         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 382         /* acc3 +=  x[7] * y[4] */
 383         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 384
 385         /* Reuse the present samples for the next MAC */
 386         x0 = x1;
 387         x1 = x2;
 388         x2 = x3;
 389
 390         /* Decrement the loop counter */
 391         k--;
 392       }
 393
 394       /* Store the result in the accumulator in the destination buffer. */
 395       *pOut = (q31_t) (acc0 << 1);
 396       /* Destination pointer is updated according to the address modifier, inc */
 397       pOut += inc;
 398
 399       *pOut = (q31_t) (acc1 << 1);
 400       pOut += inc;
 401
 402       *pOut = (q31_t) (acc2 << 1);
 403       pOut += inc;
 404
 405       *pOut = (q31_t) (acc3 << 1);
 406       pOut += inc;
 407
 408       /* Increment the pointer pIn1 index, count by 4 */
 409       count += 4u;
 410
 411       /* Update the inputA and inputB pointers for next MAC calculation */
 412       px = pIn1 + count;
 413       py = pIn2;
 414
 415
 416       /* Decrement the loop counter */
 417       blkCnt--;
 418     }
 419
 420     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 421      ** No loop unrolling is used. */
 422     blkCnt = blockSize2 % 0x4u;
 423
 424     while(blkCnt > 0u)
 425     {
 426       /* Accumulator is made zero for every iteration */
 427       sum = 0;
 428
 429       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 430       k = srcBLen >> 2u;
 431
 432       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 433        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 434       while(k > 0u)
 435       {
 436         /* Perform the multiply-accumulates */
 437         sum = (q31_t) ((((q63_t) sum << 32) +
 438                         ((q63_t) * px++ * (*py++))) >> 32);
 439         sum = (q31_t) ((((q63_t) sum << 32) +
 440                         ((q63_t) * px++ * (*py++))) >> 32);
 441         sum = (q31_t) ((((q63_t) sum << 32) +
 442                         ((q63_t) * px++ * (*py++))) >> 32);
 443         sum = (q31_t) ((((q63_t) sum << 32) +
 444                         ((q63_t) * px++ * (*py++))) >> 32);
 445
 446         /* Decrement the loop counter */
 447         k--;
 448       }
 449
 450       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 451        ** No loop unrolling is used. */
 452       k = srcBLen % 0x4u;
 453
 454       while(k > 0u)
 455       {
 456         /* Perform the multiply-accumulate */
 457         sum = (q31_t) ((((q63_t) sum << 32) +
 458                         ((q63_t) * px++ * (*py++))) >> 32);
 459
 460         /* Decrement the loop counter */
 461         k--;
 462       }
 463
 464       /* Store the result in the accumulator in the destination buffer. */
 465       *pOut = sum << 1;
 466       /* Destination pointer is updated according to the address modifier, inc */
 467       pOut += inc;
 468
 469       /* Increment the MAC count */
 470       count++;
 471
 472       /* Update the inputA and inputB pointers for next MAC calculation */
 473       px = pIn1 + count;
 474       py = pIn2;
 475
 476
 477       /* Decrement the loop counter */
 478       blkCnt--;
 479     }
 480   }
 481   else
 482   {
 483     /* If the srcBLen is not a multiple of 4,
 484      * the blockSize2 loop cannot be unrolled by 4 */
 485     blkCnt = blockSize2;
 486
 487     while(blkCnt > 0u)
 488     {
 489       /* Accumulator is made zero for every iteration */
 490       sum = 0;
 491
 492       /* Loop over srcBLen */
 493       k = srcBLen;
 494
 495       while(k > 0u)
 496       {
 497         /* Perform the multiply-accumulate */
 498         sum = (q31_t) ((((q63_t) sum << 32) +
 499                         ((q63_t) * px++ * (*py++))) >> 32);
 500
 501         /* Decrement the loop counter */
 502         k--;
 503       }
 504
 505       /* Store the result in the accumulator in the destination buffer. */
 506       *pOut = sum << 1;
 507       /* Destination pointer is updated according to the address modifier, inc */
 508       pOut += inc;
 509
 510       /* Increment the MAC count */
 511       count++;
 512
 513       /* Update the inputA and inputB pointers for next MAC calculation */
 514       px = pIn1 + count;
 515       py = pIn2;
 516
 517       /* Decrement the loop counter */
 518       blkCnt--;
 519     }
 520   }
 521
 522   /* --------------------------
 523    * Initializations of stage3
 524    * -------------------------*/
 525
 526   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 527    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 528    * ....
 529    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 530    * sum +=  x[srcALen-1] * y[0]
 531    */
 532
 533   /* In this stage the MAC operations are decreased by 1 for every iteration.
 534      The count variable holds the number of MAC operations performed */
 535   count = srcBLen - 1u;
 536
 537   /* Working pointer of inputA */
 538   pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u;
 539   px = pSrc1;
 540
 541   /* Working pointer of inputB */
 542   py = pIn2;
 543
 544   /* -------------------
 545    * Stage3 process
 546    * ------------------*/
 547
 548   while(blockSize3 > 0u)
 549   {
 550     /* Accumulator is made zero for every iteration */
 551     sum = 0;
 552
 553     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 554     k = count >> 2u;
 555
 556     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 557      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 558     while(k > 0u)
 559     {
 560       /* Perform the multiply-accumulates */
 561       /* sum += x[srcALen - srcBLen + 4] * y[3] */
 562       sum = (q31_t) ((((q63_t) sum << 32) +
 563                       ((q63_t) * px++ * (*py++))) >> 32);
 564       /* sum += x[srcALen - srcBLen + 3] * y[2] */
 565       sum = (q31_t) ((((q63_t) sum << 32) +
 566                       ((q63_t) * px++ * (*py++))) >> 32);
 567       /* sum += x[srcALen - srcBLen + 2] * y[1] */
 568       sum = (q31_t) ((((q63_t) sum << 32) +
 569                       ((q63_t) * px++ * (*py++))) >> 32);
 570       /* sum += x[srcALen - srcBLen + 1] * y[0] */
 571       sum = (q31_t) ((((q63_t) sum << 32) +
 572                       ((q63_t) * px++ * (*py++))) >> 32);
 573
 574       /* Decrement the loop counter */
 575       k--;
 576     }
 577
 578     /* If the count is not a multiple of 4, compute any remaining MACs here.
 579      ** No loop unrolling is used. */
 580     k = count % 0x4u;
 581
 582     while(k > 0u)
 583     {
 584       /* Perform the multiply-accumulates */
 585       sum = (q31_t) ((((q63_t) sum << 32) +
 586                       ((q63_t) * px++ * (*py++))) >> 32);
 587
 588       /* Decrement the loop counter */
 589       k--;
 590     }
 591
 592     /* Store the result in the accumulator in the destination buffer. */
 593     *pOut = sum << 1;
 594     /* Destination pointer is updated according to the address modifier, inc */
 595     pOut += inc;
 596
 597     /* Update the inputA and inputB pointers for next MAC calculation */
 598     px = ++pSrc1;
 599     py = pIn2;
 600
 601     /* Decrement the MAC count */
 602     count--;
 603
 604     /* Decrement the loop counter */
 605     blockSize3--;
 606   }
 607
 608 }
 609
 610 /**
 611  * @} end of Corr group
 612  */