tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_fast_q15.c
   9 *
  10 * Description:  Fast Q15 Correlation.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup Corr
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  54  * @param[in] *pSrcA points to the first input sequence.
  55  * @param[in] srcALen length of the first input sequence.
  56  * @param[in] *pSrcB points to the second input sequence.
  57  * @param[in] srcBLen length of the second input sequence.
  58  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  59  * @return none.
  60  *
  61  * <b>Scaling and Overflow Behavior:</b>
  62  *
  63  * \par
  64  * This fast version uses a 32-bit accumulator with 2.30 format.
  65  * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
  66  * There is no saturation on intermediate additions.
  67  * Thus, if the accumulator overflows it wraps around and distorts the result.
  68  * The input signals should be scaled down to avoid intermediate overflows.
   69  * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow, since
   70  * at most min(srcALen, srcBLen) additions are carried out internally.
  71  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
  72  *
  73  * \par
  74  * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
  75  */
  76
  77 void arm_correlate_fast_q15(
  78   q15_t * pSrcA,
  79   uint32_t srcALen,
  80   q15_t * pSrcB,
  81   uint32_t srcBLen,
  82   q15_t * pDst)
  83 {
  84 #ifndef UNALIGNED_SUPPORT_DISABLE
  85
  86   q15_t *pIn1;                                   /* inputA pointer               */
  87   q15_t *pIn2;                                   /* inputB pointer               */
  88   q15_t *pOut = pDst;                            /* output pointer               */
  89   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  90   q15_t *px;                                     /* Intermediate inputA pointer  */
  91   q15_t *py;                                     /* Intermediate inputB pointer  */
  92   q15_t *pSrc1;                                  /* Intermediate pointers        */
  93   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
  94   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
  95   int32_t inc = 1;                               /* Destination address modifier */
  96
  97
  98   /* The algorithm implementation is based on the lengths of the inputs. */
  99   /* srcB is always made to slide across srcA. */
 100   /* So srcBLen is always considered as shorter or equal to srcALen */
 101   /* But CORR(x, y) is reverse of CORR(y, x) */
 102   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 103   /* and the destination pointer modifier, inc is set to -1 */
 104   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  105   /* But to improve the performance,
  106    * we include zeroes in the output instead of zero padding either of the inputs */
  107   /* If srcALen > srcBLen,
  108    * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
  109   /* If srcALen < srcBLen,
  110    * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
 111   if(srcALen >= srcBLen)
 112   {
 113     /* Initialization of inputA pointer */
 114     pIn1 = (pSrcA);
 115
 116     /* Initialization of inputB pointer */
 117     pIn2 = (pSrcB);
 118
 119     /* Number of output samples is calculated */
 120     outBlockSize = (2u * srcALen) - 1u;
 121
 122     /* When srcALen > srcBLen, zero padding is done to srcB
 123      * to make their lengths equal.
 124      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 125      * number of output samples are made zero */
 126     j = outBlockSize - (srcALen + (srcBLen - 1u));
 127
 128     /* Updating the pointer position to non zero value */
 129     pOut += j;
 130
 131   }
 132   else
 133   {
 134     /* Initialization of inputA pointer */
 135     pIn1 = (pSrcB);
 136
 137     /* Initialization of inputB pointer */
 138     pIn2 = (pSrcA);
 139
 140     /* srcBLen is always considered as shorter or equal to srcALen */
 141     j = srcBLen;
 142     srcBLen = srcALen;
 143     srcALen = j;
 144
 145     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 146     /* Hence set the destination pointer to point to the last output sample */
 147     pOut = pDst + ((srcALen + srcBLen) - 2u);
 148
 149     /* Destination address modifier is set to -1 */
 150     inc = -1;
 151
 152   }
 153
 154   /* The function is internally
 155    * divided into three parts according to the number of multiplications that has to be
 156    * taken place between inputA samples and inputB samples. In the first part of the
 157    * algorithm, the multiplications increase by one for every iteration.
 158    * In the second part of the algorithm, srcBLen number of multiplications are done.
 159    * In the third part of the algorithm, the multiplications decrease by one
 160    * for every iteration.*/
 161   /* The algorithm is implemented in three stages.
 162    * The loop counters of each stage is initiated here. */
 163   blockSize1 = srcBLen - 1u;
 164   blockSize2 = srcALen - (srcBLen - 1u);
 165   blockSize3 = blockSize1;
 166
 167   /* --------------------------
 168    * Initializations of stage1
 169    * -------------------------*/
 170
 171   /* sum = x[0] * y[srcBlen - 1]
 172    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 173    * ....
 174    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 175    */
 176
 177   /* In this stage the MAC operations are increased by 1 for every iteration.
 178      The count variable holds the number of MAC operations performed */
 179   count = 1u;
 180
 181   /* Working pointer of inputA */
 182   px = pIn1;
 183
 184   /* Working pointer of inputB */
 185   pSrc1 = pIn2 + (srcBLen - 1u);
 186   py = pSrc1;
 187
 188   /* ------------------------
 189    * Stage1 process
 190    * ----------------------*/
 191
 192   /* The first loop starts here */
 193   while(blockSize1 > 0u)
 194   {
 195     /* Accumulator is made zero for every iteration */
 196     sum = 0;
 197
 198     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 199     k = count >> 2;
 200
 201     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 202      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 203     while(k > 0u)
 204     {
 205       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
 206       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 207       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
 208       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 209
 210       /* Decrement the loop counter */
 211       k--;
 212     }
 213
 214     /* If the count is not a multiple of 4, compute any remaining MACs here.
 215      ** No loop unrolling is used. */
 216     k = count % 0x4u;
 217
 218     while(k > 0u)
 219     {
 220       /* Perform the multiply-accumulates */
 221       /* x[0] * y[srcBLen - 1] */
 222       sum = __SMLAD(*px++, *py++, sum);
 223
 224       /* Decrement the loop counter */
 225       k--;
 226     }
 227
 228     /* Store the result in the accumulator in the destination buffer. */
 229     *pOut = (q15_t) (sum >> 15);
 230     /* Destination pointer is updated according to the address modifier, inc */
 231     pOut += inc;
 232
 233     /* Update the inputA and inputB pointers for next MAC calculation */
 234     py = pSrc1 - count;
 235     px = pIn1;
 236
 237     /* Increment the MAC count */
 238     count++;
 239
 240     /* Decrement the loop counter */
 241     blockSize1--;
 242   }
 243
 244   /* --------------------------
 245    * Initializations of stage2
 246    * ------------------------*/
 247
 248   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 249    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 250    * ....
 251    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 252    */
 253
 254   /* Working pointer of inputA */
 255   px = pIn1;
 256
 257   /* Working pointer of inputB */
 258   py = pIn2;
 259
 260   /* count is index by which the pointer pIn1 to be incremented */
 261   count = 0u;
 262
 263   /* -------------------
 264    * Stage2 process
 265    * ------------------*/
 266
 267   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 268    * So, to loop unroll over blockSize2,
 269    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
 270   if(srcBLen >= 4u)
 271   {
 272     /* Loop unroll over blockSize2, by 4 */
 273     blkCnt = blockSize2 >> 2u;
 274
 275     while(blkCnt > 0u)
 276     {
 277       /* Set all accumulators to zero */
 278       acc0 = 0;
 279       acc1 = 0;
 280       acc2 = 0;
 281       acc3 = 0;
 282
 283       /* read x[0], x[1] samples */
 284       x0 = *__SIMD32(px);
 285       /* read x[1], x[2] samples */
 286       x1 = _SIMD32_OFFSET(px + 1);
 287           px += 2u;
 288
 289       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 290       k = srcBLen >> 2u;
 291
 292       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 293        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 294       do
 295       {
 296         /* Read the first two inputB samples using SIMD:
 297          * y[0] and y[1] */
 298         c0 = *__SIMD32(py)++;
 299
 300         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
 301         acc0 = __SMLAD(x0, c0, acc0);
 302
 303         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
 304         acc1 = __SMLAD(x1, c0, acc1);
 305
 306         /* Read x[2], x[3] */
 307         x2 = *__SIMD32(px);
 308
 309         /* Read x[3], x[4] */
 310         x3 = _SIMD32_OFFSET(px + 1);
 311
 312         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
 313         acc2 = __SMLAD(x2, c0, acc2);
 314
 315         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
 316         acc3 = __SMLAD(x3, c0, acc3);
 317
 318         /* Read y[2] and y[3] */
 319         c0 = *__SIMD32(py)++;
 320
 321         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
 322         acc0 = __SMLAD(x2, c0, acc0);
 323
 324         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
 325         acc1 = __SMLAD(x3, c0, acc1);
 326
 327         /* Read x[4], x[5] */
 328         x0 = _SIMD32_OFFSET(px + 2);
 329
 330         /* Read x[5], x[6] */
 331         x1 = _SIMD32_OFFSET(px + 3);
 332                 px += 4u;
 333
 334         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
 335         acc2 = __SMLAD(x0, c0, acc2);
 336
 337         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
 338         acc3 = __SMLAD(x1, c0, acc3);
 339
 340       } while(--k);
 341
 342       /* For the next MAC operations, SIMD is not used
 343        * So, the 16 bit pointer if inputB, py is updated */
 344
 345       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 346        ** No loop unrolling is used. */
 347       k = srcBLen % 0x4u;
 348
 349       if(k == 1u)
 350       {
 351         /* Read y[4] */
 352         c0 = *py;
 353 #ifdef  ARM_MATH_BIG_ENDIAN
 354
 355         c0 = c0 << 16u;
 356
 357 #else
 358
 359         c0 = c0 & 0x0000FFFF;
 360
 361 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 362
 363         /* Read x[7] */
 364         x3 = *__SIMD32(px);
 365                 px++;
 366
 367         /* Perform the multiply-accumulates */
 368         acc0 = __SMLAD(x0, c0, acc0);
 369         acc1 = __SMLAD(x1, c0, acc1);
 370         acc2 = __SMLADX(x1, c0, acc2);
 371         acc3 = __SMLADX(x3, c0, acc3);
 372       }
 373
 374       if(k == 2u)
 375       {
 376         /* Read y[4], y[5] */
 377         c0 = *__SIMD32(py);
 378
 379         /* Read x[7], x[8] */
 380         x3 = *__SIMD32(px);
 381
 382         /* Read x[9] */
 383         x2 = _SIMD32_OFFSET(px + 1);
 384                 px += 2u;
 385
 386         /* Perform the multiply-accumulates */
 387         acc0 = __SMLAD(x0, c0, acc0);
 388         acc1 = __SMLAD(x1, c0, acc1);
 389         acc2 = __SMLAD(x3, c0, acc2);
 390         acc3 = __SMLAD(x2, c0, acc3);
 391       }
 392
 393       if(k == 3u)
 394       {
 395         /* Read y[4], y[5] */
 396         c0 = *__SIMD32(py)++;
 397
 398         /* Read x[7], x[8] */
 399         x3 = *__SIMD32(px);
 400
 401         /* Read x[9] */
 402         x2 = _SIMD32_OFFSET(px + 1);
 403
 404         /* Perform the multiply-accumulates */
 405         acc0 = __SMLAD(x0, c0, acc0);
 406         acc1 = __SMLAD(x1, c0, acc1);
 407         acc2 = __SMLAD(x3, c0, acc2);
 408         acc3 = __SMLAD(x2, c0, acc3);
 409
 410         c0 = (*py);
 411         /* Read y[6] */
 412 #ifdef  ARM_MATH_BIG_ENDIAN
 413
 414         c0 = c0 << 16u;
 415 #else
 416
 417         c0 = c0 & 0x0000FFFF;
 418 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 419
 420         /* Read x[10] */
 421         x3 = _SIMD32_OFFSET(px + 2);
 422                 px += 3u;
 423
 424         /* Perform the multiply-accumulates */
 425         acc0 = __SMLADX(x1, c0, acc0);
 426         acc1 = __SMLAD(x2, c0, acc1);
 427         acc2 = __SMLADX(x2, c0, acc2);
 428         acc3 = __SMLADX(x3, c0, acc3);
 429       }
 430
 431       /* Store the result in the accumulator in the destination buffer. */
 432       *pOut = (q15_t) (acc0 >> 15);
 433       /* Destination pointer is updated according to the address modifier, inc */
 434       pOut += inc;
 435
 436       *pOut = (q15_t) (acc1 >> 15);
 437       pOut += inc;
 438
 439       *pOut = (q15_t) (acc2 >> 15);
 440       pOut += inc;
 441
 442       *pOut = (q15_t) (acc3 >> 15);
 443       pOut += inc;
 444
  445       /* Increment the pointer pIn1 index, count by 4 (four outputs were computed) */
 446       count += 4u;
 447
 448       /* Update the inputA and inputB pointers for next MAC calculation */
 449       px = pIn1 + count;
 450       py = pIn2;
 451
 452
 453       /* Decrement the loop counter */
 454       blkCnt--;
 455     }
 456
 457     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 458      ** No loop unrolling is used. */
 459     blkCnt = blockSize2 % 0x4u;
 460
 461     while(blkCnt > 0u)
 462     {
 463       /* Accumulator is made zero for every iteration */
 464       sum = 0;
 465
 466       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 467       k = srcBLen >> 2u;
 468
 469       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 470        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 471       while(k > 0u)
 472       {
 473         /* Perform the multiply-accumulates */
 474         sum += ((q31_t) * px++ * *py++);
 475         sum += ((q31_t) * px++ * *py++);
 476         sum += ((q31_t) * px++ * *py++);
 477         sum += ((q31_t) * px++ * *py++);
 478
 479         /* Decrement the loop counter */
 480         k--;
 481       }
 482
 483       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 484        ** No loop unrolling is used. */
 485       k = srcBLen % 0x4u;
 486
 487       while(k > 0u)
 488       {
 489         /* Perform the multiply-accumulates */
 490         sum += ((q31_t) * px++ * *py++);
 491
 492         /* Decrement the loop counter */
 493         k--;
 494       }
 495
 496       /* Store the result in the accumulator in the destination buffer. */
 497       *pOut = (q15_t) (sum >> 15);
 498       /* Destination pointer is updated according to the address modifier, inc */
 499       pOut += inc;
 500
 501       /* Increment the pointer pIn1 index, count by 1 */
 502       count++;
 503
 504       /* Update the inputA and inputB pointers for next MAC calculation */
 505       px = pIn1 + count;
 506       py = pIn2;
 507
 508       /* Decrement the loop counter */
 509       blkCnt--;
 510     }
 511   }
 512   else
 513   {
  514     /* If srcBLen is less than 4,
  515      * the blockSize2 loop cannot be unrolled by 4 */
 516     blkCnt = blockSize2;
 517
 518     while(blkCnt > 0u)
 519     {
 520       /* Accumulator is made zero for every iteration */
 521       sum = 0;
 522
 523       /* Loop over srcBLen */
 524       k = srcBLen;
 525
 526       while(k > 0u)
 527       {
 528         /* Perform the multiply-accumulate */
 529         sum += ((q31_t) * px++ * *py++);
 530
 531         /* Decrement the loop counter */
 532         k--;
 533       }
 534
 535       /* Store the result in the accumulator in the destination buffer. */
 536       *pOut = (q15_t) (sum >> 15);
 537       /* Destination pointer is updated according to the address modifier, inc */
 538       pOut += inc;
 539
 540       /* Increment the MAC count */
 541       count++;
 542
 543       /* Update the inputA and inputB pointers for next MAC calculation */
 544       px = pIn1 + count;
 545       py = pIn2;
 546
 547       /* Decrement the loop counter */
 548       blkCnt--;
 549     }
 550   }
 551
 552   /* --------------------------
 553    * Initializations of stage3
 554    * -------------------------*/
 555
 556   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 557    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 558    * ....
 559    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 560    * sum +=  x[srcALen-1] * y[0]
 561    */
 562
 563   /* In this stage the MAC operations are decreased by 1 for every iteration.
 564      The count variable holds the number of MAC operations performed */
 565   count = srcBLen - 1u;
 566
 567   /* Working pointer of inputA */
 568   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 569   px = pSrc1;
 570
 571   /* Working pointer of inputB */
 572   py = pIn2;
 573
 574   /* -------------------
 575    * Stage3 process
 576    * ------------------*/
 577
 578   while(blockSize3 > 0u)
 579   {
 580     /* Accumulator is made zero for every iteration */
 581     sum = 0;
 582
 583     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 584     k = count >> 2u;
 585
 586     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 587      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 588     while(k > 0u)
 589     {
 590       /* Perform the multiply-accumulates */
 591       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
 592       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 593       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
 594       sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 595
 596       /* Decrement the loop counter */
 597       k--;
 598     }
 599
 600     /* If the count is not a multiple of 4, compute any remaining MACs here.
 601      ** No loop unrolling is used. */
 602     k = count % 0x4u;
 603
 604     while(k > 0u)
 605     {
 606       /* Perform the multiply-accumulates */
 607       sum = __SMLAD(*px++, *py++, sum);
 608
 609       /* Decrement the loop counter */
 610       k--;
 611     }
 612
 613     /* Store the result in the accumulator in the destination buffer. */
 614     *pOut = (q15_t) (sum >> 15);
 615     /* Destination pointer is updated according to the address modifier, inc */
 616     pOut += inc;
 617
 618     /* Update the inputA and inputB pointers for next MAC calculation */
 619     px = ++pSrc1;
 620     py = pIn2;
 621
 622     /* Decrement the MAC count */
 623     count--;
 624
 625     /* Decrement the loop counter */
 626     blockSize3--;
 627   }
 628
 629 #else
 630
 631   q15_t *pIn1;                                   /* inputA pointer               */
 632   q15_t *pIn2;                                   /* inputB pointer               */
 633   q15_t *pOut = pDst;                            /* output pointer               */
 634   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
 635   q15_t *px;                                     /* Intermediate inputA pointer  */
 636   q15_t *py;                                     /* Intermediate inputB pointer  */
 637   q15_t *pSrc1;                                  /* Intermediate pointers        */
 638   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
 639   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
 640   int32_t inc = 1;                               /* Destination address modifier */
 641   q15_t a, b;
 642
 643
 644   /* The algorithm implementation is based on the lengths of the inputs. */
 645   /* srcB is always made to slide across srcA. */
 646   /* So srcBLen is always considered as shorter or equal to srcALen */
 647   /* But CORR(x, y) is reverse of CORR(y, x) */
 648   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 649   /* and the destination pointer modifier, inc is set to -1 */
 650   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
  651   /* But to improve the performance,
  652    * we include zeroes in the output instead of zero padding either of the inputs */
  653   /* If srcALen > srcBLen,
  654    * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
  655   /* If srcALen < srcBLen,
  656    * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
 657   if(srcALen >= srcBLen)
 658   {
 659     /* Initialization of inputA pointer */
 660     pIn1 = (pSrcA);
 661
 662     /* Initialization of inputB pointer */
 663     pIn2 = (pSrcB);
 664
 665     /* Number of output samples is calculated */
 666     outBlockSize = (2u * srcALen) - 1u;
 667
 668     /* When srcALen > srcBLen, zero padding is done to srcB
 669      * to make their lengths equal.
 670      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 671      * number of output samples are made zero */
 672     j = outBlockSize - (srcALen + (srcBLen - 1u));
 673
 674     /* Updating the pointer position to non zero value */
 675     pOut += j;
 676
 677   }
 678   else
 679   {
 680     /* Initialization of inputA pointer */
 681     pIn1 = (pSrcB);
 682
 683     /* Initialization of inputB pointer */
 684     pIn2 = (pSrcA);
 685
 686     /* srcBLen is always considered as shorter or equal to srcALen */
 687     j = srcBLen;
 688     srcBLen = srcALen;
 689     srcALen = j;
 690
 691     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 692     /* Hence set the destination pointer to point to the last output sample */
 693     pOut = pDst + ((srcALen + srcBLen) - 2u);
 694
 695     /* Destination address modifier is set to -1 */
 696     inc = -1;
 697
 698   }
 699
 700   /* The function is internally
 701    * divided into three parts according to the number of multiplications that has to be
 702    * taken place between inputA samples and inputB samples. In the first part of the
 703    * algorithm, the multiplications increase by one for every iteration.
 704    * In the second part of the algorithm, srcBLen number of multiplications are done.
 705    * In the third part of the algorithm, the multiplications decrease by one
 706    * for every iteration.*/
 707   /* The algorithm is implemented in three stages.
 708    * The loop counters of each stage is initiated here. */
 709   blockSize1 = srcBLen - 1u;
 710   blockSize2 = srcALen - (srcBLen - 1u);
 711   blockSize3 = blockSize1;
 712
 713   /* --------------------------
 714    * Initializations of stage1
 715    * -------------------------*/
 716
 717   /* sum = x[0] * y[srcBlen - 1]
 718    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 719    * ....
 720    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 721    */
 722
 723   /* In this stage the MAC operations are increased by 1 for every iteration.
 724      The count variable holds the number of MAC operations performed */
 725   count = 1u;
 726
 727   /* Working pointer of inputA */
 728   px = pIn1;
 729
 730   /* Working pointer of inputB */
 731   pSrc1 = pIn2 + (srcBLen - 1u);
 732   py = pSrc1;
 733
 734   /* ------------------------
 735    * Stage1 process
 736    * ----------------------*/
 737
 738   /* The first loop starts here */
 739   while(blockSize1 > 0u)
 740   {
 741     /* Accumulator is made zero for every iteration */
 742     sum = 0;
 743
 744     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 745     k = count >> 2;
 746
 747     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 748      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 749     while(k > 0u)
 750     {
 751       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
 752         sum += ((q31_t) * px++ * *py++);
 753         sum += ((q31_t) * px++ * *py++);
 754         sum += ((q31_t) * px++ * *py++);
 755         sum += ((q31_t) * px++ * *py++);
 756
 757       /* Decrement the loop counter */
 758       k--;
 759     }
 760
 761     /* If the count is not a multiple of 4, compute any remaining MACs here.
 762      ** No loop unrolling is used. */
 763     k = count % 0x4u;
 764
 765     while(k > 0u)
 766     {
 767       /* Perform the multiply-accumulates */
 768       /* x[0] * y[srcBLen - 1] */
 769         sum += ((q31_t) * px++ * *py++);
 770
 771       /* Decrement the loop counter */
 772       k--;
 773     }
 774
 775     /* Store the result in the accumulator in the destination buffer. */
 776     *pOut = (q15_t) (sum >> 15);
 777     /* Destination pointer is updated according to the address modifier, inc */
 778     pOut += inc;
 779
 780     /* Update the inputA and inputB pointers for next MAC calculation */
 781     py = pSrc1 - count;
 782     px = pIn1;
 783
 784     /* Increment the MAC count */
 785     count++;
 786
 787     /* Decrement the loop counter */
 788     blockSize1--;
 789   }
 790
 791   /* --------------------------
 792    * Initializations of stage2
 793    * ------------------------*/
 794
 795   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 796    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 797    * ....
 798    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 799    */
 800
 801   /* Working pointer of inputA */
 802   px = pIn1;
 803
 804   /* Working pointer of inputB */
 805   py = pIn2;
 806
 807   /* count is index by which the pointer pIn1 to be incremented */
 808   count = 0u;
 809
 810   /* -------------------
 811    * Stage2 process
 812    * ------------------*/
 813
 814   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 815    * So, to loop unroll over blockSize2,
 816    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
 817   if(srcBLen >= 4u)
 818   {
 819     /* Loop unroll over blockSize2, by 4 */
 820     blkCnt = blockSize2 >> 2u;
 821
 822     while(blkCnt > 0u)
 823     {
 824       /* Set all accumulators to zero */
 825       acc0 = 0;
 826       acc1 = 0;
 827       acc2 = 0;
 828       acc3 = 0;
 829
 830       /* read x[0], x[1], x[2] samples */
 831           a = *px;
 832           b = *(px + 1);
 833
 834 #ifndef ARM_MATH_BIG_ENDIAN
 835
 836           x0 = __PKHBT(a, b, 16);
 837           a = *(px + 2);
 838           x1 = __PKHBT(b, a, 16);
 839
 840 #else
 841
 842           x0 = __PKHBT(b, a, 16);
 843           a = *(px + 2);
 844           x1 = __PKHBT(a, b, 16);
 845
 846 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 847
 848           px += 2u;
 849
 850       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 851       k = srcBLen >> 2u;
 852
 853       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 854        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 855       do
 856       {
 857         /* Read the first two inputB samples using SIMD:
 858          * y[0] and y[1] */
 859                   a = *py;
 860                   b = *(py + 1);
 861
 862 #ifndef ARM_MATH_BIG_ENDIAN
 863
 864                   c0 = __PKHBT(a, b, 16);
 865
 866 #else
 867
 868                   c0 = __PKHBT(b, a, 16);
 869
 870 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 871
 872         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
 873         acc0 = __SMLAD(x0, c0, acc0);
 874
 875         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
 876         acc1 = __SMLAD(x1, c0, acc1);
 877
 878         /* Read x[2], x[3], x[4] */
 879                 a = *px;
 880                 b = *(px + 1);
 881
 882 #ifndef ARM_MATH_BIG_ENDIAN
 883
 884                 x2 = __PKHBT(a, b, 16);
 885                 a = *(px + 2);
 886                 x3 = __PKHBT(b, a, 16);
 887
 888 #else
 889
 890                 x2 = __PKHBT(b, a, 16);
 891                 a = *(px + 2);
 892                 x3 = __PKHBT(a, b, 16);
 893
 894 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 895
 896         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
 897         acc2 = __SMLAD(x2, c0, acc2);
 898
 899         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
 900         acc3 = __SMLAD(x3, c0, acc3);
 901
 902         /* Read y[2] and y[3] */
 903                   a = *(py + 2);
 904                   b = *(py + 3);
 905
 906                   py += 4u;
 907
 908 #ifndef ARM_MATH_BIG_ENDIAN
 909
 910                   c0 = __PKHBT(a, b, 16);
 911
 912 #else
 913
 914                   c0 = __PKHBT(b, a, 16);
 915
 916 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 917
 918         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
 919         acc0 = __SMLAD(x2, c0, acc0);
 920
 921         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
 922         acc1 = __SMLAD(x3, c0, acc1);
 923
 924         /* Read x[4], x[5], x[6] */
 925                 a = *(px + 2);
 926                 b = *(px + 3);
 927
 928 #ifndef ARM_MATH_BIG_ENDIAN
 929
 930                 x0 = __PKHBT(a, b, 16);
 931                 a = *(px + 4);
 932                 x1 = __PKHBT(b, a, 16);
 933
 934 #else
 935
 936                 x0 = __PKHBT(b, a, 16);
 937                 a = *(px + 4);
 938                 x1 = __PKHBT(a, b, 16);
 939
 940 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 941
 942                 px += 4u;
 943
 944         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
 945         acc2 = __SMLAD(x0, c0, acc2);
 946
 947         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
 948         acc3 = __SMLAD(x1, c0, acc3);
 949
 950       } while(--k);
 951
 952       /* For the next MAC operations, SIMD is not used
 953        * So, the 16 bit pointer of inputB, py, is updated */
 954
 955       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 956        ** No loop unrolling is used. */
 957       k = srcBLen % 0x4u;
 958
 959       if(k == 1u)
 960       {
 961         /* Read y[4] */
 962         c0 = *py;
 963 #ifdef  ARM_MATH_BIG_ENDIAN
 964
 965         c0 = c0 << 16u;
 966
 967 #else
 968
 969         c0 = c0 & 0x0000FFFF;
 970
 971 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 972
 973         /* Read x[7] */
 974                 a = *px;
 975                 b = *(px + 1);
 976
 977                 px++;;
 978
 979 #ifndef ARM_MATH_BIG_ENDIAN
 980
 981                 x3 = __PKHBT(a, b, 16);
 982
 983 #else
 984
 985                 x3 = __PKHBT(b, a, 16);
 986
 987 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 988
 989                 px++;
 990
 991         /* Perform the multiply-accumulates */
 992         acc0 = __SMLAD(x0, c0, acc0);
 993         acc1 = __SMLAD(x1, c0, acc1);
 994         acc2 = __SMLADX(x1, c0, acc2);
 995         acc3 = __SMLADX(x3, c0, acc3);
 996       }
 997
 998       if(k == 2u)
 999       {
1000         /* Read y[4], y[5] */
1001                   a = *py;
1002                   b = *(py + 1);
1003
1004 #ifndef ARM_MATH_BIG_ENDIAN
1005
1006                   c0 = __PKHBT(a, b, 16);
1007
1008 #else
1009
1010                   c0 = __PKHBT(b, a, 16);
1011
1012 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1013
1014         /* Read x[7], x[8], x[9] */
1015                 a = *px;
1016                 b = *(px + 1);
1017
1018 #ifndef ARM_MATH_BIG_ENDIAN
1019
1020                 x3 = __PKHBT(a, b, 16);
1021                 a = *(px + 2);
1022                 x2 = __PKHBT(b, a, 16);
1023
1024 #else
1025
1026                 x3 = __PKHBT(b, a, 16);
1027                 a = *(px + 2);
1028                 x2 = __PKHBT(a, b, 16);
1029
1030 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1031
1032                 px += 2u;
1033
1034         /* Perform the multiply-accumulates */
1035         acc0 = __SMLAD(x0, c0, acc0);
1036         acc1 = __SMLAD(x1, c0, acc1);
1037         acc2 = __SMLAD(x3, c0, acc2);
1038         acc3 = __SMLAD(x2, c0, acc3);
1039       }
1040
1041       if(k == 3u)
1042       {
1043         /* Read y[4], y[5] */
1044                   a = *py;
1045                   b = *(py + 1);
1046
1047 #ifndef ARM_MATH_BIG_ENDIAN
1048
1049                   c0 = __PKHBT(a, b, 16);
1050
1051 #else
1052
1053                   c0 = __PKHBT(b, a, 16);
1054
1055 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1056
1057                 py += 2u;
1058
1059         /* Read x[7], x[8], x[9] */
1060                 a = *px;
1061                 b = *(px + 1);
1062
1063 #ifndef ARM_MATH_BIG_ENDIAN
1064
1065                 x3 = __PKHBT(a, b, 16);
1066                 a = *(px + 2);
1067                 x2 = __PKHBT(b, a, 16);
1068
1069 #else
1070
1071                 x3 = __PKHBT(b, a, 16);
1072                 a = *(px + 2);
1073                 x2 = __PKHBT(a, b, 16);
1074
1075 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1076
1077         /* Perform the multiply-accumulates */
1078         acc0 = __SMLAD(x0, c0, acc0);
1079         acc1 = __SMLAD(x1, c0, acc1);
1080         acc2 = __SMLAD(x3, c0, acc2);
1081         acc3 = __SMLAD(x2, c0, acc3);
1082
1083         c0 = (*py);
1084         /* Read y[6] */
1085 #ifdef  ARM_MATH_BIG_ENDIAN
1086
1087         c0 = c0 << 16u;
1088 #else
1089
1090         c0 = c0 & 0x0000FFFF;
1091 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
1092
1093         /* Read x[10] */
1094                 b = *(px + 3);
1095
1096 #ifndef ARM_MATH_BIG_ENDIAN
1097
1098                 x3 = __PKHBT(a, b, 16);
1099
1100 #else
1101
1102                 x3 = __PKHBT(b, a, 16);
1103
1104 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1105
1106                 px += 3u;
1107
1108         /* Perform the multiply-accumulates */
1109         acc0 = __SMLADX(x1, c0, acc0);
1110         acc1 = __SMLAD(x2, c0, acc1);
1111         acc2 = __SMLADX(x2, c0, acc2);
1112         acc3 = __SMLADX(x3, c0, acc3);
1113       }
1114
1115       /* Store the result in the accumulator in the destination buffer. */
1116       *pOut = (q15_t) (acc0 >> 15);
1117       /* Destination pointer is updated according to the address modifier, inc */
1118       pOut += inc;
1119
1120       *pOut = (q15_t) (acc1 >> 15);
1121       pOut += inc;
1122
1123       *pOut = (q15_t) (acc2 >> 15);
1124       pOut += inc;
1125
1126       *pOut = (q15_t) (acc3 >> 15);
1127       pOut += inc;
1128
1129       /* Four outputs were produced, so advance the pIn1 index, count, by 4 */
1130       count += 4u;
1131
1132       /* Update the inputA and inputB pointers for next MAC calculation */
1133       px = pIn1 + count;
1134       py = pIn2;
1135
1136
1137       /* Decrement the loop counter */
1138       blkCnt--;
1139     }
1140
1141     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1142      ** No loop unrolling is used. */
1143     blkCnt = blockSize2 % 0x4u;
1144
1145     while(blkCnt > 0u)
1146     {
1147       /* Accumulator is made zero for every iteration */
1148       sum = 0;
1149
1150       /* Apply loop unrolling and compute 4 MACs simultaneously. */
1151       k = srcBLen >> 2u;
1152
1153       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
1154        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1155       while(k > 0u)
1156       {
1157         /* Perform the multiply-accumulates */
1158         sum += ((q31_t) * px++ * *py++);
1159         sum += ((q31_t) * px++ * *py++);
1160         sum += ((q31_t) * px++ * *py++);
1161         sum += ((q31_t) * px++ * *py++);
1162
1163         /* Decrement the loop counter */
1164         k--;
1165       }
1166
1167       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1168        ** No loop unrolling is used. */
1169       k = srcBLen % 0x4u;
1170
1171       while(k > 0u)
1172       {
1173         /* Perform the multiply-accumulates */
1174         sum += ((q31_t) * px++ * *py++);
1175
1176         /* Decrement the loop counter */
1177         k--;
1178       }
1179
1180       /* Store the result in the accumulator in the destination buffer. */
1181       *pOut = (q15_t) (sum >> 15);
1182       /* Destination pointer is updated according to the address modifier, inc */
1183       pOut += inc;
1184
1185       /* Increment the pointer pIn1 index, count by 1 */
1186       count++;
1187
1188       /* Update the inputA and inputB pointers for next MAC calculation */
1189       px = pIn1 + count;
1190       py = pIn2;
1191
1192       /* Decrement the loop counter */
1193       blkCnt--;
1194     }
1195   }
1196   else
1197   {
1198     /* If the srcBLen is not a multiple of 4,
1199      * the blockSize2 loop cannot be unrolled by 4 */
1200     blkCnt = blockSize2;
1201
1202     while(blkCnt > 0u)
1203     {
1204       /* Accumulator is made zero for every iteration */
1205       sum = 0;
1206
1207       /* Loop over srcBLen */
1208       k = srcBLen;
1209
1210       while(k > 0u)
1211       {
1212         /* Perform the multiply-accumulate */
1213         sum += ((q31_t) * px++ * *py++);
1214
1215         /* Decrement the loop counter */
1216         k--;
1217       }
1218
1219       /* Store the result in the accumulator in the destination buffer. */
1220       *pOut = (q15_t) (sum >> 15);
1221       /* Destination pointer is updated according to the address modifier, inc */
1222       pOut += inc;
1223
1224       /* Increment the MAC count */
1225       count++;
1226
1227       /* Update the inputA and inputB pointers for next MAC calculation */
1228       px = pIn1 + count;
1229       py = pIn2;
1230
1231       /* Decrement the loop counter */
1232       blkCnt--;
1233     }
1234   }
1235
1236   /* --------------------------
1237    * Initializations of stage3
1238    * -------------------------*/
1239
1240   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
1241    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
1242    * ....
1243    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
1244    * sum +=  x[srcALen-1] * y[0]
1245    */
1246
1247   /* In this stage the MAC operations are decreased by 1 for every iteration.
1248      The count variable holds the number of MAC operations performed */
1249   count = srcBLen - 1u;
1250
1251   /* Working pointer of inputA */
1252   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1253   px = pSrc1;
1254
1255   /* Working pointer of inputB */
1256   py = pIn2;
1257
1258   /* -------------------
1259    * Stage3 process
1260    * ------------------*/
1261
1262   while(blockSize3 > 0u)
1263   {
1264     /* Accumulator is made zero for every iteration */
1265     sum = 0;
1266
1267     /* Apply loop unrolling and compute 4 MACs simultaneously. */
1268     k = count >> 2u;
1269
1270     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
1271      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1272     while(k > 0u)
1273     {
1274       /* Perform the multiply-accumulates */
1275         sum += ((q31_t) * px++ * *py++);
1276         sum += ((q31_t) * px++ * *py++);
1277         sum += ((q31_t) * px++ * *py++);
1278         sum += ((q31_t) * px++ * *py++);
1279
1280       /* Decrement the loop counter */
1281       k--;
1282     }
1283
1284     /* If the count is not a multiple of 4, compute any remaining MACs here.
1285      ** No loop unrolling is used. */
1286     k = count % 0x4u;
1287
1288     while(k > 0u)
1289     {
1290       /* Perform the multiply-accumulates */
1291         sum += ((q31_t) * px++ * *py++);
1292
1293       /* Decrement the loop counter */
1294       k--;
1295     }
1296
1297     /* Store the result in the accumulator in the destination buffer. */
1298     *pOut = (q15_t) (sum >> 15);
1299     /* Destination pointer is updated according to the address modifier, inc */
1300     pOut += inc;
1301
1302     /* Update the inputA and inputB pointers for next MAC calculation */
1303     px = ++pSrc1;
1304     py = pIn2;
1305
1306     /* Decrement the MAC count */
1307     count--;
1308
1309     /* Decrement the loop counter */
1310     blockSize3--;
1311   }
1312
1313 #endif /*   #ifndef UNALIGNED_SUPPORT_DISABLE */
1314
1315 }
1316
1317 /**
1318  * @} end of Corr group
1319  */