tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_fast_q15.c
   9 *
  10 * Description:  Fast Q15 Convolution.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup Conv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  54  * @param[in] *pSrcA points to the first input sequence.
  55  * @param[in] srcALen length of the first input sequence.
  56  * @param[in] *pSrcB points to the second input sequence.
  57  * @param[in] srcBLen length of the second input sequence.
  58  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
  59  * @return none.
  60  *
  61  * <b>Scaling and Overflow Behavior:</b>
  62  *
  63  * \par
  64  * This fast version uses a 32-bit accumulator with 2.30 format.
  65  * The accumulator maintains full precision of the intermediate multiplication results
  66  * but provides only a single guard bit. There is no saturation on intermediate additions.
  67  * Thus, if the accumulator overflows it wraps around and distorts the result.
  68  * The input signals should be scaled down to avoid intermediate overflows.
  69  * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 being the base-2 logarithm) to avoid overflows,
  70  * as at most min(srcALen, srcBLen) additions are carried out internally.
  71  * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
  72  *
  73  * \par
  74  * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
  75  */
  76
  77 void arm_conv_fast_q15(
  78   q15_t * pSrcA,
  79   uint32_t srcALen,
  80   q15_t * pSrcB,
  81   uint32_t srcBLen,
  82   q15_t * pDst)
  83 {
  84 #ifndef UNALIGNED_SUPPORT_DISABLE
  85   q15_t *pIn1;                                   /* inputA pointer */
  86   q15_t *pIn2;                                   /* inputB pointer */
  87   q15_t *pOut = pDst;                            /* output pointer */
  88   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
  89   q15_t *px;                                     /* Intermediate inputA pointer  */
  90   q15_t *py;                                     /* Intermediate inputB pointer  */
  91   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
  92   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
  93   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */
  94
  95   /* The algorithm implementation is based on the lengths of the inputs. */
  96   /* srcB is always made to slide across srcA. */
  97   /* So srcBLen is always considered as shorter or equal to srcALen */
  98   if(srcALen >= srcBLen)
  99   {
 100     /* Initialization of inputA pointer */
 101     pIn1 = pSrcA;
 102
 103     /* Initialization of inputB pointer */
 104     pIn2 = pSrcB;
 105   }
 106   else
 107   {
 108     /* Initialization of inputA pointer */
 109     pIn1 = pSrcB;
 110
 111     /* Initialization of inputB pointer */
 112     pIn2 = pSrcA;
 113
 114     /* srcBLen is always considered as shorter or equal to srcALen */
 115     j = srcBLen;
 116     srcBLen = srcALen;
 117     srcALen = j;
 118   }
 119
 120   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 121   /* The function is internally
 122    * divided into three stages according to the number of multiplications that has to be
 123    * taken place between inputA samples and inputB samples. In the first stage of the
 124    * algorithm, the multiplications increase by one for every iteration.
 125    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 126    * In the third stage of the algorithm, the multiplications decrease by one
 127    * for every iteration. */
 128
 129   /* The algorithm is implemented in three stages.
 130      The loop counters of each stage is initiated here. */
 131   blockSize1 = srcBLen - 1u;
 132   blockSize2 = srcALen - (srcBLen - 1u);
 133   blockSize3 = blockSize1;
 134
 135   /* --------------------------
 136    * Initializations of stage1
 137    * -------------------------*/
 138
 139   /* sum = x[0] * y[0]
 140    * sum = x[0] * y[1] + x[1] * y[0]
 141    * ....
 142    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 143    */
 144
 145   /* In this stage the MAC operations are increased by 1 for every iteration.
 146      The count variable holds the number of MAC operations performed */
 147   count = 1u;
 148
 149   /* Working pointer of inputA */
 150   px = pIn1;
 151
 152   /* Working pointer of inputB */
 153   py = pIn2;
 154
 155
 156   /* ------------------------
 157    * Stage1 process
 158    * ----------------------*/
 159
 160   /* For loop unrolling by 4, this stage is divided into two. */
 161   /* First part of this stage computes the MAC operations less than 4 */
 162   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 163
 164   /* The first part of the stage starts here */
 165   while((count < 4u) && (blockSize1 > 0u))
 166   {
 167     /* Accumulator is made zero for every iteration */
 168     sum = 0;
 169
 170     /* Loop over number of MAC operations between
 171      * inputA samples and inputB samples */
 172     k = count;
 173
 174     while(k > 0u)
 175     {
 176       /* Perform the multiply-accumulates */
 177       sum = __SMLAD(*px++, *py--, sum);
 178
 179       /* Decrement the loop counter */
 180       k--;
 181     }
 182
 183     /* Store the result in the accumulator in the destination buffer. */
 184     *pOut++ = (q15_t) (sum >> 15);
 185
 186     /* Update the inputA and inputB pointers for next MAC calculation */
 187     py = pIn2 + count;
 188     px = pIn1;
 189
 190     /* Increment the MAC count */
 191     count++;
 192
 193     /* Decrement the loop counter */
 194     blockSize1--;
 195   }
 196
 197   /* The second part of the stage starts here */
 198   /* The internal loop, over count, is unrolled by 4 */
 199   /* To, read the last two inputB samples using SIMD:
 200    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
 201   py = py - 1;
 202
 203   while(blockSize1 > 0u)
 204   {
 205     /* Accumulator is made zero for every iteration */
 206     sum = 0;
 207
 208     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 209     k = count >> 2u;
 210
 211     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 212      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 213     while(k > 0u)
 214     {
 215       /* Perform the multiply-accumulates */
 216       /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
 217       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 218       /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
 219       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 220
 221       /* Decrement the loop counter */
 222       k--;
 223     }
 224
 225     /* For the next MAC operations, the pointer py is used without SIMD
 226      * So, py is incremented by 1 */
 227     py = py + 1u;
 228
 229     /* If the count is not a multiple of 4, compute any remaining MACs here.
 230      ** No loop unrolling is used. */
 231     k = count % 0x4u;
 232
 233     while(k > 0u)
 234     {
 235       /* Perform the multiply-accumulates */
 236       sum = __SMLAD(*px++, *py--, sum);
 237
 238       /* Decrement the loop counter */
 239       k--;
 240     }
 241
 242     /* Store the result in the accumulator in the destination buffer. */
 243     *pOut++ = (q15_t) (sum >> 15);
 244
 245     /* Update the inputA and inputB pointers for next MAC calculation */
 246     py = pIn2 + (count - 1u);
 247     px = pIn1;
 248
 249     /* Increment the MAC count */
 250     count++;
 251
 252     /* Decrement the loop counter */
 253     blockSize1--;
 254   }
 255
 256   /* --------------------------
 257    * Initializations of stage2
 258    * ------------------------*/
 259
 260   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 261    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 262    * ....
 263    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 264    */
 265
 266   /* Working pointer of inputA */
 267   px = pIn1;
 268
 269   /* Working pointer of inputB */
 270   pSrc2 = pIn2 + (srcBLen - 1u);
 271   py = pSrc2;
 272
 273   /* count is the index by which the pointer pIn1 to be incremented */
 274   count = 0u;
 275
 276
 277   /* --------------------
 278    * Stage2 process
 279    * -------------------*/
 280
 281   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 282    * So, to loop unroll over blockSize2,
 283    * srcBLen should be greater than or equal to 4 */
 284   if(srcBLen >= 4u)
 285   {
 286     /* Loop unroll over blockSize2, by 4 */
 287     blkCnt = blockSize2 >> 2u;
 288
 289     while(blkCnt > 0u)
 290     {
 291       py = py - 1u;
 292
 293       /* Set all accumulators to zero */
 294       acc0 = 0;
 295       acc1 = 0;
 296       acc2 = 0;
 297       acc3 = 0;
 298
 299
 300       /* read x[0], x[1] samples */
 301       x0 = *__SIMD32(px);
 302       /* read x[1], x[2] samples */
 303       x1 = _SIMD32_OFFSET(px+1);
 304           px+= 2u;
 305
 306
 307       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 308       k = srcBLen >> 2u;
 309
 310       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 311        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 312       do
 313       {
 314         /* Read the last two inputB samples using SIMD:
 315          * y[srcBLen - 1] and y[srcBLen - 2] */
 316         c0 = *__SIMD32(py)--;
 317
 318         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 319         acc0 = __SMLADX(x0, c0, acc0);
 320
 321         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 322         acc1 = __SMLADX(x1, c0, acc1);
 323
 324         /* Read x[2], x[3] */
 325         x2 = *__SIMD32(px);
 326
 327         /* Read x[3], x[4] */
 328         x3 = _SIMD32_OFFSET(px+1);
 329
 330         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
 331         acc2 = __SMLADX(x2, c0, acc2);
 332
 333         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
 334         acc3 = __SMLADX(x3, c0, acc3);
 335
 336         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
 337         c0 = *__SIMD32(py)--;
 338
 339         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
 340         acc0 = __SMLADX(x2, c0, acc0);
 341
 342         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
 343         acc1 = __SMLADX(x3, c0, acc1);
 344
 345         /* Read x[4], x[5] */
 346         x0 = _SIMD32_OFFSET(px+2);
 347
 348         /* Read x[5], x[6] */
 349         x1 = _SIMD32_OFFSET(px+3);
 350                 px += 4u;
 351
 352         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
 353         acc2 = __SMLADX(x0, c0, acc2);
 354
 355         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
 356         acc3 = __SMLADX(x1, c0, acc3);
 357
 358       } while(--k);
 359
 360       /* For the next MAC operations, SIMD is not used
 361        * So, the 16 bit pointer if inputB, py is updated */
 362
 363       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 364        ** No loop unrolling is used. */
 365       k = srcBLen % 0x4u;
 366
 367       if(k == 1u)
 368       {
 369         /* Read y[srcBLen - 5] */
 370         c0 = *(py+1);
 371
 372 #ifdef  ARM_MATH_BIG_ENDIAN
 373
 374         c0 = c0 << 16u;
 375
 376 #else
 377
 378         c0 = c0 & 0x0000FFFF;
 379
 380 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 381
 382         /* Read x[7] */
 383         x3 = *__SIMD32(px);
 384                 px++;
 385
 386         /* Perform the multiply-accumulates */
 387         acc0 = __SMLAD(x0, c0, acc0);
 388         acc1 = __SMLAD(x1, c0, acc1);
 389         acc2 = __SMLADX(x1, c0, acc2);
 390         acc3 = __SMLADX(x3, c0, acc3);
 391       }
 392
 393       if(k == 2u)
 394       {
 395         /* Read y[srcBLen - 5], y[srcBLen - 6] */
 396         c0 = _SIMD32_OFFSET(py);
 397
 398         /* Read x[7], x[8] */
 399         x3 = *__SIMD32(px);
 400
 401         /* Read x[9] */
 402         x2 = _SIMD32_OFFSET(px+1);
 403                 px += 2u;
 404
 405         /* Perform the multiply-accumulates */
 406         acc0 = __SMLADX(x0, c0, acc0);
 407         acc1 = __SMLADX(x1, c0, acc1);
 408         acc2 = __SMLADX(x3, c0, acc2);
 409         acc3 = __SMLADX(x2, c0, acc3);
 410       }
 411
 412       if(k == 3u)
 413       {
 414         /* Read y[srcBLen - 5], y[srcBLen - 6] */
 415         c0 = _SIMD32_OFFSET(py);
 416
 417         /* Read x[7], x[8] */
 418         x3 = *__SIMD32(px);
 419
 420         /* Read x[9] */
 421         x2 = _SIMD32_OFFSET(px+1);
 422
 423         /* Perform the multiply-accumulates */
 424         acc0 = __SMLADX(x0, c0, acc0);
 425         acc1 = __SMLADX(x1, c0, acc1);
 426         acc2 = __SMLADX(x3, c0, acc2);
 427         acc3 = __SMLADX(x2, c0, acc3);
 428
 429         /* Read y[srcBLen - 7] */
 430                 c0 = *(py-1);
 431 #ifdef  ARM_MATH_BIG_ENDIAN
 432
 433         c0 = c0 << 16u;
 434 #else
 435
 436         c0 = c0 & 0x0000FFFF;
 437 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 438
 439         /* Read x[10] */
 440         x3 =  _SIMD32_OFFSET(px+2);
 441                 px += 3u;
 442
 443         /* Perform the multiply-accumulates */
 444         acc0 = __SMLADX(x1, c0, acc0);
 445         acc1 = __SMLAD(x2, c0, acc1);
 446         acc2 = __SMLADX(x2, c0, acc2);
 447         acc3 = __SMLADX(x3, c0, acc3);
 448       }
 449
 450       /* Store the results in the accumulators in the destination buffer. */
 451 #ifndef ARM_MATH_BIG_ENDIAN
 452
 453       *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
 454       *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
 455
 456 #else
 457
 458       *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
 459       *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
 460
 461 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 462
 463       /* Increment the pointer pIn1 index, count by 4 */
 464       count += 4u;
 465
 466       /* Update the inputA and inputB pointers for next MAC calculation */
 467       px = pIn1 + count;
 468       py = pSrc2;
 469
 470       /* Decrement the loop counter */
 471       blkCnt--;
 472     }
 473
 474     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 475      ** No loop unrolling is used. */
 476     blkCnt = blockSize2 % 0x4u;
 477
 478     while(blkCnt > 0u)
 479     {
 480       /* Accumulator is made zero for every iteration */
 481       sum = 0;
 482
 483       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 484       k = srcBLen >> 2u;
 485
 486       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 487        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 488       while(k > 0u)
 489       {
 490         /* Perform the multiply-accumulates */
 491         sum += ((q31_t) * px++ * *py--);
 492         sum += ((q31_t) * px++ * *py--);
 493         sum += ((q31_t) * px++ * *py--);
 494         sum += ((q31_t) * px++ * *py--);
 495
 496         /* Decrement the loop counter */
 497         k--;
 498       }
 499
 500       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 501        ** No loop unrolling is used. */
 502       k = srcBLen % 0x4u;
 503
 504       while(k > 0u)
 505       {
 506         /* Perform the multiply-accumulates */
 507         sum += ((q31_t) * px++ * *py--);
 508
 509         /* Decrement the loop counter */
 510         k--;
 511       }
 512
 513       /* Store the result in the accumulator in the destination buffer. */
 514       *pOut++ = (q15_t) (sum >> 15);
 515
 516       /* Increment the pointer pIn1 index, count by 1 */
 517       count++;
 518
 519       /* Update the inputA and inputB pointers for next MAC calculation */
 520       px = pIn1 + count;
 521       py = pSrc2;
 522
 523       /* Decrement the loop counter */
 524       blkCnt--;
 525     }
 526   }
 527   else
 528   {
 529     /* If the srcBLen is not a multiple of 4,
 530      * the blockSize2 loop cannot be unrolled by 4 */
 531     blkCnt = blockSize2;
 532
 533     while(blkCnt > 0u)
 534     {
 535       /* Accumulator is made zero for every iteration */
 536       sum = 0;
 537
 538       /* srcBLen number of MACS should be performed */
 539       k = srcBLen;
 540
 541       while(k > 0u)
 542       {
 543         /* Perform the multiply-accumulate */
 544         sum += ((q31_t) * px++ * *py--);
 545
 546         /* Decrement the loop counter */
 547         k--;
 548       }
 549
 550       /* Store the result in the accumulator in the destination buffer. */
 551       *pOut++ = (q15_t) (sum >> 15);
 552
 553       /* Increment the MAC count */
 554       count++;
 555
 556       /* Update the inputA and inputB pointers for next MAC calculation */
 557       px = pIn1 + count;
 558       py = pSrc2;
 559
 560       /* Decrement the loop counter */
 561       blkCnt--;
 562     }
 563   }
 564
 565
 566   /* --------------------------
 567    * Initializations of stage3
 568    * -------------------------*/
 569
 570   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 571    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 572    * ....
 573    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 574    * sum +=  x[srcALen-1] * y[srcBLen-1]
 575    */
 576
 577   /* In this stage the MAC operations are decreased by 1 for every iteration.
 578      The blockSize3 variable holds the number of MAC operations performed */
 579
 580   /* Working pointer of inputA */
 581   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 582   px = pSrc1;
 583
 584   /* Working pointer of inputB */
 585   pSrc2 = pIn2 + (srcBLen - 1u);
 586   pIn2 = pSrc2 - 1u;
 587   py = pIn2;
 588
 589   /* -------------------
 590    * Stage3 process
 591    * ------------------*/
 592
 593   /* For loop unrolling by 4, this stage is divided into two. */
 594   /* First part of this stage computes the MAC operations greater than 4 */
 595   /* Second part of this stage computes the MAC operations less than or equal to 4 */
 596
 597   /* The first part of the stage starts here */
 598   j = blockSize3 >> 2u;
 599
 600   while((j > 0u) && (blockSize3 > 0u))
 601   {
 602     /* Accumulator is made zero for every iteration */
 603     sum = 0;
 604
 605     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 606     k = blockSize3 >> 2u;
 607
 608     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 609      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 610     while(k > 0u)
 611     {
 612       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
 613        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
 614       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 615       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
 616        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
 617       sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 618
 619       /* Decrement the loop counter */
 620       k--;
 621     }
 622
 623     /* For the next MAC operations, the pointer py is used without SIMD
 624      * So, py is incremented by 1 */
 625     py = py + 1u;
 626
 627     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 628      ** No loop unrolling is used. */
 629     k = blockSize3 % 0x4u;
 630
 631     while(k > 0u)
 632     {
 633       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
 634       sum = __SMLAD(*px++, *py--, sum);
 635
 636       /* Decrement the loop counter */
 637       k--;
 638     }
 639
 640     /* Store the result in the accumulator in the destination buffer. */
 641     *pOut++ = (q15_t) (sum >> 15);
 642
 643     /* Update the inputA and inputB pointers for next MAC calculation */
 644     px = ++pSrc1;
 645     py = pIn2;
 646
 647     /* Decrement the loop counter */
 648     blockSize3--;
 649
 650     j--;
 651   }
 652
 653   /* The second part of the stage starts here */
 654   /* SIMD is not used for the next MAC operations,
 655    * so pointer py is updated to read only one sample at a time */
 656   py = py + 1u;
 657
 658   while(blockSize3 > 0u)
 659   {
 660     /* Accumulator is made zero for every iteration */
 661     sum = 0;
 662
 663     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 664     k = blockSize3;
 665
 666     while(k > 0u)
 667     {
 668       /* Perform the multiply-accumulates */
 669       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 670       sum = __SMLAD(*px++, *py--, sum);
 671
 672       /* Decrement the loop counter */
 673       k--;
 674     }
 675
 676     /* Store the result in the accumulator in the destination buffer. */
 677     *pOut++ = (q15_t) (sum >> 15);
 678
 679     /* Update the inputA and inputB pointers for next MAC calculation */
 680     px = ++pSrc1;
 681     py = pSrc2;
 682
 683     /* Decrement the loop counter */
 684     blockSize3--;
 685   }
 686
 687 #else
 688   q15_t *pIn1;                                   /* inputA pointer */
 689   q15_t *pIn2;                                   /* inputB pointer */
 690   q15_t *pOut = pDst;                            /* output pointer */
 691   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
 692   q15_t *px;                                     /* Intermediate inputA pointer  */
 693   q15_t *py;                                     /* Intermediate inputB pointer  */
 694   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
 695   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
 696   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* loop counter */
 697   q15_t a, b;
 698
 699   /* The algorithm implementation is based on the lengths of the inputs. */
 700   /* srcB is always made to slide across srcA. */
 701   /* So srcBLen is always considered as shorter or equal to srcALen */
 702   if(srcALen >= srcBLen)
 703   {
 704     /* Initialization of inputA pointer */
 705     pIn1 = pSrcA;
 706
 707     /* Initialization of inputB pointer */
 708     pIn2 = pSrcB;
 709   }
 710   else
 711   {
 712     /* Initialization of inputA pointer */
 713     pIn1 = pSrcB;
 714
 715     /* Initialization of inputB pointer */
 716     pIn2 = pSrcA;
 717
 718     /* srcBLen is always considered as shorter or equal to srcALen */
 719     j = srcBLen;
 720     srcBLen = srcALen;
 721     srcALen = j;
 722   }
 723
 724   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 725   /* The function is internally
 726    * divided into three stages according to the number of multiplications that has to be
 727    * taken place between inputA samples and inputB samples. In the first stage of the
 728    * algorithm, the multiplications increase by one for every iteration.
 729    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 730    * In the third stage of the algorithm, the multiplications decrease by one
 731    * for every iteration. */
 732
 733   /* The algorithm is implemented in three stages.
 734      The loop counters of each stage is initiated here. */
 735   blockSize1 = srcBLen - 1u;
 736   blockSize2 = srcALen - (srcBLen - 1u);
 737   blockSize3 = blockSize1;
 738
 739   /* --------------------------
 740    * Initializations of stage1
 741    * -------------------------*/
 742
 743   /* sum = x[0] * y[0]
 744    * sum = x[0] * y[1] + x[1] * y[0]
 745    * ....
 746    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 747    */
 748
 749   /* In this stage the MAC operations are increased by 1 for every iteration.
 750      The count variable holds the number of MAC operations performed */
 751   count = 1u;
 752
 753   /* Working pointer of inputA */
 754   px = pIn1;
 755
 756   /* Working pointer of inputB */
 757   py = pIn2;
 758
 759
 760   /* ------------------------
 761    * Stage1 process
 762    * ----------------------*/
 763
 764   /* For loop unrolling by 4, this stage is divided into two. */
 765   /* First part of this stage computes the MAC operations less than 4 */
 766   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 767
 768   /* The first part of the stage starts here */
 769   while((count < 4u) && (blockSize1 > 0u))
 770   {
 771     /* Accumulator is made zero for every iteration */
 772     sum = 0;
 773
 774     /* Loop over number of MAC operations between
 775      * inputA samples and inputB samples */
 776     k = count;
 777
 778     while(k > 0u)
 779     {
 780       /* Perform the multiply-accumulates */
 781       sum += ((q31_t) * px++ * *py--);
 782
 783       /* Decrement the loop counter */
 784       k--;
 785     }
 786
 787     /* Store the result in the accumulator in the destination buffer. */
 788     *pOut++ = (q15_t) (sum >> 15);
 789
 790     /* Update the inputA and inputB pointers for next MAC calculation */
 791     py = pIn2 + count;
 792     px = pIn1;
 793
 794     /* Increment the MAC count */
 795     count++;
 796
 797     /* Decrement the loop counter */
 798     blockSize1--;
 799   }
 800
 801   /* The second part of the stage starts here */
 802   /* The internal loop, over count, is unrolled by 4 */
 803   /* To, read the last two inputB samples using SIMD:
 804    * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
 805   py = py - 1;
 806
 807   while(blockSize1 > 0u)
 808   {
 809     /* Accumulator is made zero for every iteration */
 810     sum = 0;
 811
 812     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 813     k = count >> 2u;
 814
 815     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 816      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 817         py++;
 818
 819     while(k > 0u)
 820     {
 821       /* Perform the multiply-accumulates */
 822         sum += ((q31_t) * px++ * *py--);
 823         sum += ((q31_t) * px++ * *py--);
 824         sum += ((q31_t) * px++ * *py--);
 825         sum += ((q31_t) * px++ * *py--);
 826
 827       /* Decrement the loop counter */
 828       k--;
 829     }
 830
 831     /* If the count is not a multiple of 4, compute any remaining MACs here.
 832      ** No loop unrolling is used. */
 833     k = count % 0x4u;
 834
 835     while(k > 0u)
 836     {
 837       /* Perform the multiply-accumulates */
 838       sum += ((q31_t) * px++ * *py--);
 839
 840       /* Decrement the loop counter */
 841       k--;
 842     }
 843
 844     /* Store the result in the accumulator in the destination buffer. */
 845     *pOut++ = (q15_t) (sum >> 15);
 846
 847     /* Update the inputA and inputB pointers for next MAC calculation */
 848     py = pIn2 + (count - 1u);
 849     px = pIn1;
 850
 851     /* Increment the MAC count */
 852     count++;
 853
 854     /* Decrement the loop counter */
 855     blockSize1--;
 856   }
 857
 858   /* --------------------------
 859    * Initializations of stage2
 860    * ------------------------*/
 861
 862   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 863    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 864    * ....
 865    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 866    */
 867
 868   /* Working pointer of inputA */
 869   px = pIn1;
 870
 871   /* Working pointer of inputB */
 872   pSrc2 = pIn2 + (srcBLen - 1u);
 873   py = pSrc2;
 874
 875   /* count is the index by which the pointer pIn1 to be incremented */
 876   count = 0u;
 877
 878
 879   /* --------------------
 880    * Stage2 process
 881    * -------------------*/
 882
 883   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 884    * So, to loop unroll over blockSize2,
 885    * srcBLen should be greater than or equal to 4 */
 886   if(srcBLen >= 4u)
 887   {
 888     /* Loop unroll over blockSize2, by 4 */
 889     blkCnt = blockSize2 >> 2u;
 890
 891     while(blkCnt > 0u)
 892     {
 893       py = py - 1u;
 894
 895       /* Set all accumulators to zero */
 896       acc0 = 0;
 897       acc1 = 0;
 898       acc2 = 0;
 899       acc3 = 0;
 900
 901       /* read x[0], x[1] samples */
 902           a = *px++;
 903           b = *px++;
 904
 905 #ifndef ARM_MATH_BIG_ENDIAN
 906
 907           x0 = __PKHBT(a, b, 16);
 908           a = *px;
 909           x1 = __PKHBT(b, a, 16);
 910
 911 #else
 912
 913           x0 = __PKHBT(b, a, 16);
 914           a = *px;
 915           x1 = __PKHBT(a, b, 16);
 916
 917 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
 918
 919       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 920       k = srcBLen >> 2u;
 921
 922       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 923        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 924       do
 925       {
 926         /* Read the last two inputB samples using SIMD:
 927          * y[srcBLen - 1] and y[srcBLen - 2] */
 928                 a = *py;
 929                 b = *(py+1);
 930                 py -= 2;
 931
 932 #ifndef ARM_MATH_BIG_ENDIAN
 933
 934                 c0 = __PKHBT(a, b, 16);
 935
 936 #else
 937
 938                 c0 = __PKHBT(b, a, 16);;
 939
 940 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 941
 942         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 943         acc0 = __SMLADX(x0, c0, acc0);
 944
 945         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 946         acc1 = __SMLADX(x1, c0, acc1);
 947
 948           a = *px;
 949           b = *(px + 1);
 950
 951 #ifndef ARM_MATH_BIG_ENDIAN
 952
 953           x2 = __PKHBT(a, b, 16);
 954           a = *(px + 2);
 955           x3 = __PKHBT(b, a, 16);
 956
 957 #else
 958
 959           x2 = __PKHBT(b, a, 16);
 960           a = *(px + 2);
 961           x3 = __PKHBT(a, b, 16);
 962
 963 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
 964
 965         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
 966         acc2 = __SMLADX(x2, c0, acc2);
 967
 968         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
 969         acc3 = __SMLADX(x3, c0, acc3);
 970
 971         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
 972                 a = *py;
 973                 b = *(py+1);
 974                 py -= 2;
 975
 976 #ifndef ARM_MATH_BIG_ENDIAN
 977
 978                 c0 = __PKHBT(a, b, 16);
 979
 980 #else
 981
 982                 c0 = __PKHBT(b, a, 16);;
 983
 984 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 985
 986         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
 987         acc0 = __SMLADX(x2, c0, acc0);
 988
 989         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
 990         acc1 = __SMLADX(x3, c0, acc1);
 991
 992         /* Read x[4], x[5], x[6] */
 993           a = *(px + 2);
 994           b = *(px + 3);
 995
 996 #ifndef ARM_MATH_BIG_ENDIAN
 997
 998           x0 = __PKHBT(a, b, 16);
 999           a = *(px + 4);
1000           x1 = __PKHBT(b, a, 16);
1001
1002 #else
1003
1004           x0 = __PKHBT(b, a, 16);
1005           a = *(px + 4);
1006           x1 = __PKHBT(a, b, 16);
1007
1008 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
1009
1010                 px += 4u;
1011
1012         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
1013         acc2 = __SMLADX(x0, c0, acc2);
1014
1015         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
1016         acc3 = __SMLADX(x1, c0, acc3);
1017
1018       } while(--k);
1019
1020       /* The remaining MAC operations below read inputB through the
1021        * 16 bit pointer py using fixed offsets; py itself is not advanced */
1022
1023       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1024        ** No loop unrolling is used. */
1025       k = srcBLen % 0x4u;
1026
1027       if(k == 1u)
1028       {
1029         /* Read y[srcBLen - 5] */
1030         c0 = *(py+1);
1031
1032 #ifdef  ARM_MATH_BIG_ENDIAN
1033
1034         c0 = c0 << 16u;
1035
1036 #else
1037
1038         c0 = c0 & 0x0000FFFF;
1039
1040 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
1041
1042         /* Read x[7] */
1043                 a = *px;
1044                 b = *(px+1);
1045                 px++;
1046
1047 #ifndef ARM_MATH_BIG_ENDIAN
1048
1049                 x3 = __PKHBT(a, b, 16);
1050
1051 #else
1052
1053                 x3 = __PKHBT(b, a, 16);;
1054
1055 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1056
1057
1058         /* Perform the multiply-accumulates */
1059         acc0 = __SMLAD(x0, c0, acc0);
1060         acc1 = __SMLAD(x1, c0, acc1);
1061         acc2 = __SMLADX(x1, c0, acc2);
1062         acc3 = __SMLADX(x3, c0, acc3);
1063       }
1064
1065       if(k == 2u)
1066       {
1067         /* Read y[srcBLen - 5], y[srcBLen - 6] */
1068                 a = *py;
1069                 b = *(py+1);
1070
1071 #ifndef ARM_MATH_BIG_ENDIAN
1072
1073                 c0 = __PKHBT(a, b, 16);
1074
1075 #else
1076
1077                 c0 = __PKHBT(b, a, 16);;
1078
1079 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1080
1081         /* Read x[7], x[8], x[9] */
1082           a = *px;
1083           b = *(px + 1);
1084
1085 #ifndef ARM_MATH_BIG_ENDIAN
1086
1087           x3 = __PKHBT(a, b, 16);
1088           a = *(px + 2);
1089           x2 = __PKHBT(b, a, 16);
1090
1091 #else
1092
1093           x3 = __PKHBT(b, a, 16);
1094           a = *(px + 2);
1095           x2 = __PKHBT(a, b, 16);
1096
1097 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
1098                 px += 2u;
1099
1100         /* Perform the multiply-accumulates */
1101         acc0 = __SMLADX(x0, c0, acc0);
1102         acc1 = __SMLADX(x1, c0, acc1);
1103         acc2 = __SMLADX(x3, c0, acc2);
1104         acc3 = __SMLADX(x2, c0, acc3);
1105       }
1106
1107       if(k == 3u)
1108       {
1109         /* Read y[srcBLen - 5], y[srcBLen - 6] */
1110                 a = *py;
1111                 b = *(py+1);
1112
1113 #ifndef ARM_MATH_BIG_ENDIAN
1114
1115                 c0 = __PKHBT(a, b, 16);
1116
1117 #else
1118
1119                 c0 = __PKHBT(b, a, 16);;
1120
1121 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1122
1123         /* Read x[7], x[8], x[9] */
1124           a = *px;
1125           b = *(px + 1);
1126
1127 #ifndef ARM_MATH_BIG_ENDIAN
1128
1129           x3 = __PKHBT(a, b, 16);
1130           a = *(px + 2);
1131           x2 = __PKHBT(b, a, 16);
1132
1133 #else
1134
1135           x3 = __PKHBT(b, a, 16);
1136           a = *(px + 2);
1137           x2 = __PKHBT(a, b, 16);
1138
1139 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
1140
1141         /* Perform the multiply-accumulates */
1142         acc0 = __SMLADX(x0, c0, acc0);
1143         acc1 = __SMLADX(x1, c0, acc1);
1144         acc2 = __SMLADX(x3, c0, acc2);
1145         acc3 = __SMLADX(x2, c0, acc3);
1146
1147         /* Read y[srcBLen - 7] */
1148                 c0 = *(py-1);
1149 #ifdef  ARM_MATH_BIG_ENDIAN
1150
1151         c0 = c0 << 16u;
1152 #else
1153
1154         c0 = c0 & 0x0000FFFF;
1155 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
1156
1157         /* Read x[10] */
1158                 a = *(px+2);
1159                 b = *(px+3);
1160
1161 #ifndef ARM_MATH_BIG_ENDIAN
1162
1163                 x3 = __PKHBT(a, b, 16);
1164
1165 #else
1166
1167                 x3 = __PKHBT(b, a, 16);;
1168
1169 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1170
1171                 px += 3u;
1172
1173         /* Perform the multiply-accumulates */
1174         acc0 = __SMLADX(x1, c0, acc0);
1175         acc1 = __SMLAD(x2, c0, acc1);
1176         acc2 = __SMLADX(x2, c0, acc2);
1177         acc3 = __SMLADX(x3, c0, acc3);
1178       }
1179
1180       /* Store the results in the accumulators in the destination buffer. */
1181           *pOut++ = (q15_t)(acc0 >> 15);
1182           *pOut++ = (q15_t)(acc1 >> 15);
1183           *pOut++ = (q15_t)(acc2 >> 15);
1184           *pOut++ = (q15_t)(acc3 >> 15);
1185
1186       /* Increment the pointer pIn1 index, count by 4 */
1187       count += 4u;
1188
1189       /* Update the inputA and inputB pointers for next MAC calculation */
1190       px = pIn1 + count;
1191       py = pSrc2;
1192
1193       /* Decrement the loop counter */
1194       blkCnt--;
1195     }
1196
1197     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1198      ** No loop unrolling is used. */
1199     blkCnt = blockSize2 % 0x4u;
1200
1201     while(blkCnt > 0u)
1202     {
1203       /* Accumulator is made zero for every iteration */
1204       sum = 0;
1205
1206       /* Apply loop unrolling and compute 4 MACs simultaneously. */
1207       k = srcBLen >> 2u;
1208
1209       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
1210        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1211       while(k > 0u)
1212       {
1213         /* Perform the multiply-accumulates */
1214         sum += ((q31_t) * px++ * *py--);
1215         sum += ((q31_t) * px++ * *py--);
1216         sum += ((q31_t) * px++ * *py--);
1217         sum += ((q31_t) * px++ * *py--);
1218
1219         /* Decrement the loop counter */
1220         k--;
1221       }
1222
1223       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1224        ** No loop unrolling is used. */
1225       k = srcBLen % 0x4u;
1226
1227       while(k > 0u)
1228       {
1229         /* Perform the multiply-accumulates */
1230         sum += ((q31_t) * px++ * *py--);
1231
1232         /* Decrement the loop counter */
1233         k--;
1234       }
1235
1236       /* Store the result in the accumulator in the destination buffer. */
1237       *pOut++ = (q15_t) (sum >> 15);
1238
1239       /* Increment the pointer pIn1 index, count by 1 */
1240       count++;
1241
1242       /* Update the inputA and inputB pointers for next MAC calculation */
1243       px = pIn1 + count;
1244       py = pSrc2;
1245
1246       /* Decrement the loop counter */
1247       blkCnt--;
1248     }
1249   }
1250   else
1251   {
1252     /* If srcBLen is less than 4,
1253      * the blockSize2 loop cannot be unrolled by 4 */
1254     blkCnt = blockSize2;
1255
1256     while(blkCnt > 0u)
1257     {
1258       /* Accumulator is made zero for every iteration */
1259       sum = 0;
1260
1261       /* srcBLen number of MACS should be performed */
1262       k = srcBLen;
1263
1264       while(k > 0u)
1265       {
1266         /* Perform the multiply-accumulate */
1267         sum += ((q31_t) * px++ * *py--);
1268
1269         /* Decrement the loop counter */
1270         k--;
1271       }
1272
1273       /* Store the result in the accumulator in the destination buffer. */
1274       *pOut++ = (q15_t) (sum >> 15);
1275
1276       /* Increment the MAC count */
1277       count++;
1278
1279       /* Update the inputA and inputB pointers for next MAC calculation */
1280       px = pIn1 + count;
1281       py = pSrc2;
1282
1283       /* Decrement the loop counter */
1284       blkCnt--;
1285     }
1286   }
1287
1288
1289   /* --------------------------
1290    * Initializations of stage3
1291    * -------------------------*/
1292
1293   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
1294    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
1295    * ....
1296    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
1297    * sum +=  x[srcALen-1] * y[srcBLen-1]
1298    */
1299
1300   /* In this stage the MAC operations are decreased by 1 for every iteration.
1301      The blockSize3 variable holds the number of MAC operations performed */
1302
1303   /* Working pointer of inputA */
1304   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1305   px = pSrc1;
1306
1307   /* Working pointer of inputB */
1308   pSrc2 = pIn2 + (srcBLen - 1u);
1309   pIn2 = pSrc2 - 1u;
1310   py = pIn2;
1311
1312   /* -------------------
1313    * Stage3 process
1314    * ------------------*/
1315
1316   /* For loop unrolling by 4, this stage is divided into two. */
1317   /* First part of this stage computes the MAC operations greater than 4 */
1318   /* Second part of this stage computes the MAC operations less than or equal to 4 */
1319
1320   /* The first part of the stage starts here */
1321   j = blockSize3 >> 2u;
1322
1323   while((j > 0u) && (blockSize3 > 0u))
1324   {
1325     /* Accumulator is made zero for every iteration */
1326     sum = 0;
1327
1328     /* Apply loop unrolling and compute 4 MACs simultaneously. */
1329     k = blockSize3 >> 2u;
1330
1331     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
1332      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1333         py++;
1334
1335     while(k > 0u)
1336     {
1337         sum += ((q31_t) * px++ * *py--);
1338         sum += ((q31_t) * px++ * *py--);
1339         sum += ((q31_t) * px++ * *py--);
1340         sum += ((q31_t) * px++ * *py--);
1341       /* Decrement the loop counter */
1342       k--;
1343     }
1344
1345     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
1346      ** No loop unrolling is used. */
1347     k = blockSize3 % 0x4u;
1348
1349     while(k > 0u)
1350     {
1351       /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
1352         sum += ((q31_t) * px++ * *py--);
1353
1354       /* Decrement the loop counter */
1355       k--;
1356     }
1357
1358     /* Store the result in the accumulator in the destination buffer. */
1359     *pOut++ = (q15_t) (sum >> 15);
1360
1361     /* Update the inputA and inputB pointers for next MAC calculation */
1362     px = ++pSrc1;
1363     py = pIn2;
1364
1365     /* Decrement the loop counter */
1366     blockSize3--;
1367
1368     j--;
1369   }
1370
1371   /* The second part of the stage starts here */
1372   /* py was left at pIn2 (one sample before pSrc2) by the first part;
1373    * advance it so it points at the last inputB sample, pSrc2, again */
1374   py = py + 1u;
1375
1376   while(blockSize3 > 0u)
1377   {
1378     /* Accumulator is made zero for every iteration */
1379     sum = 0;
1380
1381     /* blockSize3 MACs are performed for this output; no loop unrolling is used. */
1382     k = blockSize3;
1383
1384     while(k > 0u)
1385     {
1386       /* Perform the multiply-accumulates */
1387       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
1388         sum += ((q31_t) * px++ * *py--);
1389
1390       /* Decrement the loop counter */
1391       k--;
1392     }
1393
1394     /* Store the result in the accumulator in the destination buffer. */
1395     *pOut++ = (q15_t) (sum >> 15);
1396
1397     /* Update the inputA and inputB pointers for next MAC calculation */
1398     px = ++pSrc1;
1399     py = pSrc2;
1400
1401     /* Decrement the loop counter */
1402     blockSize3--;
1403   }
1404
1405 #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
1406 }
1407
1408 /**
1409  * @} end of Conv group
1410  */