tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_fast_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_fast_q15.c
   9 *
  10 * Description:  Fast Q15 Partial convolution.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup PartialConv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
  54  * @param[in]       *pSrcA points to the first input sequence.
  55  * @param[in]       srcALen length of the first input sequence.
  56  * @param[in]       *pSrcB points to the second input sequence.
  57  * @param[in]       srcBLen length of the second input sequence.
  58  * @param[out]      *pDst points to the location where the output result is written.
  59  * @param[in]       firstIndex is the first output sample to start with.
  60  * @param[in]       numPoints is the number of output points to be computed.
  61  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  62  *
  63  * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
  64  */
  65
  66
  67 arm_status arm_conv_partial_fast_q15(
  68   q15_t * pSrcA,
  69   uint32_t srcALen,
  70   q15_t * pSrcB,
  71   uint32_t srcBLen,
  72   q15_t * pDst,
  73   uint32_t firstIndex,
  74   uint32_t numPoints)
  75 {
  76 #ifndef UNALIGNED_SUPPORT_DISABLE
  77
  78   q15_t *pIn1;                                   /* inputA pointer               */
  79   q15_t *pIn2;                                   /* inputB pointer               */
  80   q15_t *pOut = pDst;                            /* output pointer               */
  81   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
  82   q15_t *px;                                     /* Intermediate inputA pointer  */
  83   q15_t *py;                                     /* Intermediate inputB pointer  */
  84   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
  85   q31_t x0, x1, x2, x3, c0;
  86   uint32_t j, k, count, check, blkCnt;
  87   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
  88   arm_status status;                             /* status of Partial convolution */
  89
  90   /* Check for range of output samples to be calculated */
  91   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  92   {
  93     /* Set status as ARM_MATH_ARGUMENT_ERROR */
  94     status = ARM_MATH_ARGUMENT_ERROR;
  95   }
  96   else
  97   {
  98
  99     /* The algorithm implementation is based on the lengths of the inputs. */
 100     /* srcB is always made to slide across srcA. */
 101     /* So srcBLen is always considered as shorter or equal to srcALen */
 102     if(srcALen >=srcBLen)
 103     {
 104       /* Initialization of inputA pointer */
 105       pIn1 = pSrcA;
 106
 107       /* Initialization of inputB pointer */
 108       pIn2 = pSrcB;
 109     }
 110     else
 111     {
 112       /* Initialization of inputA pointer */
 113       pIn1 = pSrcB;
 114
 115       /* Initialization of inputB pointer */
 116       pIn2 = pSrcA;
 117
 118       /* srcBLen is always considered as shorter or equal to srcALen */
 119       j = srcBLen;
 120       srcBLen = srcALen;
 121       srcALen = j;
 122     }
 123
 124     /* Conditions to check which loopCounter holds
 125      * the first and last indices of the output samples to be calculated. */
 126     check = firstIndex + numPoints;
 127     blockSize3 = ((int32_t) check - (int32_t) srcALen);
 128     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
 129     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
 130     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 131                                      (int32_t) numPoints) : 0;
 132     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
 133                                     (int32_t) firstIndex);
 134     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 135
 136     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 137     /* The function is internally
 138      * divided into three stages according to the number of multiplications that has to be
 139      * taken place between inputA samples and inputB samples. In the first stage of the
 140      * algorithm, the multiplications increase by one for every iteration.
 141      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 142      * In the third stage of the algorithm, the multiplications decrease by one
 143      * for every iteration. */
 144
 145     /* Set the output pointer to point to the firstIndex
 146      * of the output sample to be calculated. */
 147     pOut = pDst + firstIndex;
 148
 149     /* --------------------------
 150      * Initializations of stage1
 151      * -------------------------*/
 152
 153     /* sum = x[0] * y[0]
 154      * sum = x[0] * y[1] + x[1] * y[0]
 155      * ....
 156      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 157      */
 158
 159     /* In this stage the MAC operations are increased by 1 for every iteration.
 160        The count variable holds the number of MAC operations performed.
 161        Since the partial convolution starts from firstIndex
 162        Number of Macs to be performed is firstIndex + 1 */
 163     count = 1u + firstIndex;
 164
 165     /* Working pointer of inputA */
 166     px = pIn1;
 167
 168     /* Working pointer of inputB */
 169     pSrc2 = pIn2 + firstIndex;
 170     py = pSrc2;
 171
 172     /* ------------------------
 173      * Stage1 process
 174      * ----------------------*/
 175
 176     /* For loop unrolling by 4, this stage is divided into two. */
 177     /* First part of this stage computes the MAC operations less than 4 */
 178     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 179
 180     /* The first part of the stage starts here */
 181     while((count < 4u) && (blockSize1 > 0))
 182     {
 183       /* Accumulator is made zero for every iteration */
 184       sum = 0;
 185
 186       /* Loop over number of MAC operations between
 187        * inputA samples and inputB samples */
 188       k = count;
 189
 190       while(k > 0u)
 191       {
 192         /* Perform the multiply-accumulates */
 193         sum = __SMLAD(*px++, *py--, sum);
 194
 195         /* Decrement the loop counter */
 196         k--;
 197       }
 198
 199       /* Store the result in the accumulator in the destination buffer. */
 200       *pOut++ = (q15_t) (sum >> 15);
 201
 202       /* Update the inputA and inputB pointers for next MAC calculation */
 203       py = ++pSrc2;
 204       px = pIn1;
 205
 206       /* Increment the MAC count */
 207       count++;
 208
 209       /* Decrement the loop counter */
 210       blockSize1--;
 211     }
 212
 213     /* The second part of the stage starts here */
 214     /* The internal loop, over count, is unrolled by 4 */
 215     /* To read the last two inputB samples using SIMD:
 216      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
 217     py = py - 1;
 218
 219     while(blockSize1 > 0)
 220     {
 221       /* Accumulator is made zero for every iteration */
 222       sum = 0;
 223
 224       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 225       k = count >> 2u;
 226
 227       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 228        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 229       while(k > 0u)
 230       {
 231         /* Perform the multiply-accumulates */
 232         /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
 233         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 234         /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
 235         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 236
 237         /* Decrement the loop counter */
 238         k--;
 239       }
 240
 241       /* For the next MAC operations, the pointer py is used without SIMD
 242        * So, py is incremented by 1 */
 243       py = py + 1u;
 244
 245       /* If the count is not a multiple of 4, compute any remaining MACs here.
 246        ** No loop unrolling is used. */
 247       k = count % 0x4u;
 248
 249       while(k > 0u)
 250       {
 251         /* Perform the multiply-accumulates */
 252         sum = __SMLAD(*px++, *py--, sum);
 253
 254         /* Decrement the loop counter */
 255         k--;
 256       }
 257
 258       /* Store the result in the accumulator in the destination buffer. */
 259       *pOut++ = (q15_t) (sum >> 15);
 260
 261       /* Update the inputA and inputB pointers for next MAC calculation */
 262       py = ++pSrc2 - 1u;
 263       px = pIn1;
 264
 265       /* Increment the MAC count */
 266       count++;
 267
 268       /* Decrement the loop counter */
 269       blockSize1--;
 270     }
 271
 272     /* --------------------------
 273      * Initializations of stage2
 274      * ------------------------*/
 275
 276     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 277      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 278      * ....
 279      * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 280      */
 281
 282     /* Working pointer of inputA */
 283     px = pIn1;
 284
 285     /* Working pointer of inputB */
 286     pSrc2 = pIn2 + (srcBLen - 1u);
 287     py = pSrc2;
 288
 289     /* count is the index by which the pointer pIn1 to be incremented */
 290     count = 0u;
 291
 292
 293     /* --------------------
 294      * Stage2 process
 295      * -------------------*/
 296
 297     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 298      * So, to loop unroll over blockSize2,
 299      * srcBLen should be greater than or equal to 4 */
 300     if(srcBLen >= 4u)
 301     {
 302       /* Loop unroll over blockSize2, by 4 */
 303       blkCnt = ((uint32_t) blockSize2 >> 2u);
 304
 305       while(blkCnt > 0u)
 306       {
 307       py = py - 1u;
 308
 309         /* Set all accumulators to zero */
 310         acc0 = 0;
 311         acc1 = 0;
 312         acc2 = 0;
 313         acc3 = 0;
 314
 315
 316         /* read x[0], x[1] samples */
 317       x0 = *__SIMD32(px);
 318         /* read x[1], x[2] samples */
 319       x1 = _SIMD32_OFFSET(px+1);
 320           px+= 2u;
 321
 322
 323         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 324         k = srcBLen >> 2u;
 325
 326         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 327          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 328         do
 329         {
 330           /* Read the last two inputB samples using SIMD:
 331            * y[srcBLen - 1] and y[srcBLen - 2] */
 332         c0 = *__SIMD32(py)--;
 333
 334           /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 335           acc0 = __SMLADX(x0, c0, acc0);
 336
 337           /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 338           acc1 = __SMLADX(x1, c0, acc1);
 339
 340           /* Read x[2], x[3] */
 341         x2 = *__SIMD32(px);
 342
 343           /* Read x[3], x[4] */
 344         x3 = _SIMD32_OFFSET(px+1);
 345
 346           /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
 347           acc2 = __SMLADX(x2, c0, acc2);
 348
 349           /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
 350           acc3 = __SMLADX(x3, c0, acc3);
 351
 352           /* Read y[srcBLen - 3] and y[srcBLen - 4] */
 353         c0 = *__SIMD32(py)--;
 354
 355           /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
 356           acc0 = __SMLADX(x2, c0, acc0);
 357
 358           /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
 359           acc1 = __SMLADX(x3, c0, acc1);
 360
 361           /* Read x[4], x[5] */
 362         x0 = _SIMD32_OFFSET(px+2);
 363
 364           /* Read x[5], x[6] */
 365         x1 = _SIMD32_OFFSET(px+3);
 366                 px += 4u;
 367
 368           /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
 369           acc2 = __SMLADX(x0, c0, acc2);
 370
 371           /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
 372           acc3 = __SMLADX(x1, c0, acc3);
 373
 374         } while(--k);
 375
 376         /* For the next MAC operations, SIMD is not used
 377          * So, the 16 bit pointer of inputB, py is updated */
 378
 379         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 380          ** No loop unrolling is used. */
 381         k = srcBLen % 0x4u;
 382
 383         if(k == 1u)
 384         {
 385           /* Read y[srcBLen - 5] */
 386         c0 = *(py+1);
 387 #ifdef  ARM_MATH_BIG_ENDIAN
 388
 389         c0 = c0 << 16u;
 390
 391 #else
 392
 393         c0 = c0 & 0x0000FFFF;
 394
 395 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 396
 397           /* Read x[7] */
 398         x3 = *__SIMD32(px);
 399                 px++;
 400
 401           /* Perform the multiply-accumulates */
 402           acc0 = __SMLAD(x0, c0, acc0);
 403           acc1 = __SMLAD(x1, c0, acc1);
 404           acc2 = __SMLADX(x1, c0, acc2);
 405           acc3 = __SMLADX(x3, c0, acc3);
 406         }
 407
 408         if(k == 2u)
 409         {
 410           /* Read y[srcBLen - 5], y[srcBLen - 6] */
 411         c0 = _SIMD32_OFFSET(py);
 412
 413           /* Read x[7], x[8] */
 414         x3 = *__SIMD32(px);
 415
 416         /* Read x[9] */
 417         x2 = _SIMD32_OFFSET(px+1);
 418                 px += 2u;
 419
 420           /* Perform the multiply-accumulates */
 421           acc0 = __SMLADX(x0, c0, acc0);
 422           acc1 = __SMLADX(x1, c0, acc1);
 423           acc2 = __SMLADX(x3, c0, acc2);
 424           acc3 = __SMLADX(x2, c0, acc3);
 425         }
 426
 427         if(k == 3u)
 428         {
 429           /* Read y[srcBLen - 5], y[srcBLen - 6] */
 430         c0 = _SIMD32_OFFSET(py);
 431
 432           /* Read x[7], x[8] */
 433         x3 = *__SIMD32(px);
 434
 435           /* Read x[9] */
 436         x2 = _SIMD32_OFFSET(px+1);
 437
 438           /* Perform the multiply-accumulates */
 439           acc0 = __SMLADX(x0, c0, acc0);
 440           acc1 = __SMLADX(x1, c0, acc1);
 441           acc2 = __SMLADX(x3, c0, acc2);
 442           acc3 = __SMLADX(x2, c0, acc3);
 443
 444                 c0 = *(py-1);
 445 #ifdef  ARM_MATH_BIG_ENDIAN
 446
 447         c0 = c0 << 16u;
 448 #else
 449
 450         c0 = c0 & 0x0000FFFF;
 451 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 452
 453           /* Read x[10] */
 454         x3 =  _SIMD32_OFFSET(px+2);
 455                 px += 3u;
 456
 457           /* Perform the multiply-accumulates */
 458           acc0 = __SMLADX(x1, c0, acc0);
 459           acc1 = __SMLAD(x2, c0, acc1);
 460           acc2 = __SMLADX(x2, c0, acc2);
 461           acc3 = __SMLADX(x3, c0, acc3);
 462         }
 463
 464         /* Store the results in the accumulators in the destination buffer. */
 465 #ifndef ARM_MATH_BIG_ENDIAN
 466
 467         *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
 468         *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
 469
 470 #else
 471
 472         *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
 473         *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
 474
 475 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 476
 477         /* Increment the pointer pIn1 index, count by 4 */
 478         count += 4u;
 479
 480         /* Update the inputA and inputB pointers for next MAC calculation */
 481         px = pIn1 + count;
 482         py = pSrc2;
 483
 484         /* Decrement the loop counter */
 485         blkCnt--;
 486       }
 487
 488       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 489        ** No loop unrolling is used. */
 490       blkCnt = (uint32_t) blockSize2 % 0x4u;
 491
 492       while(blkCnt > 0u)
 493       {
 494         /* Accumulator is made zero for every iteration */
 495         sum = 0;
 496
 497         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 498         k = srcBLen >> 2u;
 499
 500         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 501          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 502         while(k > 0u)
 503         {
 504           /* Perform the multiply-accumulates */
 505           sum += ((q31_t) * px++ * *py--);
 506           sum += ((q31_t) * px++ * *py--);
 507           sum += ((q31_t) * px++ * *py--);
 508           sum += ((q31_t) * px++ * *py--);
 509
 510           /* Decrement the loop counter */
 511           k--;
 512         }
 513
 514         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 515          ** No loop unrolling is used. */
 516         k = srcBLen % 0x4u;
 517
 518         while(k > 0u)
 519         {
 520           /* Perform the multiply-accumulates */
 521           sum += ((q31_t) * px++ * *py--);
 522
 523           /* Decrement the loop counter */
 524           k--;
 525         }
 526
 527         /* Store the result in the accumulator in the destination buffer. */
 528         *pOut++ = (q15_t) (sum >> 15);
 529
 530         /* Increment the pointer pIn1 index, count by 1 */
 531         count++;
 532
 533         /* Update the inputA and inputB pointers for next MAC calculation */
 534         px = pIn1 + count;
 535         py = pSrc2;
 536
 537         /* Decrement the loop counter */
 538         blkCnt--;
 539       }
 540     }
 541     else
 542     {
 543       /* If srcBLen is less than 4,
 544        * the blockSize2 loop is not unrolled by 4 */
 545       blkCnt = (uint32_t) blockSize2;
 546
 547       while(blkCnt > 0u)
 548       {
 549         /* Accumulator is made zero for every iteration */
 550         sum = 0;
 551
 552         /* srcBLen number of MACS should be performed */
 553         k = srcBLen;
 554
 555         while(k > 0u)
 556         {
 557           /* Perform the multiply-accumulate */
 558           sum += ((q31_t) * px++ * *py--);
 559
 560           /* Decrement the loop counter */
 561           k--;
 562         }
 563
 564         /* Store the result in the accumulator in the destination buffer. */
 565         *pOut++ = (q15_t) (sum >> 15);
 566
 567         /* Increment the MAC count */
 568         count++;
 569
 570         /* Update the inputA and inputB pointers for next MAC calculation */
 571         px = pIn1 + count;
 572         py = pSrc2;
 573
 574         /* Decrement the loop counter */
 575         blkCnt--;
 576       }
 577     }
 578
 579
 580     /* --------------------------
 581      * Initializations of stage3
 582      * -------------------------*/
 583
 584     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 585      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 586      * ....
 587      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 588      * sum +=  x[srcALen-1] * y[srcBLen-1]
 589      */
 590
 591     /* In this stage the MAC operations are decreased by 1 for every iteration.
 592        The count variable holds the number of MAC operations performed */
 593     count = srcBLen - 1u;
 594
 595     /* Working pointer of inputA */
 596     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 597     px = pSrc1;
 598
 599     /* Working pointer of inputB */
 600     pSrc2 = pIn2 + (srcBLen - 1u);
 601     pIn2 = pSrc2 - 1u;
 602     py = pIn2;
 603
 604     /* -------------------
 605      * Stage3 process
 606      * ------------------*/
 607
 608     /* For loop unrolling by 4, this stage is divided into two. */
 609     /* First part of this stage computes the MAC operations greater than 4 */
 610     /* Second part of this stage computes the MAC operations less than or equal to 4 */
 611
 612     /* The first part of the stage starts here */
 613     j = count >> 2u;
 614
 615     while((j > 0u) && (blockSize3 > 0))
 616     {
 617       /* Accumulator is made zero for every iteration */
 618       sum = 0;
 619
 620       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 621       k = count >> 2u;
 622
 623       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 624        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 625       while(k > 0u)
 626       {
 627         /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
 628          * with y[srcBLen - 1], y[srcBLen - 2] respectively */
 629         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 630         /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
 631          * with y[srcBLen - 3], y[srcBLen - 4] respectively */
 632         sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 633
 634         /* Decrement the loop counter */
 635         k--;
 636       }
 637
 638       /* For the next MAC operations, the pointer py is used without SIMD
 639        * So, py is incremented by 1 */
 640       py = py + 1u;
 641
 642       /* If the count is not a multiple of 4, compute any remaining MACs here.
 643        ** No loop unrolling is used. */
 644       k = count % 0x4u;
 645
 646       while(k > 0u)
 647       {
 648         /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
 649         sum = __SMLAD(*px++, *py--, sum);
 650
 651         /* Decrement the loop counter */
 652         k--;
 653       }
 654
 655       /* Store the result in the accumulator in the destination buffer. */
 656       *pOut++ = (q15_t) (sum >> 15);
 657
 658       /* Update the inputA and inputB pointers for next MAC calculation */
 659       px = ++pSrc1;
 660       py = pIn2;
 661
 662       /* Decrement the MAC count */
 663       count--;
 664
 665       /* Decrement the loop counter */
 666       blockSize3--;
 667
 668       j--;
 669     }
 670
 671     /* The second part of the stage starts here */
 672     /* SIMD is not used for the next MAC operations,
 673      * so pointer py is updated to read only one sample at a time */
 674     py = py + 1u;
 675
 676     while(blockSize3 > 0)
 677     {
 678       /* Accumulator is made zero for every iteration */
 679       sum = 0;
 680
 681       /* The remaining MACs are computed one at a time; no loop unrolling is used. */
 682       k = count;
 683
 684       while(k > 0u)
 685       {
 686         /* Perform the multiply-accumulates */
 687         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 688         sum = __SMLAD(*px++, *py--, sum);
 689
 690         /* Decrement the loop counter */
 691         k--;
 692       }
 693
 694       /* Store the result in the accumulator in the destination buffer. */
 695       *pOut++ = (q15_t) (sum >> 15);
 696
 697       /* Update the inputA and inputB pointers for next MAC calculation */
 698       px = ++pSrc1;
 699       py = pSrc2;
 700
 701       /* Decrement the MAC count */
 702       count--;
 703
 704       /* Decrement the loop counter */
 705       blockSize3--;
 706     }
 707
 708     /* set status as ARM_MATH_SUCCESS */
 709     status = ARM_MATH_SUCCESS;
 710   }
 711
 712   /* Return to application */
 713   return (status);
 714
 715 #else
 716
 717   q15_t *pIn1;                                   /* inputA pointer               */
 718   q15_t *pIn2;                                   /* inputB pointer               */
 719   q15_t *pOut = pDst;                            /* output pointer               */
 720   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                  */
 721   q15_t *px;                                     /* Intermediate inputA pointer  */
 722   q15_t *py;                                     /* Intermediate inputB pointer  */
 723   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
 724   q31_t x0, x1, x2, x3, c0;
 725   uint32_t j, k, count, check, blkCnt;
 726   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
 727   arm_status status;                             /* status of Partial convolution */
 728   q15_t a, b;
 729
 730   /* Check for range of output samples to be calculated */
 731   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 732   {
 733     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 734     status = ARM_MATH_ARGUMENT_ERROR;
 735   }
 736   else
 737   {
 738
 739     /* The algorithm implementation is based on the lengths of the inputs. */
 740     /* srcB is always made to slide across srcA. */
 741     /* So srcBLen is always considered as shorter or equal to srcALen */
 742     if(srcALen >=srcBLen)
 743     {
 744       /* Initialization of inputA pointer */
 745       pIn1 = pSrcA;
 746
 747       /* Initialization of inputB pointer */
 748       pIn2 = pSrcB;
 749     }
 750     else
 751     {
 752       /* Initialization of inputA pointer */
 753       pIn1 = pSrcB;
 754
 755       /* Initialization of inputB pointer */
 756       pIn2 = pSrcA;
 757
 758       /* srcBLen is always considered as shorter or equal to srcALen */
 759       j = srcBLen;
 760       srcBLen = srcALen;
 761       srcALen = j;
 762     }
 763
 764     /* Conditions to check which loopCounter holds
 765      * the first and last indices of the output samples to be calculated. */
 766     check = firstIndex + numPoints;
 767     blockSize3 = ((int32_t) check - (int32_t) srcALen);
 768     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
 769     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
 770     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 771                                      (int32_t) numPoints) : 0;
 772     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
 773                                     (int32_t) firstIndex);
 774     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 775
 776     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 777     /* The function is internally
 778      * divided into three stages according to the number of multiplications that has to be
 779      * taken place between inputA samples and inputB samples. In the first stage of the
 780      * algorithm, the multiplications increase by one for every iteration.
 781      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 782      * In the third stage of the algorithm, the multiplications decrease by one
 783      * for every iteration. */
 784
 785     /* Set the output pointer to point to the firstIndex
 786      * of the output sample to be calculated. */
 787     pOut = pDst + firstIndex;
 788
 789     /* --------------------------
 790      * Initializations of stage1
 791      * -------------------------*/
 792
 793     /* sum = x[0] * y[0]
 794      * sum = x[0] * y[1] + x[1] * y[0]
 795      * ....
 796      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 797      */
 798
 799     /* In this stage the MAC operations are increased by 1 for every iteration.
 800        The count variable holds the number of MAC operations performed.
 801        Since the partial convolution starts from firstIndex
 802        Number of Macs to be performed is firstIndex + 1 */
 803     count = 1u + firstIndex;
 804
 805     /* Working pointer of inputA */
 806     px = pIn1;
 807
 808     /* Working pointer of inputB */
 809     pSrc2 = pIn2 + firstIndex;
 810     py = pSrc2;
 811
 812     /* ------------------------
 813      * Stage1 process
 814      * ----------------------*/
 815
 816     /* For loop unrolling by 4, this stage is divided into two. */
 817     /* First part of this stage computes the MAC operations less than 4 */
 818     /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 819
 820     /* The first part of the stage starts here */
 821   while((count < 4u) && (blockSize1 > 0u))
 822     {
 823       /* Accumulator is made zero for every iteration */
 824       sum = 0;
 825
 826       /* Loop over number of MAC operations between
 827        * inputA samples and inputB samples */
 828       k = count;
 829
 830       while(k > 0u)
 831       {
 832         /* Perform the multiply-accumulates */
 833       sum += ((q31_t) * px++ * *py--);
 834
 835         /* Decrement the loop counter */
 836         k--;
 837       }
 838
 839       /* Store the result in the accumulator in the destination buffer. */
 840       *pOut++ = (q15_t) (sum >> 15);
 841
 842       /* Update the inputA and inputB pointers for next MAC calculation */
 843       py = ++pSrc2;
 844       px = pIn1;
 845
 846       /* Increment the MAC count */
 847       count++;
 848
 849       /* Decrement the loop counter */
 850       blockSize1--;
 851     }
 852
 853     /* The second part of the stage starts here */
 854     /* The internal loop, over count, is unrolled by 4 */
 855     /* To read the last two inputB samples using SIMD:
 856      * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
 857     py = py - 1;
 858
 859   while(blockSize1 > 0u)
 860     {
 861       /* Accumulator is made zero for every iteration */
 862       sum = 0;
 863
 864       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 865       k = count >> 2u;
 866
 867       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 868        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 869         py++;
 870
 871     while(k > 0u)
 872     {
 873       /* Perform the multiply-accumulates */
 874         sum += ((q31_t) * px++ * *py--);
 875         sum += ((q31_t) * px++ * *py--);
 876         sum += ((q31_t) * px++ * *py--);
 877         sum += ((q31_t) * px++ * *py--);
 878
 879       /* Decrement the loop counter */
 880       k--;
 881     }
 882
 883       /* If the count is not a multiple of 4, compute any remaining MACs here.
 884        ** No loop unrolling is used. */
 885       k = count % 0x4u;
 886
 887       while(k > 0u)
 888       {
 889         /* Perform the multiply-accumulates */
 890       sum += ((q31_t) * px++ * *py--);
 891
 892         /* Decrement the loop counter */
 893         k--;
 894       }
 895
 896       /* Store the result in the accumulator in the destination buffer. */
 897       *pOut++ = (q15_t) (sum >> 15);
 898
 899       /* Update the inputA and inputB pointers for next MAC calculation */
 900       py = ++pSrc2 - 1u;
 901       px = pIn1;
 902
 903       /* Increment the MAC count */
 904       count++;
 905
 906       /* Decrement the loop counter */
 907       blockSize1--;
 908     }
 909
 910     /* --------------------------
 911      * Initializations of stage2
 912      * ------------------------*/
 913
 914     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 915      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 916      * ....
 917      * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 918      */
 919
 920     /* Working pointer of inputA */
 921     px = pIn1;
 922
 923     /* Working pointer of inputB */
 924     pSrc2 = pIn2 + (srcBLen - 1u);
 925     py = pSrc2;
 926
 927     /* count is the index by which the pointer pIn1 to be incremented */
 928     count = 0u;
 929
 930
 931     /* --------------------
 932      * Stage2 process
 933      * -------------------*/
 934
 935     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 936      * So, to loop unroll over blockSize2,
 937      * srcBLen should be greater than or equal to 4 */
 938     if(srcBLen >= 4u)
 939     {
 940       /* Loop unroll over blockSize2, by 4 */
 941       blkCnt = ((uint32_t) blockSize2 >> 2u);
 942
 943       while(blkCnt > 0u)
 944       {
 945       py = py - 1u;
 946
 947         /* Set all accumulators to zero */
 948         acc0 = 0;
 949         acc1 = 0;
 950         acc2 = 0;
 951         acc3 = 0;
 952
 953       /* read x[0], x[1] samples */
 954           a = *px++;
 955           b = *px++;
 956
 957 #ifndef ARM_MATH_BIG_ENDIAN
 958
 959           x0 = __PKHBT(a, b, 16);
 960           a = *px;
 961           x1 = __PKHBT(b, a, 16);
 962
 963 #else
 964
 965           x0 = __PKHBT(b, a, 16);
 966           a = *px;
 967           x1 = __PKHBT(a, b, 16);
 968
 969 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
 970
 971       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 972       k = srcBLen >> 2u;
 973
 974       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 975        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 976       do
 977       {
 978         /* Read the last two inputB samples using SIMD:
 979          * y[srcBLen - 1] and y[srcBLen - 2] */
 980                 a = *py;
 981                 b = *(py+1);
 982                 py -= 2;
 983
 984 #ifndef ARM_MATH_BIG_ENDIAN
 985
 986                 c0 = __PKHBT(a, b, 16);
 987
 988 #else
 989
 990                 c0 = __PKHBT(b, a, 16);;
 991
 992 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
 993
 994         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 995         acc0 = __SMLADX(x0, c0, acc0);
 996
 997         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 998         acc1 = __SMLADX(x1, c0, acc1);
 999
1000           a = *px;
1001           b = *(px + 1);
1002
1003 #ifndef ARM_MATH_BIG_ENDIAN
1004
1005           x2 = __PKHBT(a, b, 16);
1006           a = *(px + 2);
1007           x3 = __PKHBT(b, a, 16);
1008
1009 #else
1010
1011           x2 = __PKHBT(b, a, 16);
1012           a = *(px + 2);
1013           x3 = __PKHBT(a, b, 16);
1014
1015 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
1016
1017         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
1018         acc2 = __SMLADX(x2, c0, acc2);
1019
1020         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
1021         acc3 = __SMLADX(x3, c0, acc3);
1022
1023         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
1024                 a = *py;
1025                 b = *(py+1);
1026                 py -= 2;
1027
1028 #ifndef ARM_MATH_BIG_ENDIAN
1029
1030                 c0 = __PKHBT(a, b, 16);
1031
1032 #else
1033
1034                 c0 = __PKHBT(b, a, 16);;
1035
1036 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1037
1038         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
1039         acc0 = __SMLADX(x2, c0, acc0);
1040
1041         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
1042         acc1 = __SMLADX(x3, c0, acc1);
1043
1044         /* Read x[4], x[5], x[6] */
1045           a = *(px + 2);
1046           b = *(px + 3);
1047
1048 #ifndef ARM_MATH_BIG_ENDIAN
1049
1050           x0 = __PKHBT(a, b, 16);
1051           a = *(px + 4);
1052           x1 = __PKHBT(b, a, 16);
1053
1054 #else
1055
1056           x0 = __PKHBT(b, a, 16);
1057           a = *(px + 4);
1058           x1 = __PKHBT(a, b, 16);
1059
1060 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
1061
1062                 px += 4u;
1063
1064         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
1065         acc2 = __SMLADX(x0, c0, acc2);
1066
1067         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
1068         acc3 = __SMLADX(x1, c0, acc3);
1069
1070       } while(--k);
1071
1072       /* For the next MAC operations, SIMD is not used.
1073        * So the 16 bit pointer of inputB, py, is updated */
1074
1075       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1076        ** No loop unrolling is used. */
1077       k = srcBLen % 0x4u;
1078
1079       if(k == 1u)
1080       {
1081         /* Read y[srcBLen - 5] */
1082         c0 = *(py+1);
1083
1084 #ifdef  ARM_MATH_BIG_ENDIAN
1085
1086         c0 = c0 << 16u;
1087
1088 #else
1089
1090         c0 = c0 & 0x0000FFFF;
1091
1092 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
1093
1094         /* Read x[7] */
1095                 a = *px;
1096                 b = *(px+1);
1097                 px++;
1098
1099 #ifndef ARM_MATH_BIG_ENDIAN
1100
1101                 x3 = __PKHBT(a, b, 16);
1102
1103 #else
1104
1105                 x3 = __PKHBT(b, a, 16);;
1106
1107 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1108
1109
1110         /* Perform the multiply-accumulates */
1111         acc0 = __SMLAD(x0, c0, acc0);
1112         acc1 = __SMLAD(x1, c0, acc1);
1113         acc2 = __SMLADX(x1, c0, acc2);
1114         acc3 = __SMLADX(x3, c0, acc3);
1115       }
1116
1117       if(k == 2u)
1118       {
1119         /* Read y[srcBLen - 5], y[srcBLen - 6] */
1120                 a = *py;
1121                 b = *(py+1);
1122
1123 #ifndef ARM_MATH_BIG_ENDIAN
1124
1125                 c0 = __PKHBT(a, b, 16);
1126
1127 #else
1128
1129                 c0 = __PKHBT(b, a, 16);;
1130
1131 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1132
1133         /* Read x[7], x[8], x[9] */
1134           a = *px;
1135           b = *(px + 1);
1136
1137 #ifndef ARM_MATH_BIG_ENDIAN
1138
1139           x3 = __PKHBT(a, b, 16);
1140           a = *(px + 2);
1141           x2 = __PKHBT(b, a, 16);
1142
1143 #else
1144
1145           x3 = __PKHBT(b, a, 16);
1146           a = *(px + 2);
1147           x2 = __PKHBT(a, b, 16);
1148
1149 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
1150                 px += 2u;
1151
1152         /* Perform the multiply-accumulates */
1153         acc0 = __SMLADX(x0, c0, acc0);
1154         acc1 = __SMLADX(x1, c0, acc1);
1155         acc2 = __SMLADX(x3, c0, acc2);
1156         acc3 = __SMLADX(x2, c0, acc3);
1157       }
1158
1159       if(k == 3u)
1160       {
1161         /* Read y[srcBLen - 5], y[srcBLen - 6] */
1162                 a = *py;
1163                 b = *(py+1);
1164
1165 #ifndef ARM_MATH_BIG_ENDIAN
1166
1167                 c0 = __PKHBT(a, b, 16);
1168
1169 #else
1170
1171                 c0 = __PKHBT(b, a, 16);;
1172
1173 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1174
1175         /* Read x[7], x[8], x[9] */
1176           a = *px;
1177           b = *(px + 1);
1178
1179 #ifndef ARM_MATH_BIG_ENDIAN
1180
1181           x3 = __PKHBT(a, b, 16);
1182           a = *(px + 2);
1183           x2 = __PKHBT(b, a, 16);
1184
1185 #else
1186
1187           x3 = __PKHBT(b, a, 16);
1188           a = *(px + 2);
1189           x2 = __PKHBT(a, b, 16);
1190
1191 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN        */
1192
1193         /* Perform the multiply-accumulates */
1194         acc0 = __SMLADX(x0, c0, acc0);
1195         acc1 = __SMLADX(x1, c0, acc1);
1196         acc2 = __SMLADX(x3, c0, acc2);
1197         acc3 = __SMLADX(x2, c0, acc3);
1198
1199         /* Read y[srcBLen - 7] */
1200                 c0 = *(py-1);
1201 #ifdef  ARM_MATH_BIG_ENDIAN
1202
1203         c0 = c0 << 16u;
1204 #else
1205
1206         c0 = c0 & 0x0000FFFF;
1207 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
1208
1209         /* Read x[10] */
1210                 a = *(px+2);
1211                 b = *(px+3);
1212
1213 #ifndef ARM_MATH_BIG_ENDIAN
1214
1215                 x3 = __PKHBT(a, b, 16);
1216
1217 #else
1218
1219                 x3 = __PKHBT(b, a, 16);;
1220
1221 #endif  /*      #ifndef ARM_MATH_BIG_ENDIAN     */
1222
1223                 px += 3u;
1224
1225         /* Perform the multiply-accumulates */
1226         acc0 = __SMLADX(x1, c0, acc0);
1227         acc1 = __SMLAD(x2, c0, acc1);
1228         acc2 = __SMLADX(x2, c0, acc2);
1229         acc3 = __SMLADX(x3, c0, acc3);
1230       }
1231
1232       /* Store the results in the accumulators in the destination buffer. */
1233           *pOut++ = (q15_t)(acc0 >> 15);
1234           *pOut++ = (q15_t)(acc1 >> 15);
1235           *pOut++ = (q15_t)(acc2 >> 15);
1236           *pOut++ = (q15_t)(acc3 >> 15);
1237
1238         /* Increment the pointer pIn1 index, count by 4 */
1239         count += 4u;
1240
1241         /* Update the inputA and inputB pointers for next MAC calculation */
1242         px = pIn1 + count;
1243         py = pSrc2;
1244
1245         /* Decrement the loop counter */
1246         blkCnt--;
1247       }
1248
1249       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1250        ** No loop unrolling is used. */
1251       blkCnt = (uint32_t) blockSize2 % 0x4u;
1252
1253       while(blkCnt > 0u)
1254       {
1255         /* Accumulator is made zero for every iteration */
1256         sum = 0;
1257
1258         /* Apply loop unrolling and compute 4 MACs simultaneously. */
1259         k = srcBLen >> 2u;
1260
1261         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
1262          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1263         while(k > 0u)
1264         {
1265           /* Perform the multiply-accumulates */
1266           sum += ((q31_t) * px++ * *py--);
1267           sum += ((q31_t) * px++ * *py--);
1268           sum += ((q31_t) * px++ * *py--);
1269           sum += ((q31_t) * px++ * *py--);
1270
1271           /* Decrement the loop counter */
1272           k--;
1273         }
1274
1275         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1276          ** No loop unrolling is used. */
1277         k = srcBLen % 0x4u;
1278
1279         while(k > 0u)
1280         {
1281           /* Perform the multiply-accumulates */
1282           sum += ((q31_t) * px++ * *py--);
1283
1284           /* Decrement the loop counter */
1285           k--;
1286         }
1287
1288         /* Store the result in the accumulator in the destination buffer. */
1289         *pOut++ = (q15_t) (sum >> 15);
1290
1291         /* Increment the pointer pIn1 index, count by 1 */
1292         count++;
1293
1294         /* Update the inputA and inputB pointers for next MAC calculation */
1295         px = pIn1 + count;
1296         py = pSrc2;
1297
1298         /* Decrement the loop counter */
1299         blkCnt--;
1300       }
1301     }
1302     else
1303     {
1304       /* If the srcBLen is less than 4,
1305        * the blockSize2 loop is not unrolled by 4 */
1306       blkCnt = (uint32_t) blockSize2;
1307
1308       while(blkCnt > 0u)
1309       {
1310         /* Accumulator is made zero for every iteration */
1311         sum = 0;
1312
1313         /* srcBLen number of MACS should be performed */
1314         k = srcBLen;
1315
1316         while(k > 0u)
1317         {
1318           /* Perform the multiply-accumulate */
1319           sum += ((q31_t) * px++ * *py--);
1320
1321           /* Decrement the loop counter */
1322           k--;
1323         }
1324
1325         /* Store the result in the accumulator in the destination buffer. */
1326         *pOut++ = (q15_t) (sum >> 15);
1327
1328         /* Increment the MAC count */
1329         count++;
1330
1331         /* Update the inputA and inputB pointers for next MAC calculation */
1332         px = pIn1 + count;
1333         py = pSrc2;
1334
1335         /* Decrement the loop counter */
1336         blkCnt--;
1337       }
1338     }
1339
1340
1341     /* --------------------------
1342      * Initializations of stage3
1343      * -------------------------*/
1344
1345     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
1346      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
1347      * ....
1348      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
1349      * sum +=  x[srcALen-1] * y[srcBLen-1]
1350      */
1351
1352     /* In this stage the MAC operations are decreased by 1 for every iteration.
1353        The count variable holds the number of MAC operations performed */
1354     count = srcBLen - 1u;
1355
1356     /* Working pointer of inputA */
1357     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1358     px = pSrc1;
1359
1360     /* Working pointer of inputB */
1361     pSrc2 = pIn2 + (srcBLen - 1u);
1362     pIn2 = pSrc2 - 1u;
1363     py = pIn2;
1364
1365     /* -------------------
1366      * Stage3 process
1367      * ------------------*/
1368
1369     /* For loop unrolling by 4, this stage is divided into two. */
1370     /* First part of this stage computes the MAC operations greater than 4 */
1371     /* Second part of this stage computes the MAC operations less than or equal to 4 */
1372
1373     /* The first part of the stage starts here */
1374     j = count >> 2u;
1375
1376     while((j > 0u) && (blockSize3 > 0))
1377     {
1378       /* Accumulator is made zero for every iteration */
1379       sum = 0;
1380
1381       /* Apply loop unrolling and compute 4 MACs simultaneously. */
1382       k = count >> 2u;
1383
1384       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
1385        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1386         py++;
1387
1388     while(k > 0u)
1389     {
1390       /* Perform the multiply-accumulates */
1391         sum += ((q31_t) * px++ * *py--);
1392         sum += ((q31_t) * px++ * *py--);
1393         sum += ((q31_t) * px++ * *py--);
1394         sum += ((q31_t) * px++ * *py--);
1395       /* Decrement the loop counter */
1396       k--;
1397     }
1398
1399
1400       /* If the count is not a multiple of 4, compute any remaining MACs here.
1401        ** No loop unrolling is used. */
1402       k = count % 0x4u;
1403
1404       while(k > 0u)
1405       {
1406       /* Perform the multiply-accumulates */
1407         sum += ((q31_t) * px++ * *py--);
1408
1409         /* Decrement the loop counter */
1410         k--;
1411       }
1412
1413       /* Store the result in the accumulator in the destination buffer. */
1414       *pOut++ = (q15_t) (sum >> 15);
1415
1416       /* Update the inputA and inputB pointers for next MAC calculation */
1417       px = ++pSrc1;
1418       py = pIn2;
1419
1420       /* Decrement the MAC count */
1421       count--;
1422
1423       /* Decrement the loop counter */
1424       blockSize3--;
1425
1426       j--;
1427     }
1428
1429     /* The second part of the stage starts here */
1430     /* SIMD is not used for the next MAC operations,
1431      * so pointer py is updated to read only one sample at a time */
1432     py = py + 1u;
1433
1434   while(blockSize3 > 0u)
1435     {
1436       /* Accumulator is made zero for every iteration */
1437       sum = 0;
1438
1439       /* Loop over the remaining count MACs one at a time; no unrolling is used. */
1440       k = count;
1441
1442       while(k > 0u)
1443       {
1444         /* Perform the multiply-accumulates */
1445         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
1446         sum += ((q31_t) * px++ * *py--);
1447
1448         /* Decrement the loop counter */
1449         k--;
1450       }
1451
1452       /* Store the result in the accumulator in the destination buffer. */
1453       *pOut++ = (q15_t) (sum >> 15);
1454
1455       /* Update the inputA and inputB pointers for next MAC calculation */
1456       px = ++pSrc1;
1457       py = pSrc2;
1458
1459       /* Decrement the MAC count */
1460       count--;
1461
1462       /* Decrement the loop counter */
1463       blockSize3--;
1464     }
1465
1466     /* set status as ARM_MATH_SUCCESS */
1467     status = ARM_MATH_SUCCESS;
1468   }
1469
1470   /* Return to application */
1471   return (status);
1472
1473 #endif /*     #ifndef UNALIGNED_SUPPORT_DISABLE      */
1474 }
1475
1476 /**
1477  * @} end of PartialConv group
1478  */