tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_q15.c
   9 *
  10 * Description:  Convolution of Q15 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup Conv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Convolution of Q15 sequences.
  54  * @param[in] *pSrcA points to the first input sequence.
  55  * @param[in] srcALen length of the first input sequence.
  56  * @param[in] *pSrcB points to the second input sequence.
  57  * @param[in] srcBLen length of the second input sequence.
  58  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
  59  * @return none.
  60  *
  61  * @details
  62  * <b>Scaling and Overflow Behavior:</b>
  63  *
  64  * \par
  65  * The function is implemented using a 64-bit internal accumulator.
  66  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
  67  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
  68  * This approach provides 33 guard bits and there is no risk of overflow.
  69  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
  70  *
  71  * \par
  72  * Refer to <code>arm_conv_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  73  *
  74  * \par
  75  * Refer to the function <code>arm_conv_opt_q15()</code> for a faster implementation of this function using scratch buffers.
      *
      * \par
      * NOTE(review): neither length is validated in this function. <code>srcBLen - 1u</code> and
      * <code>srcALen + srcBLen - 1</code> are unsigned expressions and can wrap when a length is zero;
      * callers presumably must pass non-zero lengths -- confirm against the call sites.
  76  *
  77  */
  78
  79 void arm_conv_q15(
  80   q15_t * pSrcA,
  81   uint32_t srcALen,
  82   q15_t * pSrcB,
  83   uint32_t srcBLen,
  84   q15_t * pDst)
  85 {
  86
  87 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
  88
  89   /* Run the below code for Cortex-M4 and Cortex-M3 when UNALIGNED_SUPPORT_DISABLE is not defined */
  90
  91   q15_t *pIn1;                                   /* inputA pointer (the longer sequence after the swap below) */
  92   q15_t *pIn2;                                   /* inputB pointer (the shorter sequence after the swap below) */
  93   q15_t *pOut = pDst;                            /* output pointer */
  94   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
  95   q15_t *px;                                     /* Intermediate inputA pointer  */
  96   q15_t *py;                                     /* Intermediate inputB pointer  */
  97   q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
  98   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
  99   uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt;     /* Per-stage output counts and loop counters */
 100
 101   /* The algorithm implementation is based on the lengths of the inputs. */
 102   /* srcB is always made to slide across srcA. */
 103   /* So srcBLen is always considered as shorter or equal to srcALen */
 104   if(srcALen >= srcBLen)
 105   {
 106     /* Initialization of inputA pointer */
 107     pIn1 = pSrcA;
 108
 109     /* Initialization of inputB pointer */
 110     pIn2 = pSrcB;
 111   }
 112   else
 113   {
 114     /* Initialization of inputA pointer: the longer sequence (pSrcB) takes the inputA role */
 115     pIn1 = pSrcB;
 116
 117     /* Initialization of inputB pointer: the shorter sequence (pSrcA) takes the inputB role */
 118     pIn2 = pSrcA;
 119
 120     /* Swap the lengths so that srcBLen is shorter than or equal to srcALen */
 121     j = srcBLen;
 122     srcBLen = srcALen;
 123     srcALen = j;
 124   }
 125
 126   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 127   /* The function is internally
 128    * divided into three stages according to the number of multiplications that
 129    * take place between inputA samples and inputB samples. In the first stage of the
 130    * algorithm, the multiplications increase by one for every iteration.
 131    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 132    * In the third stage of the algorithm, the multiplications decrease by one
 133    * for every iteration. */
 134
 135   /* The algorithm is implemented in three stages.
 136      The output counts of stage 1 and stage 2 are set here; blockSize3 is set just before stage 3. */
 137   blockSize1 = srcBLen - 1u;
 138   blockSize2 = srcALen - (srcBLen - 1u);
 139
 140   /* --------------------------
 141    * Initializations of stage1
 142    * -------------------------*/
 143
 144   /* sum = x[0] * y[0]
 145    * sum = x[0] * y[1] + x[1] * y[0]
 146    * ....
 147    * sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
 148    */
 149
 150   /* In this stage the MAC operations are increased by 1 for every iteration.
 151      The count variable holds the number of MAC operations performed */
 152   count = 1u;
 153
 154   /* Working pointer of inputA */
 155   px = pIn1;
 156
 157   /* Working pointer of inputB */
 158   py = pIn2;
 159
 160
 161   /* ------------------------
 162    * Stage1 process
 163    * ----------------------*/
 164
 165   /* For loop unrolling by 4, this stage is divided into two. */
 166   /* First part of this stage computes the MAC operations less than 4 */
 167   /* Second part of this stage computes the MAC operations greater than or equal to 4 */
 168
 169   /* The first part of the stage starts here */
 170   while((count < 4u) && (blockSize1 > 0u))
 171   {
 172     /* Accumulator is made zero for every iteration */
 173     sum = 0;
 174
 175     /* Loop over number of MAC operations between
 176      * inputA samples and inputB samples */
 177     k = count;
 178
 179     while(k > 0u)
 180     {
 181       /* Perform the multiply-accumulates: inputA is walked forwards, inputB backwards */
 182       sum = __SMLALD(*px++, *py--, sum);
 183
 184       /* Decrement the loop counter */
 185       k--;
 186     }
 187
 188     /* Shift the accumulator right by 15, saturate to 16 bits and store it in the destination buffer. */
 189     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 190
 191     /* Update the inputA and inputB pointers for next MAC calculation */
 192     py = pIn2 + count;
 193     px = pIn1;
 194
 195     /* Increment the MAC count */
 196     count++;
 197
 198     /* Decrement the loop counter */
 199     blockSize1--;
 200   }
 201
 202   /* The second part of the stage starts here */
 203   /* The internal loop, over count, is unrolled by 4 */
 204   /* To read two adjacent inputB samples with one SIMD access
 205    * (y[count - 2] and y[count - 1]), py is decremented by 1 */
 206   py = py - 1;
 207
 208   while(blockSize1 > 0u)
 209   {
 210     /* Accumulator is made zero for every iteration */
 211     sum = 0;
 212
 213     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 214     k = count >> 2u;
 215
 216     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 217      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 218     while(k > 0u)
 219     {
 220       /* Perform the multiply-accumulates */
 221       /* x[0], x[1] are multiplied with y[count - 1], y[count - 2] respectively */
 222       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 223       /* x[2], x[3] are multiplied with y[count - 3], y[count - 4] respectively */
 224       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 225
 226       /* Decrement the loop counter */
 227       k--;
 228     }
 229
 230     /* For the next MAC operations, the pointer py is used without SIMD
 231      * So, py is incremented by 1 */
 232     py = py + 1u;
 233
 234     /* If the count is not a multiple of 4, compute any remaining MACs here.
 235      ** No loop unrolling is used. */
 236     k = count % 0x4u;
 237
 238     while(k > 0u)
 239     {
 240       /* Perform the multiply-accumulates */
 241       sum = __SMLALD(*px++, *py--, sum);
 242
 243       /* Decrement the loop counter */
 244       k--;
 245     }
 246
 247     /* Shift the accumulator right by 15, saturate to 16 bits and store it in the destination buffer. */
 248     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 249
 250     /* Update the inputA and inputB pointers for next MAC calculation.
      * py is set one sample lower than in the first part because the next iteration starts with SIMD reads */
 251     py = pIn2 + (count - 1u);
 252     px = pIn1;
 253
 254     /* Increment the MAC count */
 255     count++;
 256
 257     /* Decrement the loop counter */
 258     blockSize1--;
 259   }
 260
 261   /* --------------------------
 262    * Initializations of stage2
 263    * ------------------------*/
 264
 265   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 266    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 267    * ....
 268    * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 269    */
 270
 271   /* Working pointer of inputA */
 272   px = pIn1;
 273
 274   /* Working pointer of inputB: starts at the last inputB sample */
 275   pSrc2 = pIn2 + (srcBLen - 1u);
 276   py = pSrc2;
 277
 278   /* count is the offset added to pIn1 to reach the first inputA sample of each output */
 279   count = 0u;
 280
 281
 282   /* --------------------
 283    * Stage2 process
 284    * -------------------*/
 285
 286   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 287    * So, to loop unroll over blockSize2,
 288    * srcBLen should be greater than or equal to 4 */
 289   if(srcBLen >= 4u)
 290   {
 291     /* Loop unroll over blockSize2, by 4: each pass produces four output samples */
 292     blkCnt = blockSize2 >> 2u;
 293
 294     while(blkCnt > 0u)
 295     {
 296       /* Step py back by one sample so that SIMD reads through py pick up two adjacent inputB samples */
      py = py - 1u;
 297
 298       /* Set all accumulators to zero */
 299       acc0 = 0;
 300       acc1 = 0;
 301       acc2 = 0;
 302       acc3 = 0;
 303
 304
 305       /* read x[0], x[1] samples */
 306       x0 = *__SIMD32(px);
 307       /* read x[1], x[2] samples */
 308       x1 = _SIMD32_OFFSET(px+1);
 309           px+= 2u;
 310
 311
 312       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 313       k = srcBLen >> 2u;
 314
 315       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 316        ** The k == 1/2/3 branches below compute MACs for the remaining 1 to 3 samples.
 317        ** The index comments inside the loop describe its first iteration. */
 318       do
 319       {
 320         /* Read the last two inputB samples using SIMD:
 321          * y[srcBLen - 1] and y[srcBLen - 2] */
 322         c0 = *__SIMD32(py)--;
 323
 324         /* acc0 +=  x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
 325         acc0 = __SMLALDX(x0, c0, acc0);
 326
 327         /* acc1 +=  x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
 328         acc1 = __SMLALDX(x1, c0, acc1);
 329
 330         /* Read x[2], x[3] */
 331         x2 = *__SIMD32(px);
 332
 333         /* Read x[3], x[4] */
 334         x3 = _SIMD32_OFFSET(px+1);
 335
 336         /* acc2 +=  x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
 337         acc2 = __SMLALDX(x2, c0, acc2);
 338
 339         /* acc3 +=  x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
 340         acc3 = __SMLALDX(x3, c0, acc3);
 341
 342         /* Read y[srcBLen - 3] and y[srcBLen - 4] */
 343         c0 = *__SIMD32(py)--;
 344
 345         /* acc0 +=  x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
 346         acc0 = __SMLALDX(x2, c0, acc0);
 347
 348         /* acc1 +=  x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
 349         acc1 = __SMLALDX(x3, c0, acc1);
 350
 351         /* Read x[4], x[5] */
 352         x0 = _SIMD32_OFFSET(px+2);
 353
 354         /* Read x[5], x[6] */
 355         x1 = _SIMD32_OFFSET(px+3);
 356                 px += 4u;
 357
 358         /* acc2 +=  x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
 359         acc2 = __SMLALDX(x0, c0, acc2);
 360
 361         /* acc3 +=  x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
 362         acc3 = __SMLALDX(x1, c0, acc3);
 363
 364       } while(--k);
 365
 366       /* For the next MAC operations, SIMD is not used
 367        * So, the 16 bit pointer of inputB, py is updated */
 368
 369       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 370        ** No loop unrolling is used. */
 371       k = srcBLen % 0x4u;
 372
 373       if(k == 1u)
 374       {
 375         /* Read the one remaining inputB sample (the sample just above py) */
 376         c0 = *(py+1);
 377
 378 #ifdef  ARM_MATH_BIG_ENDIAN
 379
 380         c0 = c0 << 16u;
 381
 382 #else
 383
 384         c0 = c0 & 0x0000FFFF;
 385
 386 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 387         /* SIMD read of the next inputA samples at px; x0 and x1 still hold the pairs read last in the loop */
 388         x3 = *__SIMD32(px);
 389                 px++;
 390
 391         /* Perform the multiply-accumulates: one MAC is added to each of the four accumulators */
 392         acc0 = __SMLALD(x0, c0, acc0);
 393         acc1 = __SMLALD(x1, c0, acc1);
 394         acc2 = __SMLALDX(x1, c0, acc2);
 395         acc3 = __SMLALDX(x3, c0, acc3);
 396       }
 397
 398       if(k == 2u)
 399       {
 400         /* Read the two remaining inputB samples with one SIMD access at py */
 401         c0 = _SIMD32_OFFSET(py);
 402
 403         /* SIMD read of the next inputA samples at px */
 404         x3 = *__SIMD32(px);
 405
 406         /* SIMD read of the inputA samples at px + 1 */
 407         x2 = _SIMD32_OFFSET(px+1);
 408                 px += 2u;
 409
 410         /* Perform the multiply-accumulates: two MACs are added to each of the four accumulators */
 411         acc0 = __SMLALDX(x0, c0, acc0);
 412         acc1 = __SMLALDX(x1, c0, acc1);
 413         acc2 = __SMLALDX(x3, c0, acc2);
 414         acc3 = __SMLALDX(x2, c0, acc3);
 415       }
 416
 417       if(k == 3u)
 418       {
 419         /* Read the first two of the three remaining inputB samples with one SIMD access at py */
 420         c0 = _SIMD32_OFFSET(py);
 421
 422         /* SIMD read of the next inputA samples at px */
 423         x3 = *__SIMD32(px);
 424
 425         /* SIMD read of the inputA samples at px + 1 */
 426         x2 = _SIMD32_OFFSET(px+1);
 427
 428         /* Perform the multiply-accumulates for the first two remaining inputB samples */
 429         acc0 = __SMLALDX(x0, c0, acc0);
 430         acc1 = __SMLALDX(x1, c0, acc1);
 431         acc2 = __SMLALDX(x3, c0, acc2);
 432         acc3 = __SMLALDX(x2, c0, acc3);
 433
      /* Read the last remaining inputB sample (the sample just below py) */
 434                 c0 = *(py-1);
 435
 436 #ifdef  ARM_MATH_BIG_ENDIAN
 437
 438         c0 = c0 << 16u;
 439 #else
 440
 441         c0 = c0 & 0x0000FFFF;
 442 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 443         /* SIMD read of the inputA samples at px + 2 */
 444         x3 =  _SIMD32_OFFSET(px+2);
 445                 px += 3u;
 446
 447         /* Perform the multiply-accumulates for the last remaining inputB sample */
 448         acc0 = __SMLALDX(x1, c0, acc0);
 449         acc1 = __SMLALD(x2, c0, acc1);
 450         acc2 = __SMLALDX(x2, c0, acc2);
 451         acc3 = __SMLALDX(x3, c0, acc3);
 452       }
 453
 454
 455       /* Shift each accumulator right by 15, saturate to 16 bits, and store the four results
 456        * as two packed 32-bit writes; the packing order depends on endianness. */
 457
 458 #ifndef  ARM_MATH_BIG_ENDIAN
 459
 460       *__SIMD32(pOut)++ =
 461         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
 462       *__SIMD32(pOut)++ =
 463         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
 464
 465 #else
 466
 467       *__SIMD32(pOut)++ =
 468         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
 469       *__SIMD32(pOut)++ =
 470         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
 471
 472 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 473
 474       /* Increment the pointer pIn1 index, count by 4 */
 475       count += 4u;
 476
 477       /* Update the inputA and inputB pointers for next MAC calculation */
 478       px = pIn1 + count;
 479       py = pSrc2;
 480
 481        /* Decrement the loop counter */
 482       blkCnt--;
 483     }
 484
 485     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 486      ** No loop unrolling is used over the output samples. */
 487     blkCnt = blockSize2 % 0x4u;
 488
 489     while(blkCnt > 0u)
 490     {
 491       /* Accumulator is made zero for every iteration */
 492       sum = 0;
 493
 494       /* Apply loop unrolling over the MACs: 4 MACs per pass, without SIMD. */
 495       k = srcBLen >> 2u;
 496
 497       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 498        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 499       while(k > 0u)
 500       {
 501         /* Perform the multiply-accumulates */
 502         sum += (q63_t) ((q31_t) * px++ * *py--);
 503         sum += (q63_t) ((q31_t) * px++ * *py--);
 504         sum += (q63_t) ((q31_t) * px++ * *py--);
 505         sum += (q63_t) ((q31_t) * px++ * *py--);
 506
 507         /* Decrement the loop counter */
 508         k--;
 509       }
 510
 511       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 512        ** No loop unrolling is used. */
 513       k = srcBLen % 0x4u;
 514
 515       while(k > 0u)
 516       {
 517         /* Perform the multiply-accumulates */
 518         sum += (q63_t) ((q31_t) * px++ * *py--);
 519
 520         /* Decrement the loop counter */
 521         k--;
 522       }
 523
 524       /* Shift the accumulator right by 15, saturate to 16 bits and store it in the destination buffer. */
 525       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
 526
 527       /* Increment the pointer pIn1 index, count by 1 */
 528       count++;
 529
 530       /* Update the inputA and inputB pointers for next MAC calculation */
 531       px = pIn1 + count;
 532       py = pSrc2;
 533
 534       /* Decrement the loop counter */
 535       blkCnt--;
 536     }
 537   }
 538   else
 539   {
 540     /* If the srcBLen is less than 4,
 541      * the blockSize2 loop is not unrolled by 4 */
 542     blkCnt = blockSize2;
 543
 544     while(blkCnt > 0u)
 545     {
 546       /* Accumulator is made zero for every iteration */
 547       sum = 0;
 548
 549       /* srcBLen number of MACS should be performed */
 550       k = srcBLen;
 551
 552       while(k > 0u)
 553       {
 554         /* Perform the multiply-accumulate */
 555         sum += (q63_t) ((q31_t) * px++ * *py--);
 556
 557         /* Decrement the loop counter */
 558         k--;
 559       }
 560
 561       /* Shift the accumulator right by 15, saturate to 16 bits and store it in the destination buffer. */
 562       *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
 563
 564       /* Increment the pointer pIn1 index, count by 1 */
 565       count++;
 566
 567       /* Update the inputA and inputB pointers for next MAC calculation */
 568       px = pIn1 + count;
 569       py = pSrc2;
 570
 571       /* Decrement the loop counter */
 572       blkCnt--;
 573     }
 574   }
 575
 576
 577   /* --------------------------
 578    * Initializations of stage3
 579    * -------------------------*/
 580
 581   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 582    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 583    * ....
 584    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 585    * sum +=  x[srcALen-1] * y[srcBLen-1]
 586    */
 587
 588   /* In this stage the MAC operations are decreased by 1 for every iteration.
 589      The blockSize3 variable holds the number of MAC operations performed */
 590
 591   blockSize3 = srcBLen - 1u;
 592
 593   /* Working pointer of inputA: starts at x[srcALen - srcBLen + 1] */
 594   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 595   px = pSrc1;
 596
 597   /* Working pointer of inputB: pSrc2 is the last inputB sample, pIn2 is moved one sample below it for SIMD reads */
 598   pSrc2 = pIn2 + (srcBLen - 1u);
 599   pIn2 = pSrc2 - 1u;
 600   py = pIn2;
 601
 602   /* -------------------
 603    * Stage3 process
 604    * ------------------*/
 605
 606   /* For loop unrolling by 4, this stage is divided into two. */
 607   /* First part of this stage computes (blockSize3 >> 2) output samples using SIMD reads */
 608   /* Second part of this stage computes the remaining output samples one MAC at a time */
 609
 610   /* The first part of the stage starts here */
 611   j = blockSize3 >> 2u;
 612
 613   while((j > 0u) && (blockSize3 > 0u))
 614   {
 615     /* Accumulator is made zero for every iteration */
 616     sum = 0;
 617
 618     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 619     k = blockSize3 >> 2u;
 620
 621     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 622      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 623     while(k > 0u)
 624     {
 625       /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
 626        * with y[srcBLen - 1], y[srcBLen - 2] respectively */
 627       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 628       /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
 629        * with y[srcBLen - 3], y[srcBLen - 4] respectively */
 630       sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
 631
 632       /* Decrement the loop counter */
 633       k--;
 634     }
 635
 636     /* For the next MAC operations, the pointer py is used without SIMD
 637      * So, py is incremented by 1 */
 638     py = py + 1u;
 639
 640     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 641      ** No loop unrolling is used. */
 642     k = blockSize3 % 0x4u;
 643
 644     while(k > 0u)
 645     {
 646       /* Multiply the next inputA sample by the next lower inputB sample and accumulate */
 647       sum = __SMLALD(*px++, *py--, sum);
 648
 649       /* Decrement the loop counter */
 650       k--;
 651     }
 652
 653     /* Shift the accumulator right by 15, saturate to 16 bits and store it in the destination buffer. */
 654     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 655
 656     /* Update the inputA and inputB pointers for next MAC calculation */
 657     px = ++pSrc1;
 658     py = pIn2;
 659
 660     /* Decrement the loop counter */
 661     blockSize3--;
 662
 663     j--;
 664   }
 665
 666   /* The second part of the stage starts here */
 667   /* SIMD is not used for the next MAC operations,
 668    * so pointer py is updated to read only one sample at a time */
 669   py = py + 1u;
 670
 671   while(blockSize3 > 0u)
 672   {
 673     /* Accumulator is made zero for every iteration */
 674     sum = 0;
 675
 676     /* blockSize3 MACs are performed for this output sample; no loop unrolling is used. */
 677     k = blockSize3;
 678
 679     while(k > 0u)
 680     {
 681       /* Perform the multiply-accumulates */
 682       /* For the final output sample this is sum +=  x[srcALen-1] * y[srcBLen-1] */
 683       sum = __SMLALD(*px++, *py--, sum);
 684
 685       /* Decrement the loop counter */
 686       k--;
 687     }
 688
 689     /* Shift the accumulator right by 15, saturate to 16 bits and store it in the destination buffer. */
 690     *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
 691
 692     /* Update the inputA and inputB pointers for next MAC calculation */
 693     px = ++pSrc1;
 694     py = pSrc2;
 695
 696     /* Decrement the loop counter */
 697     blockSize3--;
 698   }
 699
 700 #else
 701
 702 /* Run the below code for other targets (e.g. Cortex-M0), or when UNALIGNED_SUPPORT_DISABLE is defined */
 703
 704   q15_t *pIn1 = pSrcA;                           /* first input sequence pointer */
 705   q15_t *pIn2 = pSrcB;                           /* second input sequence pointer */
 706   q63_t sum;                                     /* Accumulator */
 707   uint32_t i, j;                                 /* loop counter */
 708
 709   /* Loop to calculate output of convolution for output length number of times */
 710   for (i = 0; i < (srcALen + srcBLen - 1); i++)
 711   {
 712     /* Initialize sum with zero to carry on MAC operations */
 713     sum = 0;
 714
 715     /* Loop to perform MAC operations according to convolution equation */
 716     for (j = 0; j <= i; j++)
 717     {
 718       /* Check the array limitations: skip index pairs that fall outside either input */
 719       if(((i - j) < srcBLen) && (j < srcALen))
 720       {
 721         /* z[i] += x[j] * y[i-j] */
 722         sum += (q31_t) pIn1[j] * (pIn2[i - j]);
 723       }
 724     }
 725
 726     /* Shift the accumulator right by 15, saturate to 16 bits and store the output in the destination buffer */
 727     pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
 728   }
 729
 730 #endif /*  #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/
 731
 732 }
 731
 732 /**
 733  * @} end of Conv group
 734  */