tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_correlate_q15.c
   9 *
  10 * Description:  Correlation of Q15 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup Corr
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Correlation of Q15 sequences.
  54  * @param[in] *pSrcA points to the first input sequence.
  55  * @param[in] srcALen length of the first input sequence.
  56  * @param[in] *pSrcB points to the second input sequence.
  57  * @param[in] srcBLen length of the second input sequence.
  58  * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
  59  * @return none.
  60  *
  61  * @details
  62  * <b>Scaling and Overflow Behavior:</b>
  63  *
  64  * \par
  65  * The function is implemented using a 64-bit internal accumulator.
  66  * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
  67  * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
  68  * This approach provides 33 guard bits and there is no risk of overflow.
  69  * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
  70  *
  71  * \par
  72  * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  73  *
  74  * \par
  75  * Refer the function <code>arm_correlate_opt_q15()</code> for a faster implementation of this function using scratch buffers.
  76  *
  77  */
  78
  79 void arm_correlate_q15(
  80   q15_t * pSrcA,
  81   uint32_t srcALen,
  82   q15_t * pSrcB,
  83   uint32_t srcBLen,
  84   q15_t * pDst)
  85 {
  86
  87 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
  88
  89   /* Run the below code for Cortex-M4 and Cortex-M3 */
  90
  91   q15_t *pIn1;                                   /* inputA pointer               */
  92   q15_t *pIn2;                                   /* inputB pointer               */
  93   q15_t *pOut = pDst;                            /* output pointer               */
  94   q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  95   q15_t *px;                                     /* Intermediate inputA pointer  */
  96   q15_t *py;                                     /* Intermediate inputB pointer  */
  97   q15_t *pSrc1;                                  /* Intermediate pointers        */
  98   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */
  99   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */
 100   int32_t inc = 1;                               /* Destination address modifier */
 101
 102
 103   /* The algorithm implementation is based on the lengths of the inputs. */
 104   /* srcB is always made to slide across srcA. */
 105   /* So srcBLen is always considered as shorter or equal to srcALen */
 106   /* But CORR(x, y) is reverse of CORR(y, x) */
 107   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 108   /* and the destination pointer modifier, inc is set to -1 */
 109   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
 110   /* But to improve the performance,
 111    * we include zeroes in the output instead of zero padding either of the the inputs*/
 112   /* If srcALen > srcBLen,
 113    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
 114   /* If srcALen < srcBLen,
 115    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
 116   if(srcALen >= srcBLen)
 117   {
 118     /* Initialization of inputA pointer */
 119     pIn1 = (pSrcA);
 120
 121     /* Initialization of inputB pointer */
 122     pIn2 = (pSrcB);
 123
 124     /* Number of output samples is calculated */
 125     outBlockSize = (2u * srcALen) - 1u;
 126
 127     /* When srcALen > srcBLen, zero padding is done to srcB
 128      * to make their lengths equal.
 129      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
 130      * number of output samples are made zero */
 131     j = outBlockSize - (srcALen + (srcBLen - 1u));
 132
 133     /* Updating the pointer position to non zero value */
 134     pOut += j;
 135
 136   }
 137   else
 138   {
 139     /* Initialization of inputA pointer */
 140     pIn1 = (pSrcB);
 141
 142     /* Initialization of inputB pointer */
 143     pIn2 = (pSrcA);
 144
 145     /* srcBLen is always considered as shorter or equal to srcALen */
 146     j = srcBLen;
 147     srcBLen = srcALen;
 148     srcALen = j;
 149
 150     /* CORR(x, y) = Reverse order(CORR(y, x)) */
 151     /* Hence set the destination pointer to point to the last output sample */
 152     pOut = pDst + ((srcALen + srcBLen) - 2u);
 153
 154     /* Destination address modifier is set to -1 */
 155     inc = -1;
 156
 157   }
 158
 159   /* The function is internally
 160    * divided into three parts according to the number of multiplications that has to be
 161    * taken place between inputA samples and inputB samples. In the first part of the
 162    * algorithm, the multiplications increase by one for every iteration.
 163    * In the second part of the algorithm, srcBLen number of multiplications are done.
 164    * In the third part of the algorithm, the multiplications decrease by one
 165    * for every iteration.*/
 166   /* The algorithm is implemented in three stages.
 167    * The loop counters of each stage is initiated here. */
 168   blockSize1 = srcBLen - 1u;
 169   blockSize2 = srcALen - (srcBLen - 1u);
 170   blockSize3 = blockSize1;
 171
 172   /* --------------------------
 173    * Initializations of stage1
 174    * -------------------------*/
 175
 176   /* sum = x[0] * y[srcBlen - 1]
 177    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
 178    * ....
 179    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
 180    */
 181
 182   /* In this stage the MAC operations are increased by 1 for every iteration.
 183      The count variable holds the number of MAC operations performed */
 184   count = 1u;
 185
 186   /* Working pointer of inputA */
 187   px = pIn1;
 188
 189   /* Working pointer of inputB */
 190   pSrc1 = pIn2 + (srcBLen - 1u);
 191   py = pSrc1;
 192
 193   /* ------------------------
 194    * Stage1 process
 195    * ----------------------*/
 196
 197   /* The first loop starts here */
 198   while(blockSize1 > 0u)
 199   {
 200     /* Accumulator is made zero for every iteration */
 201     sum = 0;
 202
 203     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 204     k = count >> 2;
 205
 206     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 207      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 208     while(k > 0u)
 209     {
 210       /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
 211       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 212       /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
 213       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 214
 215       /* Decrement the loop counter */
 216       k--;
 217     }
 218
 219     /* If the count is not a multiple of 4, compute any remaining MACs here.
 220      ** No loop unrolling is used. */
 221     k = count % 0x4u;
 222
 223     while(k > 0u)
 224     {
 225       /* Perform the multiply-accumulates */
 226       /* x[0] * y[srcBLen - 1] */
 227       sum = __SMLALD(*px++, *py++, sum);
 228
 229       /* Decrement the loop counter */
 230       k--;
 231     }
 232
 233     /* Store the result in the accumulator in the destination buffer. */
 234     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
 235     /* Destination pointer is updated according to the address modifier, inc */
 236     pOut += inc;
 237
 238     /* Update the inputA and inputB pointers for next MAC calculation */
 239     py = pSrc1 - count;
 240     px = pIn1;
 241
 242     /* Increment the MAC count */
 243     count++;
 244
 245     /* Decrement the loop counter */
 246     blockSize1--;
 247   }
 248
 249   /* --------------------------
 250    * Initializations of stage2
 251    * ------------------------*/
 252
 253   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
 254    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
 255    * ....
 256    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 257    */
 258
 259   /* Working pointer of inputA */
 260   px = pIn1;
 261
 262   /* Working pointer of inputB */
 263   py = pIn2;
 264
 265   /* count is index by which the pointer pIn1 to be incremented */
 266   count = 0u;
 267
 268   /* -------------------
 269    * Stage2 process
 270    * ------------------*/
 271
 272   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 273    * So, to loop unroll over blockSize2,
 274    * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
 275   if(srcBLen >= 4u)
 276   {
 277     /* Loop unroll over blockSize2, by 4 */
 278     blkCnt = blockSize2 >> 2u;
 279
 280     while(blkCnt > 0u)
 281     {
 282       /* Set all accumulators to zero */
 283       acc0 = 0;
 284       acc1 = 0;
 285       acc2 = 0;
 286       acc3 = 0;
 287
 288       /* read x[0], x[1] samples */
 289       x0 = *__SIMD32(px);
 290       /* read x[1], x[2] samples */
 291       x1 = _SIMD32_OFFSET(px + 1);
 292           px += 2u;
 293
 294       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 295       k = srcBLen >> 2u;
 296
 297       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 298        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 299       do
 300       {
 301         /* Read the first two inputB samples using SIMD:
 302          * y[0] and y[1] */
 303         c0 = *__SIMD32(py)++;
 304
 305         /* acc0 +=  x[0] * y[0] + x[1] * y[1] */
 306         acc0 = __SMLALD(x0, c0, acc0);
 307
 308         /* acc1 +=  x[1] * y[0] + x[2] * y[1] */
 309         acc1 = __SMLALD(x1, c0, acc1);
 310
 311         /* Read x[2], x[3] */
 312         x2 = *__SIMD32(px);
 313
 314         /* Read x[3], x[4] */
 315         x3 = _SIMD32_OFFSET(px + 1);
 316
 317         /* acc2 +=  x[2] * y[0] + x[3] * y[1] */
 318         acc2 = __SMLALD(x2, c0, acc2);
 319
 320         /* acc3 +=  x[3] * y[0] + x[4] * y[1] */
 321         acc3 = __SMLALD(x3, c0, acc3);
 322
 323         /* Read y[2] and y[3] */
 324         c0 = *__SIMD32(py)++;
 325
 326         /* acc0 +=  x[2] * y[2] + x[3] * y[3] */
 327         acc0 = __SMLALD(x2, c0, acc0);
 328
 329         /* acc1 +=  x[3] * y[2] + x[4] * y[3] */
 330         acc1 = __SMLALD(x3, c0, acc1);
 331
 332         /* Read x[4], x[5] */
 333         x0 = _SIMD32_OFFSET(px + 2);
 334
 335         /* Read x[5], x[6] */
 336         x1 = _SIMD32_OFFSET(px + 3);
 337
 338                 px += 4u;
 339
 340         /* acc2 +=  x[4] * y[2] + x[5] * y[3] */
 341         acc2 = __SMLALD(x0, c0, acc2);
 342
 343         /* acc3 +=  x[5] * y[2] + x[6] * y[3] */
 344         acc3 = __SMLALD(x1, c0, acc3);
 345
 346       } while(--k);
 347
 348       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 349        ** No loop unrolling is used. */
 350       k = srcBLen % 0x4u;
 351
 352       if(k == 1u)
 353       {
 354         /* Read y[4] */
 355         c0 = *py;
 356 #ifdef  ARM_MATH_BIG_ENDIAN
 357
 358         c0 = c0 << 16u;
 359
 360 #else
 361
 362         c0 = c0 & 0x0000FFFF;
 363
 364 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 365         /* Read x[7] */
 366         x3 = *__SIMD32(px);
 367                 px++;
 368
 369         /* Perform the multiply-accumulates */
 370         acc0 = __SMLALD(x0, c0, acc0);
 371         acc1 = __SMLALD(x1, c0, acc1);
 372         acc2 = __SMLALDX(x1, c0, acc2);
 373         acc3 = __SMLALDX(x3, c0, acc3);
 374       }
 375
 376       if(k == 2u)
 377       {
 378         /* Read y[4], y[5] */
 379         c0 = *__SIMD32(py);
 380
 381         /* Read x[7], x[8] */
 382         x3 = *__SIMD32(px);
 383
 384         /* Read x[9] */
 385         x2 = _SIMD32_OFFSET(px + 1);
 386                 px += 2u;
 387
 388         /* Perform the multiply-accumulates */
 389         acc0 = __SMLALD(x0, c0, acc0);
 390         acc1 = __SMLALD(x1, c0, acc1);
 391         acc2 = __SMLALD(x3, c0, acc2);
 392         acc3 = __SMLALD(x2, c0, acc3);
 393       }
 394
 395       if(k == 3u)
 396       {
 397         /* Read y[4], y[5] */
 398         c0 = *__SIMD32(py)++;
 399
 400         /* Read x[7], x[8] */
 401         x3 = *__SIMD32(px);
 402
 403         /* Read x[9] */
 404         x2 = _SIMD32_OFFSET(px + 1);
 405
 406         /* Perform the multiply-accumulates */
 407         acc0 = __SMLALD(x0, c0, acc0);
 408         acc1 = __SMLALD(x1, c0, acc1);
 409         acc2 = __SMLALD(x3, c0, acc2);
 410         acc3 = __SMLALD(x2, c0, acc3);
 411
 412         c0 = (*py);
 413
 414         /* Read y[6] */
 415 #ifdef  ARM_MATH_BIG_ENDIAN
 416
 417         c0 = c0 << 16u;
 418 #else
 419
 420         c0 = c0 & 0x0000FFFF;
 421 #endif /*      #ifdef  ARM_MATH_BIG_ENDIAN     */
 422         /* Read x[10] */
 423         x3 = _SIMD32_OFFSET(px + 2);
 424                 px += 3u;
 425
 426         /* Perform the multiply-accumulates */
 427         acc0 = __SMLALDX(x1, c0, acc0);
 428         acc1 = __SMLALD(x2, c0, acc1);
 429         acc2 = __SMLALDX(x2, c0, acc2);
 430         acc3 = __SMLALDX(x3, c0, acc3);
 431       }
 432
 433       /* Store the result in the accumulator in the destination buffer. */
 434       *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
 435       /* Destination pointer is updated according to the address modifier, inc */
 436       pOut += inc;
 437
 438       *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
 439       pOut += inc;
 440
 441       *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
 442       pOut += inc;
 443
 444       *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
 445       pOut += inc;
 446
 447       /* Increment the count by 4 as 4 output values are computed */
 448       count += 4u;
 449
 450       /* Update the inputA and inputB pointers for next MAC calculation */
 451       px = pIn1 + count;
 452       py = pIn2;
 453
 454       /* Decrement the loop counter */
 455       blkCnt--;
 456     }
 457
 458     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 459      ** No loop unrolling is used. */
 460     blkCnt = blockSize2 % 0x4u;
 461
 462     while(blkCnt > 0u)
 463     {
 464       /* Accumulator is made zero for every iteration */
 465       sum = 0;
 466
 467       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 468       k = srcBLen >> 2u;
 469
 470       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 471        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 472       while(k > 0u)
 473       {
 474         /* Perform the multiply-accumulates */
 475         sum += ((q63_t) * px++ * *py++);
 476         sum += ((q63_t) * px++ * *py++);
 477         sum += ((q63_t) * px++ * *py++);
 478         sum += ((q63_t) * px++ * *py++);
 479
 480         /* Decrement the loop counter */
 481         k--;
 482       }
 483
 484       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 485        ** No loop unrolling is used. */
 486       k = srcBLen % 0x4u;
 487
 488       while(k > 0u)
 489       {
 490         /* Perform the multiply-accumulates */
 491         sum += ((q63_t) * px++ * *py++);
 492
 493         /* Decrement the loop counter */
 494         k--;
 495       }
 496
 497       /* Store the result in the accumulator in the destination buffer. */
 498       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
 499       /* Destination pointer is updated according to the address modifier, inc */
 500       pOut += inc;
 501
 502       /* Increment count by 1, as one output value is computed */
 503       count++;
 504
 505       /* Update the inputA and inputB pointers for next MAC calculation */
 506       px = pIn1 + count;
 507       py = pIn2;
 508
 509       /* Decrement the loop counter */
 510       blkCnt--;
 511     }
 512   }
 513   else
 514   {
 515     /* If the srcBLen is not a multiple of 4,
 516      * the blockSize2 loop cannot be unrolled by 4 */
 517     blkCnt = blockSize2;
 518
 519     while(blkCnt > 0u)
 520     {
 521       /* Accumulator is made zero for every iteration */
 522       sum = 0;
 523
 524       /* Loop over srcBLen */
 525       k = srcBLen;
 526
 527       while(k > 0u)
 528       {
 529         /* Perform the multiply-accumulate */
 530         sum += ((q63_t) * px++ * *py++);
 531
 532         /* Decrement the loop counter */
 533         k--;
 534       }
 535
 536       /* Store the result in the accumulator in the destination buffer. */
 537       *pOut = (q15_t) (__SSAT(sum >> 15, 16));
 538       /* Destination pointer is updated according to the address modifier, inc */
 539       pOut += inc;
 540
 541       /* Increment the MAC count */
 542       count++;
 543
 544       /* Update the inputA and inputB pointers for next MAC calculation */
 545       px = pIn1 + count;
 546       py = pIn2;
 547
 548       /* Decrement the loop counter */
 549       blkCnt--;
 550     }
 551   }
 552
 553   /* --------------------------
 554    * Initializations of stage3
 555    * -------------------------*/
 556
 557   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 558    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
 559    * ....
 560    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
 561    * sum +=  x[srcALen-1] * y[0]
 562    */
 563
 564   /* In this stage the MAC operations are decreased by 1 for every iteration.
 565      The count variable holds the number of MAC operations performed */
 566   count = srcBLen - 1u;
 567
 568   /* Working pointer of inputA */
 569   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 570   px = pSrc1;
 571
 572   /* Working pointer of inputB */
 573   py = pIn2;
 574
 575   /* -------------------
 576    * Stage3 process
 577    * ------------------*/
 578
 579   while(blockSize3 > 0u)
 580   {
 581     /* Accumulator is made zero for every iteration */
 582     sum = 0;
 583
 584     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 585     k = count >> 2u;
 586
 587     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 588      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 589     while(k > 0u)
 590     {
 591       /* Perform the multiply-accumulates */
 592       /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
 593       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 594       /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
 595       sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
 596
 597       /* Decrement the loop counter */
 598       k--;
 599     }
 600
 601     /* If the count is not a multiple of 4, compute any remaining MACs here.
 602      ** No loop unrolling is used. */
 603     k = count % 0x4u;
 604
 605     while(k > 0u)
 606     {
 607       /* Perform the multiply-accumulates */
 608       sum = __SMLALD(*px++, *py++, sum);
 609
 610       /* Decrement the loop counter */
 611       k--;
 612     }
 613
 614     /* Store the result in the accumulator in the destination buffer. */
 615     *pOut = (q15_t) (__SSAT((sum >> 15), 16));
 616     /* Destination pointer is updated according to the address modifier, inc */
 617     pOut += inc;
 618
 619     /* Update the inputA and inputB pointers for next MAC calculation */
 620     px = ++pSrc1;
 621     py = pIn2;
 622
 623     /* Decrement the MAC count */
 624     count--;
 625
 626     /* Decrement the loop counter */
 627     blockSize3--;
 628   }
 629
 630 #else
 631
 632 /* Run the below code for Cortex-M0 */
 633
 634   q15_t *pIn1 = pSrcA;                           /* inputA pointer               */
 635   q15_t *pIn2 = pSrcB + (srcBLen - 1u);          /* inputB pointer               */
 636   q63_t sum;                                     /* Accumulators                  */
 637   uint32_t i = 0u, j;                            /* loop counters */
 638   uint32_t inv = 0u;                             /* Reverse order flag */
 639   uint32_t tot = 0u;                             /* Length */
 640
 641   /* The algorithm implementation is based on the lengths of the inputs. */
 642   /* srcB is always made to slide across srcA. */
 643   /* So srcBLen is always considered as shorter or equal to srcALen */
 644   /* But CORR(x, y) is reverse of CORR(y, x) */
 645   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
 646   /* and a varaible, inv is set to 1 */
 647   /* If lengths are not equal then zero pad has to be done to  make the two
 648    * inputs of same length. But to improve the performance, we include zeroes
 649    * in the output instead of zero padding either of the the inputs*/
 650   /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
 651    * starting of the output buffer */
 652   /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
 653    * ending of the output buffer */
 654   /* Once the zero padding is done the remaining of the output is calcualted
 655    * using convolution but with the shorter signal time shifted. */
 656
 657   /* Calculate the length of the remaining sequence */
 658   tot = ((srcALen + srcBLen) - 2u);
 659
 660   if(srcALen > srcBLen)
 661   {
 662     /* Calculating the number of zeros to be padded to the output */
 663     j = srcALen - srcBLen;
 664
 665     /* Initialise the pointer after zero padding */
 666     pDst += j;
 667   }
 668
 669   else if(srcALen < srcBLen)
 670   {
 671     /* Initialization to inputB pointer */
 672     pIn1 = pSrcB;
 673
 674     /* Initialization to the end of inputA pointer */
 675     pIn2 = pSrcA + (srcALen - 1u);
 676
 677     /* Initialisation of the pointer after zero padding */
 678     pDst = pDst + tot;
 679
 680     /* Swapping the lengths */
 681     j = srcALen;
 682     srcALen = srcBLen;
 683     srcBLen = j;
 684
 685     /* Setting the reverse flag */
 686     inv = 1;
 687
 688   }
 689
 690   /* Loop to calculate convolution for output length number of times */
 691   for (i = 0u; i <= tot; i++)
 692   {
 693     /* Initialize sum with zero to carry on MAC operations */
 694     sum = 0;
 695
 696     /* Loop to perform MAC operations according to convolution equation */
 697     for (j = 0u; j <= i; j++)
 698     {
 699       /* Check the array limitations */
 700       if((((i - j) < srcBLen) && (j < srcALen)))
 701       {
 702         /* z[i] += x[i-j] * y[j] */
 703         sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
 704       }
 705     }
 706     /* Store the output in the destination buffer */
 707     if(inv == 1)
 708       *pDst-- = (q15_t) __SSAT((sum >> 15u), 16u);
 709     else
 710       *pDst++ = (q15_t) __SSAT((sum >> 15u), 16u);
 711   }
 712
 713 #endif /*#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
 714
 715 }
 716
 717 /**
 718  * @} end of Corr group
 719  */