tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_opt_q15.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_opt_q15.c
   9 *
  10 * Description:  Partial convolution of Q15 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup PartialConv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Partial convolution of Q15 sequences.
  54  * @param[in]       *pSrcA points to the first input sequence.
  55  * @param[in]       srcALen length of the first input sequence.
  56  * @param[in]       *pSrcB points to the second input sequence.
  57  * @param[in]       srcBLen length of the second input sequence.
  58  * @param[out]      *pDst points to the location where the output result is written.
  59  * @param[in]       firstIndex is the first output sample to start with.
  60  * @param[in]       numPoints is the number of output points to be computed.
  61  * @param[in]       *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
  62  * @param[in]       *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
  63  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  64  *
  65  * \par Restrictions
  66  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
  67  *  In this case the input, output and scratch buffers should be 32-bit aligned.
  68  *
  69  * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
  70  *
  71  *
  72  */
  73
  74 #ifndef UNALIGNED_SUPPORT_DISABLE
  75
  76 arm_status arm_conv_partial_opt_q15(
  77   q15_t * pSrcA,
  78   uint32_t srcALen,
  79   q15_t * pSrcB,
  80   uint32_t srcBLen,
  81   q15_t * pDst,
  82   uint32_t firstIndex,
  83   uint32_t numPoints,
  84   q15_t * pScratch1,
  85   q15_t * pScratch2)
  86 {
  87
  88   q15_t *pOut = pDst;                            /* output pointer */
  89   q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch1 */
  90   q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch1 */
  91   q63_t acc0, acc1, acc2, acc3;                  /* Accumulator */
  92   q31_t x1, x2, x3;                              /* Temporary variables to hold state and coefficient values */
  93   q31_t y1, y2;                                  /* State variables */
  94   q15_t *pIn1;                                   /* inputA pointer */
  95   q15_t *pIn2;                                   /* inputB pointer */
  96   q15_t *px;                                     /* Intermediate inputA pointer  */
  97   q15_t *py;                                     /* Intermediate inputB pointer  */
  98   uint32_t j, k, blkCnt;                         /* loop counter */
  99   arm_status status;                             /* Status variable */
 100   uint32_t tapCnt;                               /* loop count */
 101
 102   /* Check for range of output samples to be calculated */
 103   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 104   {
 105     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 106     status = ARM_MATH_ARGUMENT_ERROR;
 107   }
 108   else
 109   {
 110
 111     /* The algorithm implementation is based on the lengths of the inputs. */
 112     /* srcB is always made to slide across srcA. */
 113     /* So srcBLen is always considered as shorter or equal to srcALen */
 114     if(srcALen >= srcBLen)
 115     {
 116       /* Initialization of inputA pointer */
 117       pIn1 = pSrcA;
 118
 119       /* Initialization of inputB pointer */
 120       pIn2 = pSrcB;
 121     }
 122     else
 123     {
 124       /* Initialization of inputA pointer */
 125       pIn1 = pSrcB;
 126
 127       /* Initialization of inputB pointer */
 128       pIn2 = pSrcA;
 129
 130       /* srcBLen is always considered as shorter or equal to srcALen */
 131       j = srcBLen;
 132       srcBLen = srcALen;
 133       srcALen = j;
 134     }
 135
 136     /* Temporary pointer for scratch2 */
 137     py = pScratch2;
 138
 139     /* pointer to take end of scratch2 buffer */
 140     pScr2 = pScratch2 + srcBLen - 1;
 141
 142     /* points to smaller length sequence */
 143     px = pIn2;
 144
 145     /* Apply loop unrolling and do 4 Copies simultaneously. */
 146     k = srcBLen >> 2u;
 147
 148     /* First part of the processing with loop unrolling copies 4 data points at a time.
 149      ** a second loop below copies for the remaining 1 to 3 samples. */
 150     while(k > 0u)
 151     {
 152       /* copy second buffer in reversal manner */
 153       *pScr2-- = *px++;
 154       *pScr2-- = *px++;
 155       *pScr2-- = *px++;
 156       *pScr2-- = *px++;
 157
 158       /* Decrement the loop counter */
 159       k--;
 160     }
 161
 162     /* If the count is not a multiple of 4, copy remaining samples here.
 163      ** No loop unrolling is used. */
 164     k = srcBLen % 0x4u;
 165
 166     while(k > 0u)
 167     {
 168       /* copy second buffer in reversal manner for remaining samples */
 169       *pScr2-- = *px++;
 170
 171       /* Decrement the loop counter */
 172       k--;
 173     }
 174
 175     /* Initialze temporary scratch pointer */
 176     pScr1 = pScratch1;
 177
 178     /* Fill (srcBLen - 1u) zeros in scratch buffer */
 179     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 180
 181     /* Update temporary scratch pointer */
 182     pScr1 += (srcBLen - 1u);
 183
 184     /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
 185
 186     /* Copy (srcALen) samples in scratch buffer */
 187     arm_copy_q15(pIn1, pScr1, srcALen);
 188
 189     /* Update pointers */
 190     pScr1 += srcALen;
 191
 192     /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
 193     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 194
 195     /* Update pointer */
 196     pScr1 += (srcBLen - 1u);
 197
 198     /* Initialization of pIn2 pointer */
 199     pIn2 = py;
 200
 201     pScratch1 += firstIndex;
 202
 203     pOut = pDst + firstIndex;
 204
 205     /* Actual convolution process starts here */
 206     blkCnt = (numPoints) >> 2;
 207
 208     while(blkCnt > 0)
 209     {
 210       /* Initialze temporary scratch pointer as scratch1 */
 211       pScr1 = pScratch1;
 212
 213       /* Clear Accumlators */
 214       acc0 = 0;
 215       acc1 = 0;
 216       acc2 = 0;
 217       acc3 = 0;
 218
 219       /* Read two samples from scratch1 buffer */
 220       x1 = *__SIMD32(pScr1)++;
 221
 222       /* Read next two samples from scratch1 buffer */
 223       x2 = *__SIMD32(pScr1)++;
 224
 225       tapCnt = (srcBLen) >> 2u;
 226
 227       while(tapCnt > 0u)
 228       {
 229
 230         /* Read four samples from smaller buffer */
 231         y1 = _SIMD32_OFFSET(pIn2);
 232         y2 = _SIMD32_OFFSET(pIn2 + 2u);
 233
 234         /* multiply and accumlate */
 235         acc0 = __SMLALD(x1, y1, acc0);
 236         acc2 = __SMLALD(x2, y1, acc2);
 237
 238         /* pack input data */
 239 #ifndef ARM_MATH_BIG_ENDIAN
 240         x3 = __PKHBT(x2, x1, 0);
 241 #else
 242         x3 = __PKHBT(x1, x2, 0);
 243 #endif
 244
 245         /* multiply and accumlate */
 246         acc1 = __SMLALDX(x3, y1, acc1);
 247
 248         /* Read next two samples from scratch1 buffer */
 249         x1 = _SIMD32_OFFSET(pScr1);
 250
 251         /* multiply and accumlate */
 252         acc0 = __SMLALD(x2, y2, acc0);
 253         acc2 = __SMLALD(x1, y2, acc2);
 254
 255         /* pack input data */
 256 #ifndef ARM_MATH_BIG_ENDIAN
 257         x3 = __PKHBT(x1, x2, 0);
 258 #else
 259         x3 = __PKHBT(x2, x1, 0);
 260 #endif
 261
 262         acc3 = __SMLALDX(x3, y1, acc3);
 263         acc1 = __SMLALDX(x3, y2, acc1);
 264
 265         x2 = _SIMD32_OFFSET(pScr1 + 2u);
 266
 267 #ifndef ARM_MATH_BIG_ENDIAN
 268         x3 = __PKHBT(x2, x1, 0);
 269 #else
 270         x3 = __PKHBT(x1, x2, 0);
 271 #endif
 272
 273         acc3 = __SMLALDX(x3, y2, acc3);
 274
 275         /* update scratch pointers */
 276         pIn2 += 4u;
 277         pScr1 += 4u;
 278
 279
 280         /* Decrement the loop counter */
 281         tapCnt--;
 282       }
 283
 284       /* Update scratch pointer for remaining samples of smaller length sequence */
 285       pScr1 -= 4u;
 286
 287       /* apply same above for remaining samples of smaller length sequence */
 288       tapCnt = (srcBLen) & 3u;
 289
 290       while(tapCnt > 0u)
 291       {
 292         /* accumlate the results */
 293         acc0 += (*pScr1++ * *pIn2);
 294         acc1 += (*pScr1++ * *pIn2);
 295         acc2 += (*pScr1++ * *pIn2);
 296         acc3 += (*pScr1++ * *pIn2++);
 297
 298         pScr1 -= 3u;
 299
 300         /* Decrement the loop counter */
 301         tapCnt--;
 302       }
 303
 304       blkCnt--;
 305
 306
 307       /* Store the results in the accumulators in the destination buffer. */
 308
 309 #ifndef  ARM_MATH_BIG_ENDIAN
 310
 311       *__SIMD32(pOut)++ =
 312         __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
 313       *__SIMD32(pOut)++ =
 314         __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
 315
 316 #else
 317
 318       *__SIMD32(pOut)++ =
 319         __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
 320       *__SIMD32(pOut)++ =
 321         __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
 322
 323 #endif /*      #ifndef  ARM_MATH_BIG_ENDIAN    */
 324
 325       /* Initialization of inputB pointer */
 326       pIn2 = py;
 327
 328       pScratch1 += 4u;
 329
 330     }
 331
 332
 333     blkCnt = numPoints & 0x3;
 334
 335     /* Calculate convolution for remaining samples of Bigger length sequence */
 336     while(blkCnt > 0)
 337     {
 338       /* Initialze temporary scratch pointer as scratch1 */
 339       pScr1 = pScratch1;
 340
 341       /* Clear Accumlators */
 342       acc0 = 0;
 343
 344       tapCnt = (srcBLen) >> 1u;
 345
 346       while(tapCnt > 0u)
 347       {
 348
 349         /* Read next two samples from scratch1 buffer */
 350         x1 = *__SIMD32(pScr1)++;
 351
 352         /* Read two samples from smaller buffer */
 353         y1 = *__SIMD32(pIn2)++;
 354
 355         acc0 = __SMLALD(x1, y1, acc0);
 356
 357         /* Decrement the loop counter */
 358         tapCnt--;
 359       }
 360
 361       tapCnt = (srcBLen) & 1u;
 362
 363       /* apply same above for remaining samples of smaller length sequence */
 364       while(tapCnt > 0u)
 365       {
 366
 367         /* accumlate the results */
 368         acc0 += (*pScr1++ * *pIn2++);
 369
 370         /* Decrement the loop counter */
 371         tapCnt--;
 372       }
 373
 374       blkCnt--;
 375
 376       /* Store the result in the accumulator in the destination buffer. */
 377       *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
 378
 379       /* Initialization of inputB pointer */
 380       pIn2 = py;
 381
 382       pScratch1 += 1u;
 383
 384     }
 385
 386     /* set status as ARM_MATH_SUCCESS */
 387     status = ARM_MATH_SUCCESS;
 388
 389   }
 390
 391   /* Return to application */
 392   return (status);
 393 }
 394
 395 #else
 396
 397 arm_status arm_conv_partial_opt_q15(
 398   q15_t * pSrcA,
 399   uint32_t srcALen,
 400   q15_t * pSrcB,
 401   uint32_t srcBLen,
 402   q15_t * pDst,
 403   uint32_t firstIndex,
 404   uint32_t numPoints,
 405   q15_t * pScratch1,
 406   q15_t * pScratch2)
 407 {
 408
 409   q15_t *pOut = pDst;                            /* output pointer */
 410   q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch1 */
 411   q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch1 */
 412   q63_t acc0, acc1, acc2, acc3;                  /* Accumulator */
 413   q15_t *pIn1;                                   /* inputA pointer */
 414   q15_t *pIn2;                                   /* inputB pointer */
 415   q15_t *px;                                     /* Intermediate inputA pointer  */
 416   q15_t *py;                                     /* Intermediate inputB pointer  */
 417   uint32_t j, k, blkCnt;                         /* loop counter */
 418   arm_status status;                             /* Status variable */
 419   uint32_t tapCnt;                               /* loop count */
 420   q15_t x10, x11, x20, x21;                      /* Temporary variables to hold srcA buffer */
 421   q15_t y10, y11;                                /* Temporary variables to hold srcB buffer */
 422
 423
 424   /* Check for range of output samples to be calculated */
 425   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 426   {
 427     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 428     status = ARM_MATH_ARGUMENT_ERROR;
 429   }
 430   else
 431   {
 432
 433     /* The algorithm implementation is based on the lengths of the inputs. */
 434     /* srcB is always made to slide across srcA. */
 435     /* So srcBLen is always considered as shorter or equal to srcALen */
 436     if(srcALen >= srcBLen)
 437     {
 438       /* Initialization of inputA pointer */
 439       pIn1 = pSrcA;
 440
 441       /* Initialization of inputB pointer */
 442       pIn2 = pSrcB;
 443     }
 444     else
 445     {
 446       /* Initialization of inputA pointer */
 447       pIn1 = pSrcB;
 448
 449       /* Initialization of inputB pointer */
 450       pIn2 = pSrcA;
 451
 452       /* srcBLen is always considered as shorter or equal to srcALen */
 453       j = srcBLen;
 454       srcBLen = srcALen;
 455       srcALen = j;
 456     }
 457
 458     /* Temporary pointer for scratch2 */
 459     py = pScratch2;
 460
 461     /* pointer to take end of scratch2 buffer */
 462     pScr2 = pScratch2 + srcBLen - 1;
 463
 464     /* points to smaller length sequence */
 465     px = pIn2;
 466
 467     /* Apply loop unrolling and do 4 Copies simultaneously. */
 468     k = srcBLen >> 2u;
 469
 470     /* First part of the processing with loop unrolling copies 4 data points at a time.
 471      ** a second loop below copies for the remaining 1 to 3 samples. */
 472     while(k > 0u)
 473     {
 474       /* copy second buffer in reversal manner */
 475       *pScr2-- = *px++;
 476       *pScr2-- = *px++;
 477       *pScr2-- = *px++;
 478       *pScr2-- = *px++;
 479
 480       /* Decrement the loop counter */
 481       k--;
 482     }
 483
 484     /* If the count is not a multiple of 4, copy remaining samples here.
 485      ** No loop unrolling is used. */
 486     k = srcBLen % 0x4u;
 487
 488     while(k > 0u)
 489     {
 490       /* copy second buffer in reversal manner for remaining samples */
 491       *pScr2-- = *px++;
 492
 493       /* Decrement the loop counter */
 494       k--;
 495     }
 496
 497     /* Initialze temporary scratch pointer */
 498     pScr1 = pScratch1;
 499
 500     /* Fill (srcBLen - 1u) zeros in scratch buffer */
 501     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 502
 503     /* Update temporary scratch pointer */
 504     pScr1 += (srcBLen - 1u);
 505
 506     /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
 507
 508
 509     /* Apply loop unrolling and do 4 Copies simultaneously. */
 510     k = srcALen >> 2u;
 511
 512     /* First part of the processing with loop unrolling copies 4 data points at a time.
 513      ** a second loop below copies for the remaining 1 to 3 samples. */
 514     while(k > 0u)
 515     {
 516       /* copy second buffer in reversal manner */
 517       *pScr1++ = *pIn1++;
 518       *pScr1++ = *pIn1++;
 519       *pScr1++ = *pIn1++;
 520       *pScr1++ = *pIn1++;
 521
 522       /* Decrement the loop counter */
 523       k--;
 524     }
 525
 526     /* If the count is not a multiple of 4, copy remaining samples here.
 527      ** No loop unrolling is used. */
 528     k = srcALen % 0x4u;
 529
 530     while(k > 0u)
 531     {
 532       /* copy second buffer in reversal manner for remaining samples */
 533       *pScr1++ = *pIn1++;
 534
 535       /* Decrement the loop counter */
 536       k--;
 537     }
 538
 539
 540     /* Apply loop unrolling and do 4 Copies simultaneously. */
 541     k = (srcBLen - 1u) >> 2u;
 542
 543     /* First part of the processing with loop unrolling copies 4 data points at a time.
 544      ** a second loop below copies for the remaining 1 to 3 samples. */
 545     while(k > 0u)
 546     {
 547       /* copy second buffer in reversal manner */
 548       *pScr1++ = 0;
 549       *pScr1++ = 0;
 550       *pScr1++ = 0;
 551       *pScr1++ = 0;
 552
 553       /* Decrement the loop counter */
 554       k--;
 555     }
 556
 557     /* If the count is not a multiple of 4, copy remaining samples here.
 558      ** No loop unrolling is used. */
 559     k = (srcBLen - 1u) % 0x4u;
 560
 561     while(k > 0u)
 562     {
 563       /* copy second buffer in reversal manner for remaining samples */
 564       *pScr1++ = 0;
 565
 566       /* Decrement the loop counter */
 567       k--;
 568     }
 569
 570
 571     /* Initialization of pIn2 pointer */
 572     pIn2 = py;
 573
 574     pScratch1 += firstIndex;
 575
 576     pOut = pDst + firstIndex;
 577
 578     /* Actual convolution process starts here */
 579     blkCnt = (numPoints) >> 2;
 580
 581     while(blkCnt > 0)
 582     {
 583       /* Initialze temporary scratch pointer as scratch1 */
 584       pScr1 = pScratch1;
 585
 586       /* Clear Accumlators */
 587       acc0 = 0;
 588       acc1 = 0;
 589       acc2 = 0;
 590       acc3 = 0;
 591
 592       /* Read two samples from scratch1 buffer */
 593       x10 = *pScr1++;
 594       x11 = *pScr1++;
 595
 596       /* Read next two samples from scratch1 buffer */
 597       x20 = *pScr1++;
 598       x21 = *pScr1++;
 599
 600       tapCnt = (srcBLen) >> 2u;
 601
 602       while(tapCnt > 0u)
 603       {
 604
 605         /* Read two samples from smaller buffer */
 606         y10 = *pIn2;
 607         y11 = *(pIn2 + 1u);
 608
 609         /* multiply and accumlate */
 610         acc0 += (q63_t) x10 *y10;
 611         acc0 += (q63_t) x11 *y11;
 612         acc2 += (q63_t) x20 *y10;
 613         acc2 += (q63_t) x21 *y11;
 614
 615         /* multiply and accumlate */
 616         acc1 += (q63_t) x11 *y10;
 617         acc1 += (q63_t) x20 *y11;
 618
 619         /* Read next two samples from scratch1 buffer */
 620         x10 = *pScr1;
 621         x11 = *(pScr1 + 1u);
 622
 623         /* multiply and accumlate */
 624         acc3 += (q63_t) x21 *y10;
 625         acc3 += (q63_t) x10 *y11;
 626
 627         /* Read next two samples from scratch2 buffer */
 628         y10 = *(pIn2 + 2u);
 629         y11 = *(pIn2 + 3u);
 630
 631         /* multiply and accumlate */
 632         acc0 += (q63_t) x20 *y10;
 633         acc0 += (q63_t) x21 *y11;
 634         acc2 += (q63_t) x10 *y10;
 635         acc2 += (q63_t) x11 *y11;
 636         acc1 += (q63_t) x21 *y10;
 637         acc1 += (q63_t) x10 *y11;
 638
 639         /* Read next two samples from scratch1 buffer */
 640         x20 = *(pScr1 + 2);
 641         x21 = *(pScr1 + 3);
 642
 643         /* multiply and accumlate */
 644         acc3 += (q63_t) x11 *y10;
 645         acc3 += (q63_t) x20 *y11;
 646
 647         /* update scratch pointers */
 648         pIn2 += 4u;
 649         pScr1 += 4u;
 650
 651         /* Decrement the loop counter */
 652         tapCnt--;
 653       }
 654
 655       /* Update scratch pointer for remaining samples of smaller length sequence */
 656       pScr1 -= 4u;
 657
 658       /* apply same above for remaining samples of smaller length sequence */
 659       tapCnt = (srcBLen) & 3u;
 660
 661       while(tapCnt > 0u)
 662       {
 663         /* accumlate the results */
 664         acc0 += (*pScr1++ * *pIn2);
 665         acc1 += (*pScr1++ * *pIn2);
 666         acc2 += (*pScr1++ * *pIn2);
 667         acc3 += (*pScr1++ * *pIn2++);
 668
 669         pScr1 -= 3u;
 670
 671         /* Decrement the loop counter */
 672         tapCnt--;
 673       }
 674
 675       blkCnt--;
 676
 677
 678       /* Store the results in the accumulators in the destination buffer. */
 679       *pOut++ = __SSAT((acc0 >> 15), 16);
 680       *pOut++ = __SSAT((acc1 >> 15), 16);
 681       *pOut++ = __SSAT((acc2 >> 15), 16);
 682       *pOut++ = __SSAT((acc3 >> 15), 16);
 683
 684
 685       /* Initialization of inputB pointer */
 686       pIn2 = py;
 687
 688       pScratch1 += 4u;
 689
 690     }
 691
 692
 693     blkCnt = numPoints & 0x3;
 694
 695     /* Calculate convolution for remaining samples of Bigger length sequence */
 696     while(blkCnt > 0)
 697     {
 698       /* Initialze temporary scratch pointer as scratch1 */
 699       pScr1 = pScratch1;
 700
 701       /* Clear Accumlators */
 702       acc0 = 0;
 703
 704       tapCnt = (srcBLen) >> 1u;
 705
 706       while(tapCnt > 0u)
 707       {
 708
 709         /* Read next two samples from scratch1 buffer */
 710         x10 = *pScr1++;
 711         x11 = *pScr1++;
 712
 713         /* Read two samples from smaller buffer */
 714         y10 = *pIn2++;
 715         y11 = *pIn2++;
 716
 717         /* multiply and accumlate */
 718         acc0 += (q63_t) x10 *y10;
 719         acc0 += (q63_t) x11 *y11;
 720
 721         /* Decrement the loop counter */
 722         tapCnt--;
 723       }
 724
 725       tapCnt = (srcBLen) & 1u;
 726
 727       /* apply same above for remaining samples of smaller length sequence */
 728       while(tapCnt > 0u)
 729       {
 730
 731         /* accumlate the results */
 732         acc0 += (*pScr1++ * *pIn2++);
 733
 734         /* Decrement the loop counter */
 735         tapCnt--;
 736       }
 737
 738       blkCnt--;
 739
 740       /* Store the result in the accumulator in the destination buffer. */
 741       *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
 742
 743
 744       /* Initialization of inputB pointer */
 745       pIn2 = py;
 746
 747       pScratch1 += 1u;
 748
 749     }
 750
 751     /* set status as ARM_MATH_SUCCESS */
 752     status = ARM_MATH_SUCCESS;
 753
 754   }
 755
 756   /* Return to application */
 757   return (status);
 758 }
 759
 760 #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
 761
 762
 763 /**
 764  * @} end of PartialConv group
 765  */