tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_opt_q7.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_opt_q7.c
   9 *
  10 * Description:  Partial convolution of Q7 sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup PartialConv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Partial convolution of Q7 sequences.
  54  * @param[in]       *pSrcA points to the first input sequence.
  55  * @param[in]       srcALen length of the first input sequence.
  56  * @param[in]       *pSrcB points to the second input sequence.
  57  * @param[in]       srcBLen length of the second input sequence.
  58  * @param[out]      *pDst points to the location where the output result is written.
  59  * @param[in]       firstIndex is the first output sample to start with.
  60  * @param[in]       numPoints is the number of output points to be computed.
  61  * @param[in]      *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
  62  * @param[in]      *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
  63  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  64  *
  65  * \par Restrictions
  66  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
  67  *  In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
  68  *
  69  *
  70  *
  71  */
  72
  73
  74 #ifndef UNALIGNED_SUPPORT_DISABLE
  75
  76 arm_status arm_conv_partial_opt_q7(
  77   q7_t * pSrcA,
  78   uint32_t srcALen,
  79   q7_t * pSrcB,
  80   uint32_t srcBLen,
  81   q7_t * pDst,
  82   uint32_t firstIndex,
  83   uint32_t numPoints,
  84   q15_t * pScratch1,
  85   q15_t * pScratch2)
  86 {
  87
  88   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
  89   q15_t x4;                                      /* Temporary input variable */
  90   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
  91   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
  92   q7_t *px;                                      /* Temporary input1 pointer */
  93   q15_t *py;                                     /* Temporary input2 pointer */
  94   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
  95   q31_t x1, x2, x3, y1;                          /* Temporary input variables */
  96   arm_status status;
  97   q7_t *pOut = pDst;                             /* output pointer */
  98   q7_t out0, out1, out2, out3;                   /* temporary variables */
  99
 100   /* Check for range of output samples to be calculated */
 101   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 102   {
 103     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 104     status = ARM_MATH_ARGUMENT_ERROR;
 105   }
 106   else
 107   {
 108
 109     /* The algorithm implementation is based on the lengths of the inputs. */
 110     /* srcB is always made to slide across srcA. */
 111     /* So srcBLen is always considered as shorter or equal to srcALen */
 112     if(srcALen >= srcBLen)
 113     {
 114       /* Initialization of inputA pointer */
 115       pIn1 = pSrcA;
 116
 117       /* Initialization of inputB pointer */
 118       pIn2 = pSrcB;
 119     }
 120     else
 121     {
 122       /* Initialization of inputA pointer */
 123       pIn1 = pSrcB;
 124
 125       /* Initialization of inputB pointer */
 126       pIn2 = pSrcA;
 127
 128       /* srcBLen is always considered as shorter or equal to srcALen */
 129       j = srcBLen;
 130       srcBLen = srcALen;
 131       srcALen = j;
 132     }
 133
 134     /* pointer to take end of scratch2 buffer */
 135     pScr2 = pScratch2;
 136
 137     /* points to smaller length sequence */
 138     px = pIn2 + srcBLen - 1;
 139
 140     /* Apply loop unrolling and do 4 Copies simultaneously. */
 141     k = srcBLen >> 2u;
 142
 143     /* First part of the processing with loop unrolling copies 4 data points at a time.
 144      ** a second loop below copies for the remaining 1 to 3 samples. */
 145     while(k > 0u)
 146     {
 147       /* copy second buffer in reversal manner */
 148       x4 = (q15_t) * px--;
 149       *pScr2++ = x4;
 150       x4 = (q15_t) * px--;
 151       *pScr2++ = x4;
 152       x4 = (q15_t) * px--;
 153       *pScr2++ = x4;
 154       x4 = (q15_t) * px--;
 155       *pScr2++ = x4;
 156
 157       /* Decrement the loop counter */
 158       k--;
 159     }
 160
 161     /* If the count is not a multiple of 4, copy remaining samples here.
 162      ** No loop unrolling is used. */
 163     k = srcBLen % 0x4u;
 164
 165     while(k > 0u)
 166     {
 167       /* copy second buffer in reversal manner for remaining samples */
 168       x4 = (q15_t) * px--;
 169       *pScr2++ = x4;
 170
 171       /* Decrement the loop counter */
 172       k--;
 173     }
 174
 175     /* Initialze temporary scratch pointer */
 176     pScr1 = pScratch1;
 177
 178     /* Fill (srcBLen - 1u) zeros in scratch buffer */
 179     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 180
 181     /* Update temporary scratch pointer */
 182     pScr1 += (srcBLen - 1u);
 183
 184     /* Copy (srcALen) samples in scratch buffer */
 185     /* Apply loop unrolling and do 4 Copies simultaneously. */
 186     k = srcALen >> 2u;
 187
 188     /* First part of the processing with loop unrolling copies 4 data points at a time.
 189      ** a second loop below copies for the remaining 1 to 3 samples. */
 190     while(k > 0u)
 191     {
 192       /* copy second buffer in reversal manner */
 193       x4 = (q15_t) * pIn1++;
 194       *pScr1++ = x4;
 195       x4 = (q15_t) * pIn1++;
 196       *pScr1++ = x4;
 197       x4 = (q15_t) * pIn1++;
 198       *pScr1++ = x4;
 199       x4 = (q15_t) * pIn1++;
 200       *pScr1++ = x4;
 201
 202       /* Decrement the loop counter */
 203       k--;
 204     }
 205
 206     /* If the count is not a multiple of 4, copy remaining samples here.
 207      ** No loop unrolling is used. */
 208     k = srcALen % 0x4u;
 209
 210     while(k > 0u)
 211     {
 212       /* copy second buffer in reversal manner for remaining samples */
 213       x4 = (q15_t) * pIn1++;
 214       *pScr1++ = x4;
 215
 216       /* Decrement the loop counter */
 217       k--;
 218     }
 219
 220     /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
 221     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 222
 223     /* Update pointer */
 224     pScr1 += (srcBLen - 1u);
 225
 226
 227     /* Temporary pointer for scratch2 */
 228     py = pScratch2;
 229
 230     /* Initialization of pIn2 pointer */
 231     pIn2 = (q7_t *) py;
 232
 233     pScr2 = py;
 234
 235     pOut = pDst + firstIndex;
 236
 237     pScratch1 += firstIndex;
 238
 239     /* Actual convolution process starts here */
 240     blkCnt = (numPoints) >> 2;
 241
 242
 243     while(blkCnt > 0)
 244     {
 245       /* Initialze temporary scratch pointer as scratch1 */
 246       pScr1 = pScratch1;
 247
 248       /* Clear Accumlators */
 249       acc0 = 0;
 250       acc1 = 0;
 251       acc2 = 0;
 252       acc3 = 0;
 253
 254       /* Read two samples from scratch1 buffer */
 255       x1 = *__SIMD32(pScr1)++;
 256
 257       /* Read next two samples from scratch1 buffer */
 258       x2 = *__SIMD32(pScr1)++;
 259
 260       tapCnt = (srcBLen) >> 2u;
 261
 262       while(tapCnt > 0u)
 263       {
 264
 265         /* Read four samples from smaller buffer */
 266         y1 = _SIMD32_OFFSET(pScr2);
 267
 268         /* multiply and accumlate */
 269         acc0 = __SMLAD(x1, y1, acc0);
 270         acc2 = __SMLAD(x2, y1, acc2);
 271
 272         /* pack input data */
 273 #ifndef ARM_MATH_BIG_ENDIAN
 274         x3 = __PKHBT(x2, x1, 0);
 275 #else
 276         x3 = __PKHBT(x1, x2, 0);
 277 #endif
 278
 279         /* multiply and accumlate */
 280         acc1 = __SMLADX(x3, y1, acc1);
 281
 282         /* Read next two samples from scratch1 buffer */
 283         x1 = *__SIMD32(pScr1)++;
 284
 285         /* pack input data */
 286 #ifndef ARM_MATH_BIG_ENDIAN
 287         x3 = __PKHBT(x1, x2, 0);
 288 #else
 289         x3 = __PKHBT(x2, x1, 0);
 290 #endif
 291
 292         acc3 = __SMLADX(x3, y1, acc3);
 293
 294         /* Read four samples from smaller buffer */
 295         y1 = _SIMD32_OFFSET(pScr2 + 2u);
 296
 297         acc0 = __SMLAD(x2, y1, acc0);
 298
 299         acc2 = __SMLAD(x1, y1, acc2);
 300
 301         acc1 = __SMLADX(x3, y1, acc1);
 302
 303         x2 = *__SIMD32(pScr1)++;
 304
 305 #ifndef ARM_MATH_BIG_ENDIAN
 306         x3 = __PKHBT(x2, x1, 0);
 307 #else
 308         x3 = __PKHBT(x1, x2, 0);
 309 #endif
 310
 311         acc3 = __SMLADX(x3, y1, acc3);
 312
 313         pScr2 += 4u;
 314
 315
 316         /* Decrement the loop counter */
 317         tapCnt--;
 318       }
 319
 320
 321
 322       /* Update scratch pointer for remaining samples of smaller length sequence */
 323       pScr1 -= 4u;
 324
 325
 326       /* apply same above for remaining samples of smaller length sequence */
 327       tapCnt = (srcBLen) & 3u;
 328
 329       while(tapCnt > 0u)
 330       {
 331
 332         /* accumlate the results */
 333         acc0 += (*pScr1++ * *pScr2);
 334         acc1 += (*pScr1++ * *pScr2);
 335         acc2 += (*pScr1++ * *pScr2);
 336         acc3 += (*pScr1++ * *pScr2++);
 337
 338         pScr1 -= 3u;
 339
 340         /* Decrement the loop counter */
 341         tapCnt--;
 342       }
 343
 344       blkCnt--;
 345
 346       /* Store the result in the accumulator in the destination buffer. */
 347       out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
 348       out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
 349       out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
 350       out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
 351
 352       *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
 353
 354       /* Initialization of inputB pointer */
 355       pScr2 = py;
 356
 357       pScratch1 += 4u;
 358
 359     }
 360
 361     blkCnt = (numPoints) & 0x3;
 362
 363     /* Calculate convolution for remaining samples of Bigger length sequence */
 364     while(blkCnt > 0)
 365     {
 366       /* Initialze temporary scratch pointer as scratch1 */
 367       pScr1 = pScratch1;
 368
 369       /* Clear Accumlators */
 370       acc0 = 0;
 371
 372       tapCnt = (srcBLen) >> 1u;
 373
 374       while(tapCnt > 0u)
 375       {
 376
 377         /* Read next two samples from scratch1 buffer */
 378         x1 = *__SIMD32(pScr1)++;
 379
 380         /* Read two samples from smaller buffer */
 381         y1 = *__SIMD32(pScr2)++;
 382
 383         acc0 = __SMLAD(x1, y1, acc0);
 384
 385         /* Decrement the loop counter */
 386         tapCnt--;
 387       }
 388
 389       tapCnt = (srcBLen) & 1u;
 390
 391       /* apply same above for remaining samples of smaller length sequence */
 392       while(tapCnt > 0u)
 393       {
 394
 395         /* accumlate the results */
 396         acc0 += (*pScr1++ * *pScr2++);
 397
 398         /* Decrement the loop counter */
 399         tapCnt--;
 400       }
 401
 402       blkCnt--;
 403
 404       /* Store the result in the accumulator in the destination buffer. */
 405       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
 406
 407       /* Initialization of inputB pointer */
 408       pScr2 = py;
 409
 410       pScratch1 += 1u;
 411
 412     }
 413
 414     /* set status as ARM_MATH_SUCCESS */
 415     status = ARM_MATH_SUCCESS;
 416
 417
 418   }
 419
 420   return (status);
 421
 422 }
 423
 424 #else
 425
 426 arm_status arm_conv_partial_opt_q7(
 427   q7_t * pSrcA,
 428   uint32_t srcALen,
 429   q7_t * pSrcB,
 430   uint32_t srcBLen,
 431   q7_t * pDst,
 432   uint32_t firstIndex,
 433   uint32_t numPoints,
 434   q15_t * pScratch1,
 435   q15_t * pScratch2)
 436 {
 437
 438   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
 439   q15_t x4;                                      /* Temporary input variable */
 440   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
 441   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
 442   q7_t *px;                                      /* Temporary input1 pointer */
 443   q15_t *py;                                     /* Temporary input2 pointer */
 444   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
 445   arm_status status;
 446   q7_t *pOut = pDst;                             /* output pointer */
 447   q15_t x10, x11, x20, x21;                      /* Temporary input variables */
 448   q15_t y10, y11;                                /* Temporary input variables */
 449   q7_t out0, out1, out2, out3;                   /* temporary variables */
 450
 451   /* Check for range of output samples to be calculated */
 452   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 453   {
 454     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 455     status = ARM_MATH_ARGUMENT_ERROR;
 456   }
 457   else
 458   {
 459
 460     /* The algorithm implementation is based on the lengths of the inputs. */
 461     /* srcB is always made to slide across srcA. */
 462     /* So srcBLen is always considered as shorter or equal to srcALen */
 463     if(srcALen >= srcBLen)
 464     {
 465       /* Initialization of inputA pointer */
 466       pIn1 = pSrcA;
 467
 468       /* Initialization of inputB pointer */
 469       pIn2 = pSrcB;
 470     }
 471     else
 472     {
 473       /* Initialization of inputA pointer */
 474       pIn1 = pSrcB;
 475
 476       /* Initialization of inputB pointer */
 477       pIn2 = pSrcA;
 478
 479       /* srcBLen is always considered as shorter or equal to srcALen */
 480       j = srcBLen;
 481       srcBLen = srcALen;
 482       srcALen = j;
 483     }
 484
 485     /* pointer to take end of scratch2 buffer */
 486     pScr2 = pScratch2;
 487
 488     /* points to smaller length sequence */
 489     px = pIn2 + srcBLen - 1;
 490
 491     /* Apply loop unrolling and do 4 Copies simultaneously. */
 492     k = srcBLen >> 2u;
 493
 494     /* First part of the processing with loop unrolling copies 4 data points at a time.
 495      ** a second loop below copies for the remaining 1 to 3 samples. */
 496     while(k > 0u)
 497     {
 498       /* copy second buffer in reversal manner */
 499       x4 = (q15_t) * px--;
 500       *pScr2++ = x4;
 501       x4 = (q15_t) * px--;
 502       *pScr2++ = x4;
 503       x4 = (q15_t) * px--;
 504       *pScr2++ = x4;
 505       x4 = (q15_t) * px--;
 506       *pScr2++ = x4;
 507
 508       /* Decrement the loop counter */
 509       k--;
 510     }
 511
 512     /* If the count is not a multiple of 4, copy remaining samples here.
 513      ** No loop unrolling is used. */
 514     k = srcBLen % 0x4u;
 515
 516     while(k > 0u)
 517     {
 518       /* copy second buffer in reversal manner for remaining samples */
 519       x4 = (q15_t) * px--;
 520       *pScr2++ = x4;
 521
 522       /* Decrement the loop counter */
 523       k--;
 524     }
 525
 526     /* Initialze temporary scratch pointer */
 527     pScr1 = pScratch1;
 528
 529     /* Fill (srcBLen - 1u) zeros in scratch buffer */
 530     arm_fill_q15(0, pScr1, (srcBLen - 1u));
 531
 532     /* Update temporary scratch pointer */
 533     pScr1 += (srcBLen - 1u);
 534
 535     /* Copy (srcALen) samples in scratch buffer */
 536     /* Apply loop unrolling and do 4 Copies simultaneously. */
 537     k = srcALen >> 2u;
 538
 539     /* First part of the processing with loop unrolling copies 4 data points at a time.
 540      ** a second loop below copies for the remaining 1 to 3 samples. */
 541     while(k > 0u)
 542     {
 543       /* copy second buffer in reversal manner */
 544       x4 = (q15_t) * pIn1++;
 545       *pScr1++ = x4;
 546       x4 = (q15_t) * pIn1++;
 547       *pScr1++ = x4;
 548       x4 = (q15_t) * pIn1++;
 549       *pScr1++ = x4;
 550       x4 = (q15_t) * pIn1++;
 551       *pScr1++ = x4;
 552
 553       /* Decrement the loop counter */
 554       k--;
 555     }
 556
 557     /* If the count is not a multiple of 4, copy remaining samples here.
 558      ** No loop unrolling is used. */
 559     k = srcALen % 0x4u;
 560
 561     while(k > 0u)
 562     {
 563       /* copy second buffer in reversal manner for remaining samples */
 564       x4 = (q15_t) * pIn1++;
 565       *pScr1++ = x4;
 566
 567       /* Decrement the loop counter */
 568       k--;
 569     }
 570
 571     /* Apply loop unrolling and do 4 Copies simultaneously. */
 572     k = (srcBLen - 1u) >> 2u;
 573
 574     /* First part of the processing with loop unrolling copies 4 data points at a time.
 575      ** a second loop below copies for the remaining 1 to 3 samples. */
 576     while(k > 0u)
 577     {
 578       /* copy second buffer in reversal manner */
 579       *pScr1++ = 0;
 580       *pScr1++ = 0;
 581       *pScr1++ = 0;
 582       *pScr1++ = 0;
 583
 584       /* Decrement the loop counter */
 585       k--;
 586     }
 587
 588     /* If the count is not a multiple of 4, copy remaining samples here.
 589      ** No loop unrolling is used. */
 590     k = (srcBLen - 1u) % 0x4u;
 591
 592     while(k > 0u)
 593     {
 594       /* copy second buffer in reversal manner for remaining samples */
 595       *pScr1++ = 0;
 596
 597       /* Decrement the loop counter */
 598       k--;
 599     }
 600
 601
 602     /* Temporary pointer for scratch2 */
 603     py = pScratch2;
 604
 605     /* Initialization of pIn2 pointer */
 606     pIn2 = (q7_t *) py;
 607
 608     pScr2 = py;
 609
 610     pOut = pDst + firstIndex;
 611
 612     pScratch1 += firstIndex;
 613
 614     /* Actual convolution process starts here */
 615     blkCnt = (numPoints) >> 2;
 616
 617
 618     while(blkCnt > 0)
 619     {
 620       /* Initialze temporary scratch pointer as scratch1 */
 621       pScr1 = pScratch1;
 622
 623       /* Clear Accumlators */
 624       acc0 = 0;
 625       acc1 = 0;
 626       acc2 = 0;
 627       acc3 = 0;
 628
 629       /* Read two samples from scratch1 buffer */
 630       x10 = *pScr1++;
 631       x11 = *pScr1++;
 632
 633       /* Read next two samples from scratch1 buffer */
 634       x20 = *pScr1++;
 635       x21 = *pScr1++;
 636
 637       tapCnt = (srcBLen) >> 2u;
 638
 639       while(tapCnt > 0u)
 640       {
 641
 642         /* Read four samples from smaller buffer */
 643         y10 = *pScr2;
 644         y11 = *(pScr2 + 1u);
 645
 646         /* multiply and accumlate */
 647         acc0 += (q31_t) x10 *y10;
 648         acc0 += (q31_t) x11 *y11;
 649         acc2 += (q31_t) x20 *y10;
 650         acc2 += (q31_t) x21 *y11;
 651
 652
 653         acc1 += (q31_t) x11 *y10;
 654         acc1 += (q31_t) x20 *y11;
 655
 656         /* Read next two samples from scratch1 buffer */
 657         x10 = *pScr1;
 658         x11 = *(pScr1 + 1u);
 659
 660         /* multiply and accumlate */
 661         acc3 += (q31_t) x21 *y10;
 662         acc3 += (q31_t) x10 *y11;
 663
 664         /* Read next two samples from scratch2 buffer */
 665         y10 = *(pScr2 + 2u);
 666         y11 = *(pScr2 + 3u);
 667
 668         /* multiply and accumlate */
 669         acc0 += (q31_t) x20 *y10;
 670         acc0 += (q31_t) x21 *y11;
 671         acc2 += (q31_t) x10 *y10;
 672         acc2 += (q31_t) x11 *y11;
 673         acc1 += (q31_t) x21 *y10;
 674         acc1 += (q31_t) x10 *y11;
 675
 676         /* Read next two samples from scratch1 buffer */
 677         x20 = *(pScr1 + 2);
 678         x21 = *(pScr1 + 3);
 679
 680         /* multiply and accumlate */
 681         acc3 += (q31_t) x11 *y10;
 682         acc3 += (q31_t) x20 *y11;
 683
 684         /* update scratch pointers */
 685
 686         pScr1 += 4u;
 687         pScr2 += 4u;
 688
 689         /* Decrement the loop counter */
 690         tapCnt--;
 691       }
 692
 693
 694
 695       /* Update scratch pointer for remaining samples of smaller length sequence */
 696       pScr1 -= 4u;
 697
 698
 699       /* apply same above for remaining samples of smaller length sequence */
 700       tapCnt = (srcBLen) & 3u;
 701
 702       while(tapCnt > 0u)
 703       {
 704
 705         /* accumlate the results */
 706         acc0 += (*pScr1++ * *pScr2);
 707         acc1 += (*pScr1++ * *pScr2);
 708         acc2 += (*pScr1++ * *pScr2);
 709         acc3 += (*pScr1++ * *pScr2++);
 710
 711         pScr1 -= 3u;
 712
 713         /* Decrement the loop counter */
 714         tapCnt--;
 715       }
 716
 717       blkCnt--;
 718
 719       /* Store the result in the accumulator in the destination buffer. */
 720       out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
 721       out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
 722       out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
 723       out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
 724
 725
 726       *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
 727
 728       /* Initialization of inputB pointer */
 729       pScr2 = py;
 730
 731       pScratch1 += 4u;
 732
 733     }
 734
 735     blkCnt = (numPoints) & 0x3;
 736
 737     /* Calculate convolution for remaining samples of Bigger length sequence */
 738     while(blkCnt > 0)
 739     {
 740       /* Initialze temporary scratch pointer as scratch1 */
 741       pScr1 = pScratch1;
 742
 743       /* Clear Accumlators */
 744       acc0 = 0;
 745
 746       tapCnt = (srcBLen) >> 1u;
 747
 748       while(tapCnt > 0u)
 749       {
 750
 751         /* Read next two samples from scratch1 buffer */
 752         x10 = *pScr1++;
 753         x11 = *pScr1++;
 754
 755         /* Read two samples from smaller buffer */
 756         y10 = *pScr2++;
 757         y11 = *pScr2++;
 758
 759         /* multiply and accumlate */
 760         acc0 += (q31_t) x10 *y10;
 761         acc0 += (q31_t) x11 *y11;
 762
 763         /* Decrement the loop counter */
 764         tapCnt--;
 765       }
 766
 767       tapCnt = (srcBLen) & 1u;
 768
 769       /* apply same above for remaining samples of smaller length sequence */
 770       while(tapCnt > 0u)
 771       {
 772
 773         /* accumlate the results */
 774         acc0 += (*pScr1++ * *pScr2++);
 775
 776         /* Decrement the loop counter */
 777         tapCnt--;
 778       }
 779
 780       blkCnt--;
 781
 782       /* Store the result in the accumulator in the destination buffer. */
 783       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
 784
 785       /* Initialization of inputB pointer */
 786       pScr2 = py;
 787
 788       pScratch1 += 1u;
 789
 790     }
 791
 792     /* set status as ARM_MATH_SUCCESS */
 793     status = ARM_MATH_SUCCESS;
 794
 795   }
 796
 797   return (status);
 798
 799 }
 800
 801 #endif  /*      #ifndef UNALIGNED_SUPPORT_DISABLE       */
 802
 803
 804
 805 /**
 806  * @} end of PartialConv group
 807  */