tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_fast_q31.c
   9 *
  10 * Description:  Fast Q31 Partial convolution.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup PartialConv
  49  * @{
  50  */
  51
  52 /**
  53  * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
  54  * @param[in]       *pSrcA points to the first input sequence.
  55  * @param[in]       srcALen length of the first input sequence.
  56  * @param[in]       *pSrcB points to the second input sequence.
  57  * @param[in]       srcBLen length of the second input sequence.
  58  * @param[out]      *pDst points to the location where the output result is written.
  59  * @param[in]       firstIndex is the first output sample to start with.
  60  * @param[in]       numPoints is the number of output points to be computed.
  61  * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  62  *
  63  * \par
  64  * See <code>arm_conv_partial_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
  65  */
  66
  67 arm_status arm_conv_partial_fast_q31(
  68   q31_t * pSrcA,
  69   uint32_t srcALen,
  70   q31_t * pSrcB,
  71   uint32_t srcBLen,
  72   q31_t * pDst,
  73   uint32_t firstIndex,
  74   uint32_t numPoints)
  75 {
  76   q31_t *pIn1;                                   /* inputA pointer               */
  77   q31_t *pIn2;                                   /* inputB pointer               */
  78   q31_t *pOut = pDst;                            /* output pointer               */
  79   q31_t *px;                                     /* Intermediate inputA pointer  */
  80   q31_t *py;                                     /* Intermediate inputB pointer  */
  81   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */
  82   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */
  83   q31_t x0, x1, x2, x3, c0;
  84   uint32_t j, k, count, check, blkCnt;
  85   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
  86   arm_status status;                             /* status of Partial convolution */
  87
  88
  89   /* Check for range of output samples to be calculated */
  90   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  91   {
  92     /* Set status as ARM_MATH_ARGUMENT_ERROR */
  93     status = ARM_MATH_ARGUMENT_ERROR;
  94   }
  95   else
  96   {
  97
  98     /* The algorithm implementation is based on the lengths of the inputs. */
  99     /* srcB is always made to slide across srcA. */
 100     /* So srcBLen is always considered as shorter or equal to srcALen */
 101     if(srcALen >= srcBLen)
 102     {
 103       /* Initialization of inputA pointer */
 104       pIn1 = pSrcA;
 105
 106       /* Initialization of inputB pointer */
 107       pIn2 = pSrcB;
 108     }
 109     else
 110     {
 111       /* Initialization of inputA pointer */
 112       pIn1 = pSrcB;
 113
 114       /* Initialization of inputB pointer */
 115       pIn2 = pSrcA;
 116
 117       /* srcBLen is always considered as shorter or equal to srcALen */
 118       j = srcBLen;
 119       srcBLen = srcALen;
 120       srcALen = j;
 121     }
 122
 123     /* Conditions to check which loopCounter holds
 124      * the first and last indices of the output samples to be calculated. */
 125     check = firstIndex + numPoints;
 126     blockSize3 = ((int32_t) check - (int32_t) srcALen);
 127     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
 128     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
 129     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 130                                      (int32_t) numPoints) : 0;
 131     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
 132                                     (int32_t) firstIndex);
 133     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 134
 135     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 136     /* The function is internally
 137      * divided into three stages according to the number of multiplications that has to be
 138      * taken place between inputA samples and inputB samples. In the first stage of the
 139      * algorithm, the multiplications increase by one for every iteration.
 140      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 141      * In the third stage of the algorithm, the multiplications decrease by one
 142      * for every iteration. */
 143
 144     /* Set the output pointer to point to the firstIndex
 145      * of the output sample to be calculated. */
 146     pOut = pDst + firstIndex;
 147
 148     /* --------------------------
 149      * Initializations of stage1
 150      * -------------------------*/
 151
 152     /* sum = x[0] * y[0]
 153      * sum = x[0] * y[1] + x[1] * y[0]
 154      * ....
 155      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 156      */
 157
 158     /* In this stage the MAC operations are increased by 1 for every iteration.
 159        The count variable holds the number of MAC operations performed.
 160        Since the partial convolution starts from firstIndex
 161        Number of Macs to be performed is firstIndex + 1 */
 162     count = 1u + firstIndex;
 163
 164     /* Working pointer of inputA */
 165     px = pIn1;
 166
 167     /* Working pointer of inputB */
 168     pSrc2 = pIn2 + firstIndex;
 169     py = pSrc2;
 170
 171     /* ------------------------
 172      * Stage1 process
 173      * ----------------------*/
 174
 175     /* The first loop starts here */
 176     while(blockSize1 > 0)
 177     {
 178       /* Accumulator is made zero for every iteration */
 179       sum = 0;
 180
 181       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 182       k = count >> 2u;
 183
 184       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 185        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 186       while(k > 0u)
 187       {
 188         /* x[0] * y[srcBLen - 1] */
 189         sum = (q31_t) ((((q63_t) sum << 32) +
 190                         ((q63_t) * px++ * (*py--))) >> 32);
 191
 192         /* x[1] * y[srcBLen - 2] */
 193         sum = (q31_t) ((((q63_t) sum << 32) +
 194                         ((q63_t) * px++ * (*py--))) >> 32);
 195
 196         /* x[2] * y[srcBLen - 3] */
 197         sum = (q31_t) ((((q63_t) sum << 32) +
 198                         ((q63_t) * px++ * (*py--))) >> 32);
 199
 200         /* x[3] * y[srcBLen - 4] */
 201         sum = (q31_t) ((((q63_t) sum << 32) +
 202                         ((q63_t) * px++ * (*py--))) >> 32);
 203
 204         /* Decrement the loop counter */
 205         k--;
 206       }
 207
 208       /* If the count is not a multiple of 4, compute any remaining MACs here.
 209        ** No loop unrolling is used. */
 210       k = count % 0x4u;
 211
 212       while(k > 0u)
 213       {
 214         /* Perform the multiply-accumulates */
 215         sum = (q31_t) ((((q63_t) sum << 32) +
 216                         ((q63_t) * px++ * (*py--))) >> 32);
 217
 218         /* Decrement the loop counter */
 219         k--;
 220       }
 221
 222       /* Store the result in the accumulator in the destination buffer. */
 223       *pOut++ = sum << 1;
 224
 225       /* Update the inputA and inputB pointers for next MAC calculation */
 226       py = ++pSrc2;
 227       px = pIn1;
 228
 229       /* Increment the MAC count */
 230       count++;
 231
 232       /* Decrement the loop counter */
 233       blockSize1--;
 234     }
 235
 236     /* --------------------------
 237      * Initializations of stage2
 238      * ------------------------*/
 239
 240     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 241      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 242      * ....
 243      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 244      */
 245
 246     /* Working pointer of inputA */
 247     px = pIn1;
 248
 249     /* Working pointer of inputB */
 250     pSrc2 = pIn2 + (srcBLen - 1u);
 251     py = pSrc2;
 252
 253     /* count is index by which the pointer pIn1 to be incremented */
 254     count = 0u;
 255
 256     /* -------------------
 257      * Stage2 process
 258      * ------------------*/
 259
 260     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 261      * So, to loop unroll over blockSize2,
 262      * srcBLen should be greater than or equal to 4 */
 263     if(srcBLen >= 4u)
 264     {
 265       /* Loop unroll over blockSize2 */
 266       blkCnt = ((uint32_t) blockSize2 >> 2u);
 267
 268       while(blkCnt > 0u)
 269       {
 270         /* Set all accumulators to zero */
 271         acc0 = 0;
 272         acc1 = 0;
 273         acc2 = 0;
 274         acc3 = 0;
 275
 276         /* read x[0], x[1], x[2] samples */
 277         x0 = *(px++);
 278         x1 = *(px++);
 279         x2 = *(px++);
 280
 281         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 282         k = srcBLen >> 2u;
 283
 284         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 285          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 286         do
 287         {
 288           /* Read y[srcBLen - 1] sample */
 289           c0 = *(py--);
 290
 291           /* Read x[3] sample */
 292           x3 = *(px++);
 293
 294           /* Perform the multiply-accumulate */
 295           /* acc0 +=  x[0] * y[srcBLen - 1] */
 296           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 297
 298           /* acc1 +=  x[1] * y[srcBLen - 1] */
 299           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 300
 301           /* acc2 +=  x[2] * y[srcBLen - 1] */
 302           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 303
 304           /* acc3 +=  x[3] * y[srcBLen - 1] */
 305           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 306
 307           /* Read y[srcBLen - 2] sample */
 308           c0 = *(py--);
 309
 310           /* Read x[4] sample */
 311           x0 = *(px++);
 312
 313           /* Perform the multiply-accumulate */
 314           /* acc0 +=  x[1] * y[srcBLen - 2] */
 315           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
 316           /* acc1 +=  x[2] * y[srcBLen - 2] */
 317           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
 318           /* acc2 +=  x[3] * y[srcBLen - 2] */
 319           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
 320           /* acc3 +=  x[4] * y[srcBLen - 2] */
 321           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
 322
 323           /* Read y[srcBLen - 3] sample */
 324           c0 = *(py--);
 325
 326           /* Read x[5] sample */
 327           x1 = *(px++);
 328
 329           /* Perform the multiply-accumulates */
 330           /* acc0 +=  x[2] * y[srcBLen - 3] */
 331           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
 332           /* acc1 +=  x[3] * y[srcBLen - 2] */
 333           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
 334           /* acc2 +=  x[4] * y[srcBLen - 2] */
 335           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
 336           /* acc3 +=  x[5] * y[srcBLen - 2] */
 337           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
 338
 339           /* Read y[srcBLen - 4] sample */
 340           c0 = *(py--);
 341
 342           /* Read x[6] sample */
 343           x2 = *(px++);
 344
 345           /* Perform the multiply-accumulates */
 346           /* acc0 +=  x[3] * y[srcBLen - 4] */
 347           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
 348           /* acc1 +=  x[4] * y[srcBLen - 4] */
 349           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
 350           /* acc2 +=  x[5] * y[srcBLen - 4] */
 351           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
 352           /* acc3 +=  x[6] * y[srcBLen - 4] */
 353           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
 354
 355
 356         } while(--k);
 357
 358         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 359          ** No loop unrolling is used. */
 360         k = srcBLen % 0x4u;
 361
 362         while(k > 0u)
 363         {
 364           /* Read y[srcBLen - 5] sample */
 365           c0 = *(py--);
 366
 367           /* Read x[7] sample */
 368           x3 = *(px++);
 369
 370           /* Perform the multiply-accumulates */
 371           /* acc0 +=  x[4] * y[srcBLen - 5] */
 372           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 373           /* acc1 +=  x[5] * y[srcBLen - 5] */
 374           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 375           /* acc2 +=  x[6] * y[srcBLen - 5] */
 376           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 377           /* acc3 +=  x[7] * y[srcBLen - 5] */
 378           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 379
 380           /* Reuse the present samples for the next MAC */
 381           x0 = x1;
 382           x1 = x2;
 383           x2 = x3;
 384
 385           /* Decrement the loop counter */
 386           k--;
 387         }
 388
 389         /* Store the result in the accumulator in the destination buffer. */
 390         *pOut++ = (q31_t) (acc0 << 1);
 391         *pOut++ = (q31_t) (acc1 << 1);
 392         *pOut++ = (q31_t) (acc2 << 1);
 393         *pOut++ = (q31_t) (acc3 << 1);
 394
 395         /* Increment the pointer pIn1 index, count by 4 */
 396         count += 4u;
 397
 398         /* Update the inputA and inputB pointers for next MAC calculation */
 399         px = pIn1 + count;
 400         py = pSrc2;
 401
 402         /* Decrement the loop counter */
 403         blkCnt--;
 404       }
 405
 406       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 407        ** No loop unrolling is used. */
 408       blkCnt = (uint32_t) blockSize2 % 0x4u;
 409
 410       while(blkCnt > 0u)
 411       {
 412         /* Accumulator is made zero for every iteration */
 413         sum = 0;
 414
 415         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 416         k = srcBLen >> 2u;
 417
 418         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 419          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 420         while(k > 0u)
 421         {
 422           /* Perform the multiply-accumulates */
 423           sum = (q31_t) ((((q63_t) sum << 32) +
 424                           ((q63_t) * px++ * (*py--))) >> 32);
 425           sum = (q31_t) ((((q63_t) sum << 32) +
 426                           ((q63_t) * px++ * (*py--))) >> 32);
 427           sum = (q31_t) ((((q63_t) sum << 32) +
 428                           ((q63_t) * px++ * (*py--))) >> 32);
 429           sum = (q31_t) ((((q63_t) sum << 32) +
 430                           ((q63_t) * px++ * (*py--))) >> 32);
 431
 432           /* Decrement the loop counter */
 433           k--;
 434         }
 435
 436         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 437          ** No loop unrolling is used. */
 438         k = srcBLen % 0x4u;
 439
 440         while(k > 0u)
 441         {
 442           /* Perform the multiply-accumulate */
 443           sum = (q31_t) ((((q63_t) sum << 32) +
 444                           ((q63_t) * px++ * (*py--))) >> 32);
 445
 446           /* Decrement the loop counter */
 447           k--;
 448         }
 449
 450         /* Store the result in the accumulator in the destination buffer. */
 451         *pOut++ = sum << 1;
 452
 453         /* Increment the MAC count */
 454         count++;
 455
 456         /* Update the inputA and inputB pointers for next MAC calculation */
 457         px = pIn1 + count;
 458         py = pSrc2;
 459
 460         /* Decrement the loop counter */
 461         blkCnt--;
 462       }
 463     }
 464     else
 465     {
 466       /* If the srcBLen is not a multiple of 4,
 467        * the blockSize2 loop cannot be unrolled by 4 */
 468       blkCnt = (uint32_t) blockSize2;
 469
 470       while(blkCnt > 0u)
 471       {
 472         /* Accumulator is made zero for every iteration */
 473         sum = 0;
 474
 475         /* srcBLen number of MACS should be performed */
 476         k = srcBLen;
 477
 478         while(k > 0u)
 479         {
 480           /* Perform the multiply-accumulate */
 481           sum = (q31_t) ((((q63_t) sum << 32) +
 482                           ((q63_t) * px++ * (*py--))) >> 32);
 483
 484           /* Decrement the loop counter */
 485           k--;
 486         }
 487
 488         /* Store the result in the accumulator in the destination buffer. */
 489         *pOut++ = sum << 1;
 490
 491         /* Increment the MAC count */
 492         count++;
 493
 494         /* Update the inputA and inputB pointers for next MAC calculation */
 495         px = pIn1 + count;
 496         py = pSrc2;
 497
 498         /* Decrement the loop counter */
 499         blkCnt--;
 500       }
 501     }
 502
 503
 504     /* --------------------------
 505      * Initializations of stage3
 506      * -------------------------*/
 507
 508     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 509      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 510      * ....
 511      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 512      * sum +=  x[srcALen-1] * y[srcBLen-1]
 513      */
 514
 515     /* In this stage the MAC operations are decreased by 1 for every iteration.
 516        The count variable holds the number of MAC operations performed */
 517     count = srcBLen - 1u;
 518
 519     /* Working pointer of inputA */
 520     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 521     px = pSrc1;
 522
 523     /* Working pointer of inputB */
 524     pSrc2 = pIn2 + (srcBLen - 1u);
 525     py = pSrc2;
 526
 527     /* -------------------
 528      * Stage3 process
 529      * ------------------*/
 530
 531     while(blockSize3 > 0)
 532     {
 533       /* Accumulator is made zero for every iteration */
 534       sum = 0;
 535
 536       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 537       k = count >> 2u;
 538
 539       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 540        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 541       while(k > 0u)
 542       {
 543         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
 544         sum = (q31_t) ((((q63_t) sum << 32) +
 545                         ((q63_t) * px++ * (*py--))) >> 32);
 546
 547         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
 548         sum = (q31_t) ((((q63_t) sum << 32) +
 549                         ((q63_t) * px++ * (*py--))) >> 32);
 550
 551         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
 552         sum = (q31_t) ((((q63_t) sum << 32) +
 553                         ((q63_t) * px++ * (*py--))) >> 32);
 554
 555         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
 556         sum = (q31_t) ((((q63_t) sum << 32) +
 557                         ((q63_t) * px++ * (*py--))) >> 32);
 558
 559         /* Decrement the loop counter */
 560         k--;
 561       }
 562
 563       /* If the count is not a multiple of 4, compute any remaining MACs here.
 564        ** No loop unrolling is used. */
 565       k = count % 0x4u;
 566
 567       while(k > 0u)
 568       {
 569         /* Perform the multiply-accumulates */
 570         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 571         sum = (q31_t) ((((q63_t) sum << 32) +
 572                         ((q63_t) * px++ * (*py--))) >> 32);
 573
 574         /* Decrement the loop counter */
 575         k--;
 576       }
 577
 578       /* Store the result in the accumulator in the destination buffer. */
 579       *pOut++ = sum << 1;
 580
 581       /* Update the inputA and inputB pointers for next MAC calculation */
 582       px = ++pSrc1;
 583       py = pSrc2;
 584
 585       /* Decrement the MAC count */
 586       count--;
 587
 588       /* Decrement the loop counter */
 589       blockSize3--;
 590
 591     }
 592
 593     /* set status as ARM_MATH_SUCCESS */
 594     status = ARM_MATH_SUCCESS;
 595   }
 596
 597   /* Return to application */
 598   return (status);
 599
 600 }
 601
 602 /**
 603  * @} end of PartialConv group
 604  */