tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_f32.c

   1 /* ----------------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_partial_f32.c
   9 *
  10 * Description:  Partial convolution of floating-point sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @defgroup PartialConv Partial Convolution
  49  *
  50  * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
  51  * Each function has two additional arguments.
  52  * <code>firstIndex</code> specifies the starting index of the subset of output samples.
  53  * <code>numPoints</code> is the number of output samples to compute.
  54  * The function computes the output in the range
  55  * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.
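  56  * The functions compute <code>numPoints</code> output values. arm_conv_partial_f32() stores them in <code>pDst[firstIndex]</code> through <code>pDst[firstIndex+numPoints-1]</code>, so <code>pDst</code> must hold at least <code>firstIndex+numPoints</code> elements.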
  57  *
  58  * The allowable range of output indices is [0 srcALen+srcBLen-2].
  59  * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
  60  * Otherwise the functions return ARM_MATH_SUCCESS.
  61  * \note Refer to arm_conv_f32() for details on fixed-point behavior.
  62  *
  63  *
  64  * <b>Fast Versions</b>
  65  *
  66  * \par
  67  * Fast versions of partial convolution are provided for Q31 and Q15.  They take fewer cycles than the standard Q31 and Q15 versions, but the design requires
  68  * the input signals to be scaled down to avoid intermediate overflows.
  69  *
  70  *
  71  * <b>Opt Versions</b>
  72  *
  73  * \par
  74  * Opt versions are provided for Q15 and Q7.  They use an internal scratch buffer to reduce the cycle count,
  75  * and therefore consume more memory (the scratch memory) than the standard Q15 and Q7 versions of partial convolution.
  76  */
  77
  78 /**
  79  * @addtogroup PartialConv
  80  * @{
  81  */
  82
  83 /**
  84  * @brief Partial convolution of floating-point sequences.
  85  * @param[in]       *pSrcA points to the first input sequence.
  86  * @param[in]       srcALen length of the first input sequence.
  87  * @param[in]       *pSrcB points to the second input sequence.
  88  * @param[in]       srcBLen length of the second input sequence.
  89  * @param[out]      *pDst points to the location where the output result is written.
  90  * @param[in]       firstIndex is the first output sample to start with.
  91  * @param[in]       numPoints is the number of output points to be computed.
  92  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
  93  */
  94
  95 arm_status arm_conv_partial_f32(
  96   float32_t * pSrcA,
  97   uint32_t srcALen,
  98   float32_t * pSrcB,
  99   uint32_t srcBLen,
 100   float32_t * pDst,
 101   uint32_t firstIndex,
 102   uint32_t numPoints)
 103 {
 104
 105
 106 #ifndef ARM_MATH_CM0_FAMILY
 107
 108   /* Run the below code for Cortex-M4 and Cortex-M3 */
 109
 110   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
 111   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
 112   float32_t *pOut = pDst;                        /* output pointer */
 113   float32_t *px;                                 /* Intermediate inputA pointer */
 114   float32_t *py;                                 /* Intermediate inputB pointer */
 115   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
 116   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */
 117   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
 118   uint32_t j, k, count = 0u, blkCnt, check;
 119   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters */
 120   arm_status status;                             /* status of Partial convolution */
 121
 122
 123   /* Check for range of output samples to be calculated */
 124   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 125   {
 126     /* Set status as ARM_MATH_ARGUMENT_ERROR */
 127     status = ARM_MATH_ARGUMENT_ERROR;
 128   }
 129   else
 130   {
 131
 132     /* The algorithm implementation is based on the lengths of the inputs. */
 133     /* srcB is always made to slide across srcA. */
 134     /* So srcBLen is always considered as shorter or equal to srcALen */
 135     if(srcALen >= srcBLen)
 136     {
 137       /* Initialization of inputA pointer */
 138       pIn1 = pSrcA;
 139
 140       /* Initialization of inputB pointer */
 141       pIn2 = pSrcB;
 142     }
 143     else
 144     {
 145       /* Initialization of inputA pointer */
 146       pIn1 = pSrcB;
 147
 148       /* Initialization of inputB pointer */
 149       pIn2 = pSrcA;
 150
 151       /* srcBLen is always considered as shorter or equal to srcALen */
 152       j = srcBLen;
 153       srcBLen = srcALen;
 154       srcALen = j;
 155     }
 156
 157     /* Conditions to check which loopCounter holds
 158      * the first and last indices of the output samples to be calculated. */
 159     check = firstIndex + numPoints;
 160     blockSize3 = (int32_t) check - (int32_t) srcALen;
 161     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
 162     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
 163     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
 164                                      (int32_t) numPoints) : 0;
 165     blockSize2 = ((int32_t) check - blockSize3) -
 166       (blockSize1 + (int32_t) firstIndex);
 167     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
 168
 169     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 170     /* The function is internally
 171      * divided into three stages according to the number of multiplications that has to be
 172      * taken place between inputA samples and inputB samples. In the first stage of the
 173      * algorithm, the multiplications increase by one for every iteration.
 174      * In the second stage of the algorithm, srcBLen number of multiplications are done.
 175      * In the third stage of the algorithm, the multiplications decrease by one
 176      * for every iteration. */
 177
 178     /* Set the output pointer to point to the firstIndex
 179      * of the output sample to be calculated. */
 180     pOut = pDst + firstIndex;
 181
 182     /* --------------------------
 183      * Initializations of stage1
 184      * -------------------------*/
 185
 186     /* sum = x[0] * y[0]
 187      * sum = x[0] * y[1] + x[1] * y[0]
 188      * ....
 189      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 190      */
 191
 192     /* In this stage the MAC operations are increased by 1 for every iteration.
 193        The count variable holds the number of MAC operations performed.
 194        Since the partial convolution starts from from firstIndex
 195        Number of Macs to be performed is firstIndex + 1 */
 196     count = 1u + firstIndex;
 197
 198     /* Working pointer of inputA */
 199     px = pIn1;
 200
 201     /* Working pointer of inputB */
 202     pSrc1 = pIn2 + firstIndex;
 203     py = pSrc1;
 204
 205     /* ------------------------
 206      * Stage1 process
 207      * ----------------------*/
 208
 209     /* The first stage starts here */
 210     while(blockSize1 > 0)
 211     {
 212       /* Accumulator is made zero for every iteration */
 213       sum = 0.0f;
 214
 215       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 216       k = count >> 2u;
 217
 218       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 219        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 220       while(k > 0u)
 221       {
 222         /* x[0] * y[srcBLen - 1] */
 223         sum += *px++ * *py--;
 224
 225         /* x[1] * y[srcBLen - 2] */
 226         sum += *px++ * *py--;
 227
 228         /* x[2] * y[srcBLen - 3] */
 229         sum += *px++ * *py--;
 230
 231         /* x[3] * y[srcBLen - 4] */
 232         sum += *px++ * *py--;
 233
 234         /* Decrement the loop counter */
 235         k--;
 236       }
 237
 238       /* If the count is not a multiple of 4, compute any remaining MACs here.
 239        ** No loop unrolling is used. */
 240       k = count % 0x4u;
 241
 242       while(k > 0u)
 243       {
 244         /* Perform the multiply-accumulates */
 245         sum += *px++ * *py--;
 246
 247         /* Decrement the loop counter */
 248         k--;
 249       }
 250
 251       /* Store the result in the accumulator in the destination buffer. */
 252       *pOut++ = sum;
 253
 254       /* Update the inputA and inputB pointers for next MAC calculation */
 255       py = ++pSrc1;
 256       px = pIn1;
 257
 258       /* Increment the MAC count */
 259       count++;
 260
 261       /* Decrement the loop counter */
 262       blockSize1--;
 263     }
 264
 265     /* --------------------------
 266      * Initializations of stage2
 267      * ------------------------*/
 268
 269     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 270      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 271      * ....
 272      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 273      */
 274
 275     /* Working pointer of inputA */
 276     px = pIn1;
 277
 278     /* Working pointer of inputB */
 279     pSrc2 = pIn2 + (srcBLen - 1u);
 280     py = pSrc2;
 281
 282     /* count is index by which the pointer pIn1 to be incremented */
 283     count = 0u;
 284
 285     /* -------------------
 286      * Stage2 process
 287      * ------------------*/
 288
 289     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 290      * So, to loop unroll over blockSize2,
 291      * srcBLen should be greater than or equal to 4 */
 292     if(srcBLen >= 4u)
 293     {
 294       /* Loop unroll over blockSize2, by 4 */
 295       blkCnt = ((uint32_t) blockSize2 >> 2u);
 296
 297       while(blkCnt > 0u)
 298       {
 299         /* Set all accumulators to zero */
 300         acc0 = 0.0f;
 301         acc1 = 0.0f;
 302         acc2 = 0.0f;
 303         acc3 = 0.0f;
 304
 305         /* read x[0], x[1], x[2] samples */
 306         x0 = *(px++);
 307         x1 = *(px++);
 308         x2 = *(px++);
 309
 310         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 311         k = srcBLen >> 2u;
 312
 313         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 314          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 315         do
 316         {
 317           /* Read y[srcBLen - 1] sample */
 318           c0 = *(py--);
 319
 320           /* Read x[3] sample */
 321           x3 = *(px++);
 322
 323           /* Perform the multiply-accumulate */
 324           /* acc0 +=  x[0] * y[srcBLen - 1] */
 325           acc0 += x0 * c0;
 326
 327           /* acc1 +=  x[1] * y[srcBLen - 1] */
 328           acc1 += x1 * c0;
 329
 330           /* acc2 +=  x[2] * y[srcBLen - 1] */
 331           acc2 += x2 * c0;
 332
 333           /* acc3 +=  x[3] * y[srcBLen - 1] */
 334           acc3 += x3 * c0;
 335
 336           /* Read y[srcBLen - 2] sample */
 337           c0 = *(py--);
 338
 339           /* Read x[4] sample */
 340           x0 = *(px++);
 341
 342           /* Perform the multiply-accumulate */
 343           /* acc0 +=  x[1] * y[srcBLen - 2] */
 344           acc0 += x1 * c0;
 345           /* acc1 +=  x[2] * y[srcBLen - 2] */
 346           acc1 += x2 * c0;
 347           /* acc2 +=  x[3] * y[srcBLen - 2] */
 348           acc2 += x3 * c0;
 349           /* acc3 +=  x[4] * y[srcBLen - 2] */
 350           acc3 += x0 * c0;
 351
 352           /* Read y[srcBLen - 3] sample */
 353           c0 = *(py--);
 354
 355           /* Read x[5] sample */
 356           x1 = *(px++);
 357
 358           /* Perform the multiply-accumulates */
 359           /* acc0 +=  x[2] * y[srcBLen - 3] */
 360           acc0 += x2 * c0;
 361           /* acc1 +=  x[3] * y[srcBLen - 2] */
 362           acc1 += x3 * c0;
 363           /* acc2 +=  x[4] * y[srcBLen - 2] */
 364           acc2 += x0 * c0;
 365           /* acc3 +=  x[5] * y[srcBLen - 2] */
 366           acc3 += x1 * c0;
 367
 368           /* Read y[srcBLen - 4] sample */
 369           c0 = *(py--);
 370
 371           /* Read x[6] sample */
 372           x2 = *(px++);
 373
 374           /* Perform the multiply-accumulates */
 375           /* acc0 +=  x[3] * y[srcBLen - 4] */
 376           acc0 += x3 * c0;
 377           /* acc1 +=  x[4] * y[srcBLen - 4] */
 378           acc1 += x0 * c0;
 379           /* acc2 +=  x[5] * y[srcBLen - 4] */
 380           acc2 += x1 * c0;
 381           /* acc3 +=  x[6] * y[srcBLen - 4] */
 382           acc3 += x2 * c0;
 383
 384
 385         } while(--k);
 386
 387         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 388          ** No loop unrolling is used. */
 389         k = srcBLen % 0x4u;
 390
 391         while(k > 0u)
 392         {
 393           /* Read y[srcBLen - 5] sample */
 394           c0 = *(py--);
 395
 396           /* Read x[7] sample */
 397           x3 = *(px++);
 398
 399           /* Perform the multiply-accumulates */
 400           /* acc0 +=  x[4] * y[srcBLen - 5] */
 401           acc0 += x0 * c0;
 402           /* acc1 +=  x[5] * y[srcBLen - 5] */
 403           acc1 += x1 * c0;
 404           /* acc2 +=  x[6] * y[srcBLen - 5] */
 405           acc2 += x2 * c0;
 406           /* acc3 +=  x[7] * y[srcBLen - 5] */
 407           acc3 += x3 * c0;
 408
 409           /* Reuse the present samples for the next MAC */
 410           x0 = x1;
 411           x1 = x2;
 412           x2 = x3;
 413
 414           /* Decrement the loop counter */
 415           k--;
 416         }
 417
 418         /* Store the result in the accumulator in the destination buffer. */
 419         *pOut++ = acc0;
 420         *pOut++ = acc1;
 421         *pOut++ = acc2;
 422         *pOut++ = acc3;
 423
 424         /* Increment the pointer pIn1 index, count by 1 */
 425         count += 4u;
 426
 427         /* Update the inputA and inputB pointers for next MAC calculation */
 428         px = pIn1 + count;
 429         py = pSrc2;
 430
 431         /* Decrement the loop counter */
 432         blkCnt--;
 433       }
 434
 435       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 436        ** No loop unrolling is used. */
 437       blkCnt = (uint32_t) blockSize2 % 0x4u;
 438
 439       while(blkCnt > 0u)
 440       {
 441         /* Accumulator is made zero for every iteration */
 442         sum = 0.0f;
 443
 444         /* Apply loop unrolling and compute 4 MACs simultaneously. */
 445         k = srcBLen >> 2u;
 446
 447         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 448          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 449         while(k > 0u)
 450         {
 451           /* Perform the multiply-accumulates */
 452           sum += *px++ * *py--;
 453           sum += *px++ * *py--;
 454           sum += *px++ * *py--;
 455           sum += *px++ * *py--;
 456
 457           /* Decrement the loop counter */
 458           k--;
 459         }
 460
 461         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 462          ** No loop unrolling is used. */
 463         k = srcBLen % 0x4u;
 464
 465         while(k > 0u)
 466         {
 467           /* Perform the multiply-accumulate */
 468           sum += *px++ * *py--;
 469
 470           /* Decrement the loop counter */
 471           k--;
 472         }
 473
 474         /* Store the result in the accumulator in the destination buffer. */
 475         *pOut++ = sum;
 476
 477         /* Increment the MAC count */
 478         count++;
 479
 480         /* Update the inputA and inputB pointers for next MAC calculation */
 481         px = pIn1 + count;
 482         py = pSrc2;
 483
 484         /* Decrement the loop counter */
 485         blkCnt--;
 486       }
 487     }
 488     else
 489     {
 490       /* If the srcBLen is not a multiple of 4,
 491        * the blockSize2 loop cannot be unrolled by 4 */
 492       blkCnt = (uint32_t) blockSize2;
 493
 494       while(blkCnt > 0u)
 495       {
 496         /* Accumulator is made zero for every iteration */
 497         sum = 0.0f;
 498
 499         /* srcBLen number of MACS should be performed */
 500         k = srcBLen;
 501
 502         while(k > 0u)
 503         {
 504           /* Perform the multiply-accumulate */
 505           sum += *px++ * *py--;
 506
 507           /* Decrement the loop counter */
 508           k--;
 509         }
 510
 511         /* Store the result in the accumulator in the destination buffer. */
 512         *pOut++ = sum;
 513
 514         /* Increment the MAC count */
 515         count++;
 516
 517         /* Update the inputA and inputB pointers for next MAC calculation */
 518         px = pIn1 + count;
 519         py = pSrc2;
 520
 521         /* Decrement the loop counter */
 522         blkCnt--;
 523       }
 524     }
 525
 526
 527     /* --------------------------
 528      * Initializations of stage3
 529      * -------------------------*/
 530
 531     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 532      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 533      * ....
 534      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 535      * sum +=  x[srcALen-1] * y[srcBLen-1]
 536      */
 537
 538     /* In this stage the MAC operations are decreased by 1 for every iteration.
 539        The count variable holds the number of MAC operations performed */
 540     count = srcBLen - 1u;
 541
 542     /* Working pointer of inputA */
 543     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 544     px = pSrc1;
 545
 546     /* Working pointer of inputB */
 547     pSrc2 = pIn2 + (srcBLen - 1u);
 548     py = pSrc2;
 549
 550     while(blockSize3 > 0)
 551     {
 552       /* Accumulator is made zero for every iteration */
 553       sum = 0.0f;
 554
 555       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 556       k = count >> 2u;
 557
 558       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 559        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 560       while(k > 0u)
 561       {
 562         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
 563         sum += *px++ * *py--;
 564
 565         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
 566         sum += *px++ * *py--;
 567
 568         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
 569         sum += *px++ * *py--;
 570
 571         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
 572         sum += *px++ * *py--;
 573
 574         /* Decrement the loop counter */
 575         k--;
 576       }
 577
 578       /* If the count is not a multiple of 4, compute any remaining MACs here.
 579        ** No loop unrolling is used. */
 580       k = count % 0x4u;
 581
 582       while(k > 0u)
 583       {
 584         /* Perform the multiply-accumulates */
 585         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 586         sum += *px++ * *py--;
 587
 588         /* Decrement the loop counter */
 589         k--;
 590       }
 591
 592       /* Store the result in the accumulator in the destination buffer. */
 593       *pOut++ = sum;
 594
 595       /* Update the inputA and inputB pointers for next MAC calculation */
 596       px = ++pSrc1;
 597       py = pSrc2;
 598
 599       /* Decrement the MAC count */
 600       count--;
 601
 602       /* Decrement the loop counter */
 603       blockSize3--;
 604
 605     }
 606
 607     /* set status as ARM_MATH_SUCCESS */
 608     status = ARM_MATH_SUCCESS;
 609   }
 610
 611   /* Return to application */
 612   return (status);
 613
 614 #else
 615
 616   /* Run the below code for Cortex-M0 */
 617
 618   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
 619   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
 620   float32_t sum;                                 /* Accumulator */
 621   uint32_t i, j;                                 /* loop counters */
 622   arm_status status;                             /* status of Partial convolution */
 623
 624   /* Check for range of output samples to be calculated */
 625   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
 626   {
 627     /* Set status as ARM_ARGUMENT_ERROR */
 628     status = ARM_MATH_ARGUMENT_ERROR;
 629   }
 630   else
 631   {
 632     /* Loop to calculate convolution for output length number of values */
 633     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
 634     {
 635       /* Initialize sum with zero to carry on MAC operations */
 636       sum = 0.0f;
 637
 638       /* Loop to perform MAC operations according to convolution equation */
 639       for (j = 0u; j <= i; j++)
 640       {
 641         /* Check the array limitations for inputs */
 642         if((((i - j) < srcBLen) && (j < srcALen)))
 643         {
 644           /* z[i] += x[i-j] * y[j] */
 645           sum += pIn1[j] * pIn2[i - j];
 646         }
 647       }
 648       /* Store the output in the destination buffer */
 649       pDst[i] = sum;
 650     }
 651     /* set status as ARM_SUCCESS as there are no argument errors */
 652     status = ARM_MATH_SUCCESS;
 653   }
 654   return (status);
 655
 656 #endif /*   #ifndef ARM_MATH_CM0_FAMILY */
 657
 658 }
 659
 660 /**
 661  * @} end of PartialConv group
 662  */