tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_f32.c

   1 /* ----------------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_f32.c
   9 *
  10 * Description:  Convolution of floating-point sequences.
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @defgroup Conv Convolution
  49  *
  50  * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
  51  * Convolution is similar to correlation and is frequently used in filtering and data analysis.
  52  * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
  53  * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
  54  *
  55  * \par Algorithm
  56  * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
  57  * Then the convolution
  58  *
  59  * <pre>
  60  *                   c[n] = a[n] * b[n]
  61  * </pre>
  62  *
  63  * \par
  64  * is defined as
  65  * \image html ConvolutionEquation.gif
  66  * \par
  67  * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
  68  * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
  69  * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
  70  * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
  71  *
  72  * \par
  73  * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
  74  * the signal <code>b[n]</code> slides over <code>a[n]</code>.
  75  * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
  76  *
  77  * \par
  78  * Note that convolution is a commutative operation:
  79  *
  80  * <pre>
  81  *                   a[n] * b[n] = b[n] * a[n].
  82  * </pre>
  83  *
  84  * \par
  85  * This means that switching the A and B arguments to the convolution functions has no effect.
  86  *
  87  * <b>Fixed-Point Behavior</b>
  88  *
  89  * \par
  90  * Convolution requires summing up a large number of intermediate products.
  91  * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
  92  * Refer to the function specific documentation below for further details of the particular algorithm used.
  93  *
  94  *
  95  * <b>Fast Versions</b>
  96  *
  97  * \par
  98  * Fast versions are supported for Q31 and Q15.  Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
  99  * the input signals should be scaled down to avoid intermediate overflows.
 100  *
 101  *
 102  * <b>Opt Versions</b>
 103  *
 104  * \par
 105  * Opt versions are supported for Q15 and Q7.  Design uses internal scratch buffer for getting good optimisation.
 106  * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions
 107  */
 108
 109 /**
 110  * @addtogroup Conv
 111  * @{
 112  */
 113
 114 /**
 115  * @brief Convolution of floating-point sequences.
 116  * @param[in] *pSrcA points to the first input sequence.
 117  * @param[in] srcALen length of the first input sequence.
 118  * @param[in] *pSrcB points to the second input sequence.
 119  * @param[in] srcBLen length of the second input sequence.
 120  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
 121  * @return none.
 122  */
 123
 124 void arm_conv_f32(
 125   float32_t * pSrcA,
 126   uint32_t srcALen,
 127   float32_t * pSrcB,
 128   uint32_t srcBLen,
 129   float32_t * pDst)
 130 {
 131
 132
 133 #ifndef ARM_MATH_CM0_FAMILY
 134
 135   /* Run the below code for Cortex-M4 and Cortex-M3 */
 136
 137   float32_t *pIn1;                               /* inputA pointer */
 138   float32_t *pIn2;                               /* inputB pointer */
 139   float32_t *pOut = pDst;                        /* output pointer */
 140   float32_t *px;                                 /* Intermediate inputA pointer */
 141   float32_t *py;                                 /* Intermediate inputB pointer */
 142   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
 143   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */
 144   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */
 145   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counters */
 146
 147   /* The algorithm implementation is based on the lengths of the inputs. */
 148   /* srcB is always made to slide across srcA. */
 149   /* So srcBLen is always considered as shorter or equal to srcALen */
 150   if(srcALen >= srcBLen)
 151   {
 152     /* Initialization of inputA pointer */
 153     pIn1 = pSrcA;
 154
 155     /* Initialization of inputB pointer */
 156     pIn2 = pSrcB;
 157   }
 158   else
 159   {
 160     /* Initialization of inputA pointer */
 161     pIn1 = pSrcB;
 162
 163     /* Initialization of inputB pointer */
 164     pIn2 = pSrcA;
 165
 166     /* srcBLen is always considered as shorter or equal to srcALen */
 167     j = srcBLen;
 168     srcBLen = srcALen;
 169     srcALen = j;
 170   }
 171
 172   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 173   /* The function is internally
 174    * divided into three stages according to the number of multiplications that has to be
 175    * taken place between inputA samples and inputB samples. In the first stage of the
 176    * algorithm, the multiplications increase by one for every iteration.
 177    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 178    * In the third stage of the algorithm, the multiplications decrease by one
 179    * for every iteration. */
 180
 181   /* The algorithm is implemented in three stages.
 182      The loop counters of each stage is initiated here. */
 183   blockSize1 = srcBLen - 1u;
 184   blockSize2 = srcALen - (srcBLen - 1u);
 185   blockSize3 = blockSize1;
 186
 187   /* --------------------------
 188    * initializations of stage1
 189    * -------------------------*/
 190
 191   /* sum = x[0] * y[0]
 192    * sum = x[0] * y[1] + x[1] * y[0]
 193    * ....
 194    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 195    */
 196
 197   /* In this stage the MAC operations are increased by 1 for every iteration.
 198      The count variable holds the number of MAC operations performed */
 199   count = 1u;
 200
 201   /* Working pointer of inputA */
 202   px = pIn1;
 203
 204   /* Working pointer of inputB */
 205   py = pIn2;
 206
 207
 208   /* ------------------------
 209    * Stage1 process
 210    * ----------------------*/
 211
 212   /* The first stage starts here */
 213   while(blockSize1 > 0u)
 214   {
 215     /* Accumulator is made zero for every iteration */
 216     sum = 0.0f;
 217
 218     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 219     k = count >> 2u;
 220
 221     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 222      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 223     while(k > 0u)
 224     {
 225       /* x[0] * y[srcBLen - 1] */
 226       sum += *px++ * *py--;
 227
 228       /* x[1] * y[srcBLen - 2] */
 229       sum += *px++ * *py--;
 230
 231       /* x[2] * y[srcBLen - 3] */
 232       sum += *px++ * *py--;
 233
 234       /* x[3] * y[srcBLen - 4] */
 235       sum += *px++ * *py--;
 236
 237       /* Decrement the loop counter */
 238       k--;
 239     }
 240
 241     /* If the count is not a multiple of 4, compute any remaining MACs here.
 242      ** No loop unrolling is used. */
 243     k = count % 0x4u;
 244
 245     while(k > 0u)
 246     {
 247       /* Perform the multiply-accumulate */
 248       sum += *px++ * *py--;
 249
 250       /* Decrement the loop counter */
 251       k--;
 252     }
 253
 254     /* Store the result in the accumulator in the destination buffer. */
 255     *pOut++ = sum;
 256
 257     /* Update the inputA and inputB pointers for next MAC calculation */
 258     py = pIn2 + count;
 259     px = pIn1;
 260
 261     /* Increment the MAC count */
 262     count++;
 263
 264     /* Decrement the loop counter */
 265     blockSize1--;
 266   }
 267
 268   /* --------------------------
 269    * Initializations of stage2
 270    * ------------------------*/
 271
 272   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 273    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 274    * ....
 275    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 276    */
 277
 278   /* Working pointer of inputA */
 279   px = pIn1;
 280
 281   /* Working pointer of inputB */
 282   pSrc2 = pIn2 + (srcBLen - 1u);
 283   py = pSrc2;
 284
 285   /* count is index by which the pointer pIn1 to be incremented */
 286   count = 0u;
 287
 288   /* -------------------
 289    * Stage2 process
 290    * ------------------*/
 291
 292   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 293    * So, to loop unroll over blockSize2,
 294    * srcBLen should be greater than or equal to 4 */
 295   if(srcBLen >= 4u)
 296   {
 297     /* Loop unroll over blockSize2, by 4 */
 298     blkCnt = blockSize2 >> 2u;
 299
 300     while(blkCnt > 0u)
 301     {
 302       /* Set all accumulators to zero */
 303       acc0 = 0.0f;
 304       acc1 = 0.0f;
 305       acc2 = 0.0f;
 306       acc3 = 0.0f;
 307
 308       /* read x[0], x[1], x[2] samples */
 309       x0 = *(px++);
 310       x1 = *(px++);
 311       x2 = *(px++);
 312
 313       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 314       k = srcBLen >> 2u;
 315
 316       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 317        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 318       do
 319       {
 320         /* Read y[srcBLen - 1] sample */
 321         c0 = *(py--);
 322
 323         /* Read x[3] sample */
 324         x3 = *(px);
 325
 326         /* Perform the multiply-accumulate */
 327         /* acc0 +=  x[0] * y[srcBLen - 1] */
 328         acc0 += x0 * c0;
 329
 330         /* acc1 +=  x[1] * y[srcBLen - 1] */
 331         acc1 += x1 * c0;
 332
 333         /* acc2 +=  x[2] * y[srcBLen - 1] */
 334         acc2 += x2 * c0;
 335
 336         /* acc3 +=  x[3] * y[srcBLen - 1] */
 337         acc3 += x3 * c0;
 338
 339         /* Read y[srcBLen - 2] sample */
 340         c0 = *(py--);
 341
 342         /* Read x[4] sample */
 343         x0 = *(px + 1u);
 344
 345         /* Perform the multiply-accumulate */
 346         /* acc0 +=  x[1] * y[srcBLen - 2] */
 347         acc0 += x1 * c0;
 348         /* acc1 +=  x[2] * y[srcBLen - 2] */
 349         acc1 += x2 * c0;
 350         /* acc2 +=  x[3] * y[srcBLen - 2] */
 351         acc2 += x3 * c0;
 352         /* acc3 +=  x[4] * y[srcBLen - 2] */
 353         acc3 += x0 * c0;
 354
 355         /* Read y[srcBLen - 3] sample */
 356         c0 = *(py--);
 357
 358         /* Read x[5] sample */
 359         x1 = *(px + 2u);
 360
 361         /* Perform the multiply-accumulates */
 362         /* acc0 +=  x[2] * y[srcBLen - 3] */
 363         acc0 += x2 * c0;
 364         /* acc1 +=  x[3] * y[srcBLen - 2] */
 365         acc1 += x3 * c0;
 366         /* acc2 +=  x[4] * y[srcBLen - 2] */
 367         acc2 += x0 * c0;
 368         /* acc3 +=  x[5] * y[srcBLen - 2] */
 369         acc3 += x1 * c0;
 370
 371         /* Read y[srcBLen - 4] sample */
 372         c0 = *(py--);
 373
 374         /* Read x[6] sample */
 375         x2 = *(px + 3u);
 376         px += 4u;
 377
 378         /* Perform the multiply-accumulates */
 379         /* acc0 +=  x[3] * y[srcBLen - 4] */
 380         acc0 += x3 * c0;
 381         /* acc1 +=  x[4] * y[srcBLen - 4] */
 382         acc1 += x0 * c0;
 383         /* acc2 +=  x[5] * y[srcBLen - 4] */
 384         acc2 += x1 * c0;
 385         /* acc3 +=  x[6] * y[srcBLen - 4] */
 386         acc3 += x2 * c0;
 387
 388
 389       } while(--k);
 390
 391       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 392        ** No loop unrolling is used. */
 393       k = srcBLen % 0x4u;
 394
 395       while(k > 0u)
 396       {
 397         /* Read y[srcBLen - 5] sample */
 398         c0 = *(py--);
 399
 400         /* Read x[7] sample */
 401         x3 = *(px++);
 402
 403         /* Perform the multiply-accumulates */
 404         /* acc0 +=  x[4] * y[srcBLen - 5] */
 405         acc0 += x0 * c0;
 406         /* acc1 +=  x[5] * y[srcBLen - 5] */
 407         acc1 += x1 * c0;
 408         /* acc2 +=  x[6] * y[srcBLen - 5] */
 409         acc2 += x2 * c0;
 410         /* acc3 +=  x[7] * y[srcBLen - 5] */
 411         acc3 += x3 * c0;
 412
 413         /* Reuse the present samples for the next MAC */
 414         x0 = x1;
 415         x1 = x2;
 416         x2 = x3;
 417
 418         /* Decrement the loop counter */
 419         k--;
 420       }
 421
 422       /* Store the result in the accumulator in the destination buffer. */
 423       *pOut++ = acc0;
 424       *pOut++ = acc1;
 425       *pOut++ = acc2;
 426       *pOut++ = acc3;
 427
 428       /* Increment the pointer pIn1 index, count by 4 */
 429       count += 4u;
 430
 431       /* Update the inputA and inputB pointers for next MAC calculation */
 432       px = pIn1 + count;
 433       py = pSrc2;
 434
 435
 436       /* Decrement the loop counter */
 437       blkCnt--;
 438     }
 439
 440
 441     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 442      ** No loop unrolling is used. */
 443     blkCnt = blockSize2 % 0x4u;
 444
 445     while(blkCnt > 0u)
 446     {
 447       /* Accumulator is made zero for every iteration */
 448       sum = 0.0f;
 449
 450       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 451       k = srcBLen >> 2u;
 452
 453       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 454        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 455       while(k > 0u)
 456       {
 457         /* Perform the multiply-accumulates */
 458         sum += *px++ * *py--;
 459         sum += *px++ * *py--;
 460         sum += *px++ * *py--;
 461         sum += *px++ * *py--;
 462
 463         /* Decrement the loop counter */
 464         k--;
 465       }
 466
 467       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 468        ** No loop unrolling is used. */
 469       k = srcBLen % 0x4u;
 470
 471       while(k > 0u)
 472       {
 473         /* Perform the multiply-accumulate */
 474         sum += *px++ * *py--;
 475
 476         /* Decrement the loop counter */
 477         k--;
 478       }
 479
 480       /* Store the result in the accumulator in the destination buffer. */
 481       *pOut++ = sum;
 482
 483       /* Increment the MAC count */
 484       count++;
 485
 486       /* Update the inputA and inputB pointers for next MAC calculation */
 487       px = pIn1 + count;
 488       py = pSrc2;
 489
 490       /* Decrement the loop counter */
 491       blkCnt--;
 492     }
 493   }
 494   else
 495   {
 496     /* If the srcBLen is not a multiple of 4,
 497      * the blockSize2 loop cannot be unrolled by 4 */
 498     blkCnt = blockSize2;
 499
 500     while(blkCnt > 0u)
 501     {
 502       /* Accumulator is made zero for every iteration */
 503       sum = 0.0f;
 504
 505       /* srcBLen number of MACS should be performed */
 506       k = srcBLen;
 507
 508       while(k > 0u)
 509       {
 510         /* Perform the multiply-accumulate */
 511         sum += *px++ * *py--;
 512
 513         /* Decrement the loop counter */
 514         k--;
 515       }
 516
 517       /* Store the result in the accumulator in the destination buffer. */
 518       *pOut++ = sum;
 519
 520       /* Increment the MAC count */
 521       count++;
 522
 523       /* Update the inputA and inputB pointers for next MAC calculation */
 524       px = pIn1 + count;
 525       py = pSrc2;
 526
 527       /* Decrement the loop counter */
 528       blkCnt--;
 529     }
 530   }
 531
 532
 533   /* --------------------------
 534    * Initializations of stage3
 535    * -------------------------*/
 536
 537   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 538    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 539    * ....
 540    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 541    * sum +=  x[srcALen-1] * y[srcBLen-1]
 542    */
 543
 544   /* In this stage the MAC operations are decreased by 1 for every iteration.
 545      The blockSize3 variable holds the number of MAC operations performed */
 546
 547   /* Working pointer of inputA */
 548   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 549   px = pSrc1;
 550
 551   /* Working pointer of inputB */
 552   pSrc2 = pIn2 + (srcBLen - 1u);
 553   py = pSrc2;
 554
 555   /* -------------------
 556    * Stage3 process
 557    * ------------------*/
 558
 559   while(blockSize3 > 0u)
 560   {
 561     /* Accumulator is made zero for every iteration */
 562     sum = 0.0f;
 563
 564     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 565     k = blockSize3 >> 2u;
 566
 567     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 568      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 569     while(k > 0u)
 570     {
 571       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
 572       sum += *px++ * *py--;
 573
 574       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
 575       sum += *px++ * *py--;
 576
 577       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
 578       sum += *px++ * *py--;
 579
 580       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
 581       sum += *px++ * *py--;
 582
 583       /* Decrement the loop counter */
 584       k--;
 585     }
 586
 587     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 588      ** No loop unrolling is used. */
 589     k = blockSize3 % 0x4u;
 590
 591     while(k > 0u)
 592     {
 593       /* Perform the multiply-accumulates */
 594       /* sum +=  x[srcALen-1] * y[srcBLen-1] */
 595       sum += *px++ * *py--;
 596
 597       /* Decrement the loop counter */
 598       k--;
 599     }
 600
 601     /* Store the result in the accumulator in the destination buffer. */
 602     *pOut++ = sum;
 603
 604     /* Update the inputA and inputB pointers for next MAC calculation */
 605     px = ++pSrc1;
 606     py = pSrc2;
 607
 608     /* Decrement the loop counter */
 609     blockSize3--;
 610   }
 611
 612 #else
 613
 614   /* Run the below code for Cortex-M0 */
 615
 616   float32_t *pIn1 = pSrcA;                       /* inputA pointer */
 617   float32_t *pIn2 = pSrcB;                       /* inputB pointer */
 618   float32_t sum;                                 /* Accumulator */
 619   uint32_t i, j;                                 /* loop counters */
 620
 621   /* Loop to calculate convolution for output length number of times */
 622   for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
 623   {
 624     /* Initialize sum with zero to carry out MAC operations */
 625     sum = 0.0f;
 626
 627     /* Loop to perform MAC operations according to convolution equation */
 628     for (j = 0u; j <= i; j++)
 629     {
 630       /* Check the array limitations */
 631       if((((i - j) < srcBLen) && (j < srcALen)))
 632       {
 633         /* z[i] += x[i-j] * y[j] */
 634         sum += pIn1[j] * pIn2[i - j];
 635       }
 636     }
 637     /* Store the output in the destination buffer */
 638     pDst[i] = sum;
 639   }
 640
 641 #endif /*   #ifndef ARM_MATH_CM0_FAMILY        */
 642
 643 }
 644
 645 /**
 646  * @} end of Conv group
 647  */