tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_fast_q31.c

   1 /* ----------------------------------------------------------------------
   2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
   3 *
   4 * $Date:        17. January 2013
   5 * $Revision:    V1.4.1
   6 *
   7 * Project:          CMSIS DSP Library
   8 * Title:                arm_conv_fast_q31.c
   9 *
  10 * Description:  Q31 Convolution (fast version).
  11 *
  12 * Target Processor: Cortex-M4/Cortex-M3
  13 *
  14 * Redistribution and use in source and binary forms, with or without
  15 * modification, are permitted provided that the following conditions
  16 * are met:
  17 *   - Redistributions of source code must retain the above copyright
  18 *     notice, this list of conditions and the following disclaimer.
  19 *   - Redistributions in binary form must reproduce the above copyright
  20 *     notice, this list of conditions and the following disclaimer in
  21 *     the documentation and/or other materials provided with the
  22 *     distribution.
  23 *   - Neither the name of ARM LIMITED nor the names of its contributors
  24 *     may be used to endorse or promote products derived from this
  25 *     software without specific prior written permission.
  26 *
  27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 * POSSIBILITY OF SUCH DAMAGE.
  39 * -------------------------------------------------------------------- */
  40
  41 #include "arm_math.h"
  42
  43 /**
  44  * @ingroup groupFilters
  45  */
  46
  47 /**
  48  * @addtogroup Conv
  49  * @{
  50  */
  51
  52 /**
  53  * @param[in] *pSrcA points to the first input sequence.
  54  * @param[in] srcALen length of the first input sequence.
  55  * @param[in] *pSrcB points to the second input sequence.
  56  * @param[in] srcBLen length of the second input sequence.
  57  * @param[out] *pDst points to the location where the output result is written.  Length srcALen+srcBLen-1.
  58  * @return none.
  59  *
  60  * @details
  61  * <b>Scaling and Overflow Behavior:</b>
  62  *
  63  * \par
  64  * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
  65  * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
  66  * These intermediate results are accumulated in a 32-bit register in 2.30 format.
  67  * Finally, the accumulator is saturated and converted to a 1.31 result.
  68  *
  69  * \par
  70  * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
  71  * In order to avoid overflows completely the input signals must be scaled down.
  72  * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
  73  * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
  74  *
  75  * \par
  76  * See <code>arm_conv_q31()</code> for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
  77  */
  78
  79 void arm_conv_fast_q31(
  80   q31_t * pSrcA,
  81   uint32_t srcALen,
  82   q31_t * pSrcB,
  83   uint32_t srcBLen,
  84   q31_t * pDst)
  85 {
  86   q31_t *pIn1;                                   /* inputA pointer */
  87   q31_t *pIn2;                                   /* inputB pointer */
  88   q31_t *pOut = pDst;                            /* output pointer */
  89   q31_t *px;                                     /* Intermediate inputA pointer  */
  90   q31_t *py;                                     /* Intermediate inputB pointer  */
  91   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
  92   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
  93   q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
  94   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */
  95
  96   /* The algorithm implementation is based on the lengths of the inputs. */
  97   /* srcB is always made to slide across srcA. */
  98   /* So srcBLen is always considered as shorter or equal to srcALen */
  99   if(srcALen >= srcBLen)
 100   {
 101     /* Initialization of inputA pointer */
 102     pIn1 = pSrcA;
 103
 104     /* Initialization of inputB pointer */
 105     pIn2 = pSrcB;
 106   }
 107   else
 108   {
 109     /* Initialization of inputA pointer */
 110     pIn1 = pSrcB;
 111
 112     /* Initialization of inputB pointer */
 113     pIn2 = pSrcA;
 114
 115     /* srcBLen is always considered as shorter or equal to srcALen */
 116     j = srcBLen;
 117     srcBLen = srcALen;
 118     srcALen = j;
 119   }
 120
 121   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
 122   /* The function is internally
 123    * divided into three stages according to the number of multiplications that has to be
 124    * taken place between inputA samples and inputB samples. In the first stage of the
 125    * algorithm, the multiplications increase by one for every iteration.
 126    * In the second stage of the algorithm, srcBLen number of multiplications are done.
 127    * In the third stage of the algorithm, the multiplications decrease by one
 128    * for every iteration. */
 129
 130   /* The algorithm is implemented in three stages.
 131      The loop counters of each stage is initiated here. */
 132   blockSize1 = srcBLen - 1u;
 133   blockSize2 = srcALen - (srcBLen - 1u);
 134   blockSize3 = blockSize1;
 135
 136   /* --------------------------
 137    * Initializations of stage1
 138    * -------------------------*/
 139
 140   /* sum = x[0] * y[0]
 141    * sum = x[0] * y[1] + x[1] * y[0]
 142    * ....
 143    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
 144    */
 145
 146   /* In this stage the MAC operations are increased by 1 for every iteration.
 147      The count variable holds the number of MAC operations performed */
 148   count = 1u;
 149
 150   /* Working pointer of inputA */
 151   px = pIn1;
 152
 153   /* Working pointer of inputB */
 154   py = pIn2;
 155
 156
 157   /* ------------------------
 158    * Stage1 process
 159    * ----------------------*/
 160
 161   /* The first stage starts here */
 162   while(blockSize1 > 0u)
 163   {
 164     /* Accumulator is made zero for every iteration */
 165     sum = 0;
 166
 167     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 168     k = count >> 2u;
 169
 170     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 171      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 172     while(k > 0u)
 173     {
 174       /* x[0] * y[srcBLen - 1] */
 175       sum = (q31_t) ((((q63_t) sum << 32) +
 176                       ((q63_t) * px++ * (*py--))) >> 32);
 177
 178       /* x[1] * y[srcBLen - 2] */
 179       sum = (q31_t) ((((q63_t) sum << 32) +
 180                       ((q63_t) * px++ * (*py--))) >> 32);
 181
 182       /* x[2] * y[srcBLen - 3] */
 183       sum = (q31_t) ((((q63_t) sum << 32) +
 184                       ((q63_t) * px++ * (*py--))) >> 32);
 185
 186       /* x[3] * y[srcBLen - 4] */
 187       sum = (q31_t) ((((q63_t) sum << 32) +
 188                       ((q63_t) * px++ * (*py--))) >> 32);
 189
 190       /* Decrement the loop counter */
 191       k--;
 192     }
 193
 194     /* If the count is not a multiple of 4, compute any remaining MACs here.
 195      ** No loop unrolling is used. */
 196     k = count % 0x4u;
 197
 198     while(k > 0u)
 199     {
 200       /* Perform the multiply-accumulate */
 201       sum = (q31_t) ((((q63_t) sum << 32) +
 202                       ((q63_t) * px++ * (*py--))) >> 32);
 203
 204       /* Decrement the loop counter */
 205       k--;
 206     }
 207
 208     /* Store the result in the accumulator in the destination buffer. */
 209     *pOut++ = sum << 1;
 210
 211     /* Update the inputA and inputB pointers for next MAC calculation */
 212     py = pIn2 + count;
 213     px = pIn1;
 214
 215     /* Increment the MAC count */
 216     count++;
 217
 218     /* Decrement the loop counter */
 219     blockSize1--;
 220   }
 221
 222   /* --------------------------
 223    * Initializations of stage2
 224    * ------------------------*/
 225
 226   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
 227    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
 228    * ....
 229    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
 230    */
 231
 232   /* Working pointer of inputA */
 233   px = pIn1;
 234
 235   /* Working pointer of inputB */
 236   pSrc2 = pIn2 + (srcBLen - 1u);
 237   py = pSrc2;
 238
 239   /* count is index by which the pointer pIn1 to be incremented */
 240   count = 0u;
 241
 242   /* -------------------
 243    * Stage2 process
 244    * ------------------*/
 245
 246   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
 247    * So, to loop unroll over blockSize2,
 248    * srcBLen should be greater than or equal to 4 */
 249   if(srcBLen >= 4u)
 250   {
 251     /* Loop unroll over blockSize2, by 4 */
 252     blkCnt = blockSize2 >> 2u;
 253
 254     while(blkCnt > 0u)
 255     {
 256       /* Set all accumulators to zero */
 257       acc0 = 0;
 258       acc1 = 0;
 259       acc2 = 0;
 260       acc3 = 0;
 261
 262       /* read x[0], x[1], x[2] samples */
 263       x0 = *(px++);
 264       x1 = *(px++);
 265       x2 = *(px++);
 266
 267       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 268       k = srcBLen >> 2u;
 269
 270       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 271        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 272       do
 273       {
 274         /* Read y[srcBLen - 1] sample */
 275         c0 = *(py--);
 276
 277         /* Read x[3] sample */
 278         x3 = *(px++);
 279
 280         /* Perform the multiply-accumulates */
 281         /* acc0 +=  x[0] * y[srcBLen - 1] */
 282         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 283
 284         /* acc1 +=  x[1] * y[srcBLen - 1] */
 285         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 286
 287         /* acc2 +=  x[2] * y[srcBLen - 1] */
 288         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 289
 290         /* acc3 +=  x[3] * y[srcBLen - 1] */
 291         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 292
 293         /* Read y[srcBLen - 2] sample */
 294         c0 = *(py--);
 295
 296         /* Read x[4] sample */
 297         x0 = *(px++);
 298
 299         /* Perform the multiply-accumulate */
 300         /* acc0 +=  x[1] * y[srcBLen - 2] */
 301         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
 302         /* acc1 +=  x[2] * y[srcBLen - 2] */
 303         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
 304         /* acc2 +=  x[3] * y[srcBLen - 2] */
 305         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
 306         /* acc3 +=  x[4] * y[srcBLen - 2] */
 307         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
 308
 309         /* Read y[srcBLen - 3] sample */
 310         c0 = *(py--);
 311
 312         /* Read x[5] sample */
 313         x1 = *(px++);
 314
 315         /* Perform the multiply-accumulates */
 316         /* acc0 +=  x[2] * y[srcBLen - 3] */
 317         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
 318         /* acc1 +=  x[3] * y[srcBLen - 3] */
 319         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
 320         /* acc2 +=  x[4] * y[srcBLen - 3] */
 321         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
 322         /* acc3 +=  x[5] * y[srcBLen - 3] */
 323         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
 324
 325         /* Read y[srcBLen - 4] sample */
 326         c0 = *(py--);
 327
 328         /* Read x[6] sample */
 329         x2 = *(px++);
 330
 331         /* Perform the multiply-accumulates */
 332         /* acc0 +=  x[3] * y[srcBLen - 4] */
 333         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
 334         /* acc1 +=  x[4] * y[srcBLen - 4] */
 335         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
 336         /* acc2 +=  x[5] * y[srcBLen - 4] */
 337         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
 338         /* acc3 +=  x[6] * y[srcBLen - 4] */
 339         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
 340
 341
 342       } while(--k);
 343
 344       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 345        ** No loop unrolling is used. */
 346       k = srcBLen % 0x4u;
 347
 348       while(k > 0u)
 349       {
 350         /* Read y[srcBLen - 5] sample */
 351         c0 = *(py--);
 352
 353         /* Read x[7] sample */
 354         x3 = *(px++);
 355
 356         /* Perform the multiply-accumulates */
 357         /* acc0 +=  x[4] * y[srcBLen - 5] */
 358         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
 359         /* acc1 +=  x[5] * y[srcBLen - 5] */
 360         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
 361         /* acc2 +=  x[6] * y[srcBLen - 5] */
 362         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
 363         /* acc3 +=  x[7] * y[srcBLen - 5] */
 364         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
 365
 366         /* Reuse the present samples for the next MAC */
 367         x0 = x1;
 368         x1 = x2;
 369         x2 = x3;
 370
 371         /* Decrement the loop counter */
 372         k--;
 373       }
 374
 375       /* Store the results in the accumulators in the destination buffer. */
 376       *pOut++ = (q31_t) (acc0 << 1);
 377       *pOut++ = (q31_t) (acc1 << 1);
 378       *pOut++ = (q31_t) (acc2 << 1);
 379       *pOut++ = (q31_t) (acc3 << 1);
 380
 381       /* Increment the pointer pIn1 index, count by 4 */
 382       count += 4u;
 383
 384       /* Update the inputA and inputB pointers for next MAC calculation */
 385       px = pIn1 + count;
 386       py = pSrc2;
 387
 388       /* Decrement the loop counter */
 389       blkCnt--;
 390     }
 391
 392     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
 393      ** No loop unrolling is used. */
 394     blkCnt = blockSize2 % 0x4u;
 395
 396     while(blkCnt > 0u)
 397     {
 398       /* Accumulator is made zero for every iteration */
 399       sum = 0;
 400
 401       /* Apply loop unrolling and compute 4 MACs simultaneously. */
 402       k = srcBLen >> 2u;
 403
 404       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 405        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 406       while(k > 0u)
 407       {
 408         /* Perform the multiply-accumulates */
 409         sum = (q31_t) ((((q63_t) sum << 32) +
 410                         ((q63_t) * px++ * (*py--))) >> 32);
 411         sum = (q31_t) ((((q63_t) sum << 32) +
 412                         ((q63_t) * px++ * (*py--))) >> 32);
 413         sum = (q31_t) ((((q63_t) sum << 32) +
 414                         ((q63_t) * px++ * (*py--))) >> 32);
 415         sum = (q31_t) ((((q63_t) sum << 32) +
 416                         ((q63_t) * px++ * (*py--))) >> 32);
 417
 418         /* Decrement the loop counter */
 419         k--;
 420       }
 421
 422       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
 423        ** No loop unrolling is used. */
 424       k = srcBLen % 0x4u;
 425
 426       while(k > 0u)
 427       {
 428         /* Perform the multiply-accumulate */
 429         sum = (q31_t) ((((q63_t) sum << 32) +
 430                         ((q63_t) * px++ * (*py--))) >> 32);
 431
 432         /* Decrement the loop counter */
 433         k--;
 434       }
 435
 436       /* Store the result in the accumulator in the destination buffer. */
 437       *pOut++ = sum << 1;
 438
 439       /* Increment the MAC count */
 440       count++;
 441
 442       /* Update the inputA and inputB pointers for next MAC calculation */
 443       px = pIn1 + count;
 444       py = pSrc2;
 445
 446       /* Decrement the loop counter */
 447       blkCnt--;
 448     }
 449   }
 450   else
 451   {
 452     /* If the srcBLen is not a multiple of 4,
 453      * the blockSize2 loop cannot be unrolled by 4 */
 454     blkCnt = blockSize2;
 455
 456     while(blkCnt > 0u)
 457     {
 458       /* Accumulator is made zero for every iteration */
 459       sum = 0;
 460
 461       /* srcBLen number of MACS should be performed */
 462       k = srcBLen;
 463
 464       while(k > 0u)
 465       {
 466         /* Perform the multiply-accumulate */
 467         sum = (q31_t) ((((q63_t) sum << 32) +
 468                         ((q63_t) * px++ * (*py--))) >> 32);
 469
 470         /* Decrement the loop counter */
 471         k--;
 472       }
 473
 474       /* Store the result in the accumulator in the destination buffer. */
 475       *pOut++ = sum << 1;
 476
 477       /* Increment the MAC count */
 478       count++;
 479
 480       /* Update the inputA and inputB pointers for next MAC calculation */
 481       px = pIn1 + count;
 482       py = pSrc2;
 483
 484       /* Decrement the loop counter */
 485       blkCnt--;
 486     }
 487   }
 488
 489
 490   /* --------------------------
 491    * Initializations of stage3
 492    * -------------------------*/
 493
 494   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
 495    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
 496    * ....
 497    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
 498    * sum +=  x[srcALen-1] * y[srcBLen-1]
 499    */
 500
 501   /* In this stage the MAC operations are decreased by 1 for every iteration.
 502      The blockSize3 variable holds the number of MAC operations performed */
 503
 504   /* Working pointer of inputA */
 505   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
 506   px = pSrc1;
 507
 508   /* Working pointer of inputB */
 509   pSrc2 = pIn2 + (srcBLen - 1u);
 510   py = pSrc2;
 511
 512   /* -------------------
 513    * Stage3 process
 514    * ------------------*/
 515
 516   while(blockSize3 > 0u)
 517   {
 518     /* Accumulator is made zero for every iteration */
 519     sum = 0;
 520
 521     /* Apply loop unrolling and compute 4 MACs simultaneously. */
 522     k = blockSize3 >> 2u;
 523
 524     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
 525      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
 526     while(k > 0u)
 527     {
 528       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
 529       sum = (q31_t) ((((q63_t) sum << 32) +
 530                       ((q63_t) * px++ * (*py--))) >> 32);
 531
 532       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
 533       sum = (q31_t) ((((q63_t) sum << 32) +
 534                       ((q63_t) * px++ * (*py--))) >> 32);
 535
 536       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
 537       sum = (q31_t) ((((q63_t) sum << 32) +
 538                       ((q63_t) * px++ * (*py--))) >> 32);
 539
 540       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
 541       sum = (q31_t) ((((q63_t) sum << 32) +
 542                       ((q63_t) * px++ * (*py--))) >> 32);
 543
 544       /* Decrement the loop counter */
 545       k--;
 546     }
 547
 548     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
 549      ** No loop unrolling is used. */
 550     k = blockSize3 % 0x4u;
 551
 552     while(k > 0u)
 553     {
 554       /* Perform the multiply-accumulate */
 555       sum = (q31_t) ((((q63_t) sum << 32) +
 556                       ((q63_t) * px++ * (*py--))) >> 32);
 557
 558       /* Decrement the loop counter */
 559       k--;
 560     }
 561
 562     /* Store the result in the accumulator in the destination buffer. */
 563     *pOut++ = sum << 1;
 564
 565     /* Update the inputA and inputB pointers for next MAC calculation */
 566     px = ++pSrc1;
 567     py = pSrc2;
 568
 569     /* Decrement the loop counter */
 570     blockSize3--;
 571   }
 572
 573 }
 574
 575 /**
 576  * @} end of Conv group
 577  */