]> git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_fast_q15.c
Merge commit '1fe4406f374291ab2e86e95a97341fd9c475fcb8'
[tmk_keyboard.git] / tmk_core / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_conv_fast_q15.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_fast_q15.c
9 *
10 * Description: Fast Q15 Convolution.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40
41 #include "arm_math.h"
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @addtogroup Conv
49 * @{
50 */
51
52 /**
53 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
59 * @return none.
60 *
61 * <b>Scaling and Overflow Behavior:</b>
62 *
63 * \par
64 * This fast version uses a 32-bit accumulator with 2.30 format.
65 * The accumulator maintains full precision of the intermediate multiplication results
66 * but provides only a single guard bit. There is no saturation on intermediate additions.
67 * Thus, if the accumulator overflows it wraps around and distorts the result.
68 * The input signals should be scaled down to avoid intermediate overflows.
69 * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 is the logarithm to base 2) to avoid overflows,
70 * because at most min(srcALen, srcBLen) additions are carried out internally for each output sample.
71 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
72 *
73 * \par
74 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
75 */
76
77 void arm_conv_fast_q15(
78 q15_t * pSrcA,
79 uint32_t srcALen,
80 q15_t * pSrcB,
81 uint32_t srcBLen,
82 q15_t * pDst)
83 {
84 #ifndef UNALIGNED_SUPPORT_DISABLE
85 q15_t *pIn1; /* inputA pointer */
86 q15_t *pIn2; /* inputB pointer */
87 q15_t *pOut = pDst; /* output pointer */
88 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
89 q15_t *px; /* Intermediate inputA pointer */
90 q15_t *py; /* Intermediate inputB pointer */
91 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
92 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
93 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
94
95 /* The algorithm implementation is based on the lengths of the inputs. */
96 /* srcB is always made to slide across srcA. */
97 /* So srcBLen is always considered as shorter or equal to srcALen */
98 if(srcALen >= srcBLen)
99 {
100 /* Initialization of inputA pointer */
101 pIn1 = pSrcA;
102
103 /* Initialization of inputB pointer */
104 pIn2 = pSrcB;
105 }
106 else
107 {
108 /* Initialization of inputA pointer */
109 pIn1 = pSrcB;
110
111 /* Initialization of inputB pointer */
112 pIn2 = pSrcA;
113
114 /* srcBLen is always considered as shorter or equal to srcALen */
115 j = srcBLen;
116 srcBLen = srcALen;
117 srcALen = j;
118 }
119
120 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
121 /* The function is internally
122 * divided into three stages according to the number of multiplications that has to be
123 * taken place between inputA samples and inputB samples. In the first stage of the
124 * algorithm, the multiplications increase by one for every iteration.
125 * In the second stage of the algorithm, srcBLen number of multiplications are done.
126 * In the third stage of the algorithm, the multiplications decrease by one
127 * for every iteration. */
128
129 /* The algorithm is implemented in three stages.
130 The loop counters of each stage is initiated here. */
131 blockSize1 = srcBLen - 1u;
132 blockSize2 = srcALen - (srcBLen - 1u);
133 blockSize3 = blockSize1;
134
135 /* --------------------------
136 * Initializations of stage1
137 * -------------------------*/
138
139 /* sum = x[0] * y[0]
140 * sum = x[0] * y[1] + x[1] * y[0]
141 * ....
142 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
143 */
144
145 /* In this stage the MAC operations are increased by 1 for every iteration.
146 The count variable holds the number of MAC operations performed */
147 count = 1u;
148
149 /* Working pointer of inputA */
150 px = pIn1;
151
152 /* Working pointer of inputB */
153 py = pIn2;
154
155
156 /* ------------------------
157 * Stage1 process
158 * ----------------------*/
159
160 /* For loop unrolling by 4, this stage is divided into two. */
161 /* First part of this stage computes the MAC operations less than 4 */
162 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
163
164 /* The first part of the stage starts here */
165 while((count < 4u) && (blockSize1 > 0u))
166 {
167 /* Accumulator is made zero for every iteration */
168 sum = 0;
169
170 /* Loop over number of MAC operations between
171 * inputA samples and inputB samples */
172 k = count;
173
174 while(k > 0u)
175 {
176 /* Perform the multiply-accumulates */
177 sum = __SMLAD(*px++, *py--, sum);
178
179 /* Decrement the loop counter */
180 k--;
181 }
182
183 /* Store the result in the accumulator in the destination buffer. */
184 *pOut++ = (q15_t) (sum >> 15);
185
186 /* Update the inputA and inputB pointers for next MAC calculation */
187 py = pIn2 + count;
188 px = pIn1;
189
190 /* Increment the MAC count */
191 count++;
192
193 /* Decrement the loop counter */
194 blockSize1--;
195 }
196
197 /* The second part of the stage starts here */
198 /* The internal loop, over count, is unrolled by 4 */
199 /* To, read the last two inputB samples using SIMD:
200 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
201 py = py - 1;
202
203 while(blockSize1 > 0u)
204 {
205 /* Accumulator is made zero for every iteration */
206 sum = 0;
207
208 /* Apply loop unrolling and compute 4 MACs simultaneously. */
209 k = count >> 2u;
210
211 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
212 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
213 while(k > 0u)
214 {
215 /* Perform the multiply-accumulates */
216 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
217 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
218 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
219 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
220
221 /* Decrement the loop counter */
222 k--;
223 }
224
225 /* For the next MAC operations, the pointer py is used without SIMD
226 * So, py is incremented by 1 */
227 py = py + 1u;
228
229 /* If the count is not a multiple of 4, compute any remaining MACs here.
230 ** No loop unrolling is used. */
231 k = count % 0x4u;
232
233 while(k > 0u)
234 {
235 /* Perform the multiply-accumulates */
236 sum = __SMLAD(*px++, *py--, sum);
237
238 /* Decrement the loop counter */
239 k--;
240 }
241
242 /* Store the result in the accumulator in the destination buffer. */
243 *pOut++ = (q15_t) (sum >> 15);
244
245 /* Update the inputA and inputB pointers for next MAC calculation */
246 py = pIn2 + (count - 1u);
247 px = pIn1;
248
249 /* Increment the MAC count */
250 count++;
251
252 /* Decrement the loop counter */
253 blockSize1--;
254 }
255
256 /* --------------------------
257 * Initializations of stage2
258 * ------------------------*/
259
260 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
261 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
262 * ....
263 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
264 */
265
266 /* Working pointer of inputA */
267 px = pIn1;
268
269 /* Working pointer of inputB */
270 pSrc2 = pIn2 + (srcBLen - 1u);
271 py = pSrc2;
272
273 /* count is the index by which the pointer pIn1 to be incremented */
274 count = 0u;
275
276
277 /* --------------------
278 * Stage2 process
279 * -------------------*/
280
281 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
282 * So, to loop unroll over blockSize2,
283 * srcBLen should be greater than or equal to 4 */
284 if(srcBLen >= 4u)
285 {
286 /* Loop unroll over blockSize2, by 4 */
287 blkCnt = blockSize2 >> 2u;
288
289 while(blkCnt > 0u)
290 {
291 py = py - 1u;
292
293 /* Set all accumulators to zero */
294 acc0 = 0;
295 acc1 = 0;
296 acc2 = 0;
297 acc3 = 0;
298
299
300 /* read x[0], x[1] samples */
301 x0 = *__SIMD32(px);
302 /* read x[1], x[2] samples */
303 x1 = _SIMD32_OFFSET(px+1);
304 px+= 2u;
305
306
307 /* Apply loop unrolling and compute 4 MACs simultaneously. */
308 k = srcBLen >> 2u;
309
310 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
311 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
312 do
313 {
314 /* Read the last two inputB samples using SIMD:
315 * y[srcBLen - 1] and y[srcBLen - 2] */
316 c0 = *__SIMD32(py)--;
317
318 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
319 acc0 = __SMLADX(x0, c0, acc0);
320
321 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
322 acc1 = __SMLADX(x1, c0, acc1);
323
324 /* Read x[2], x[3] */
325 x2 = *__SIMD32(px);
326
327 /* Read x[3], x[4] */
328 x3 = _SIMD32_OFFSET(px+1);
329
330 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
331 acc2 = __SMLADX(x2, c0, acc2);
332
333 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
334 acc3 = __SMLADX(x3, c0, acc3);
335
336 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
337 c0 = *__SIMD32(py)--;
338
339 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
340 acc0 = __SMLADX(x2, c0, acc0);
341
342 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
343 acc1 = __SMLADX(x3, c0, acc1);
344
345 /* Read x[4], x[5] */
346 x0 = _SIMD32_OFFSET(px+2);
347
348 /* Read x[5], x[6] */
349 x1 = _SIMD32_OFFSET(px+3);
350 px += 4u;
351
352 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
353 acc2 = __SMLADX(x0, c0, acc2);
354
355 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
356 acc3 = __SMLADX(x1, c0, acc3);
357
358 } while(--k);
359
360 /* For the next MAC operations, SIMD is not used
361 * So, the 16 bit pointer if inputB, py is updated */
362
363 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
364 ** No loop unrolling is used. */
365 k = srcBLen % 0x4u;
366
367 if(k == 1u)
368 {
369 /* Read y[srcBLen - 5] */
370 c0 = *(py+1);
371
372 #ifdef ARM_MATH_BIG_ENDIAN
373
374 c0 = c0 << 16u;
375
376 #else
377
378 c0 = c0 & 0x0000FFFF;
379
380 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
381
382 /* Read x[7] */
383 x3 = *__SIMD32(px);
384 px++;
385
386 /* Perform the multiply-accumulates */
387 acc0 = __SMLAD(x0, c0, acc0);
388 acc1 = __SMLAD(x1, c0, acc1);
389 acc2 = __SMLADX(x1, c0, acc2);
390 acc3 = __SMLADX(x3, c0, acc3);
391 }
392
393 if(k == 2u)
394 {
395 /* Read y[srcBLen - 5], y[srcBLen - 6] */
396 c0 = _SIMD32_OFFSET(py);
397
398 /* Read x[7], x[8] */
399 x3 = *__SIMD32(px);
400
401 /* Read x[9] */
402 x2 = _SIMD32_OFFSET(px+1);
403 px += 2u;
404
405 /* Perform the multiply-accumulates */
406 acc0 = __SMLADX(x0, c0, acc0);
407 acc1 = __SMLADX(x1, c0, acc1);
408 acc2 = __SMLADX(x3, c0, acc2);
409 acc3 = __SMLADX(x2, c0, acc3);
410 }
411
412 if(k == 3u)
413 {
414 /* Read y[srcBLen - 5], y[srcBLen - 6] */
415 c0 = _SIMD32_OFFSET(py);
416
417 /* Read x[7], x[8] */
418 x3 = *__SIMD32(px);
419
420 /* Read x[9] */
421 x2 = _SIMD32_OFFSET(px+1);
422
423 /* Perform the multiply-accumulates */
424 acc0 = __SMLADX(x0, c0, acc0);
425 acc1 = __SMLADX(x1, c0, acc1);
426 acc2 = __SMLADX(x3, c0, acc2);
427 acc3 = __SMLADX(x2, c0, acc3);
428
429 /* Read y[srcBLen - 7] */
430 c0 = *(py-1);
431 #ifdef ARM_MATH_BIG_ENDIAN
432
433 c0 = c0 << 16u;
434 #else
435
436 c0 = c0 & 0x0000FFFF;
437 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
438
439 /* Read x[10] */
440 x3 = _SIMD32_OFFSET(px+2);
441 px += 3u;
442
443 /* Perform the multiply-accumulates */
444 acc0 = __SMLADX(x1, c0, acc0);
445 acc1 = __SMLAD(x2, c0, acc1);
446 acc2 = __SMLADX(x2, c0, acc2);
447 acc3 = __SMLADX(x3, c0, acc3);
448 }
449
450 /* Store the results in the accumulators in the destination buffer. */
451 #ifndef ARM_MATH_BIG_ENDIAN
452
453 *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
454 *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
455
456 #else
457
458 *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
459 *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
460
461 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
462
463 /* Increment the pointer pIn1 index, count by 4 */
464 count += 4u;
465
466 /* Update the inputA and inputB pointers for next MAC calculation */
467 px = pIn1 + count;
468 py = pSrc2;
469
470 /* Decrement the loop counter */
471 blkCnt--;
472 }
473
474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
475 ** No loop unrolling is used. */
476 blkCnt = blockSize2 % 0x4u;
477
478 while(blkCnt > 0u)
479 {
480 /* Accumulator is made zero for every iteration */
481 sum = 0;
482
483 /* Apply loop unrolling and compute 4 MACs simultaneously. */
484 k = srcBLen >> 2u;
485
486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
487 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
488 while(k > 0u)
489 {
490 /* Perform the multiply-accumulates */
491 sum += ((q31_t) * px++ * *py--);
492 sum += ((q31_t) * px++ * *py--);
493 sum += ((q31_t) * px++ * *py--);
494 sum += ((q31_t) * px++ * *py--);
495
496 /* Decrement the loop counter */
497 k--;
498 }
499
500 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
501 ** No loop unrolling is used. */
502 k = srcBLen % 0x4u;
503
504 while(k > 0u)
505 {
506 /* Perform the multiply-accumulates */
507 sum += ((q31_t) * px++ * *py--);
508
509 /* Decrement the loop counter */
510 k--;
511 }
512
513 /* Store the result in the accumulator in the destination buffer. */
514 *pOut++ = (q15_t) (sum >> 15);
515
516 /* Increment the pointer pIn1 index, count by 1 */
517 count++;
518
519 /* Update the inputA and inputB pointers for next MAC calculation */
520 px = pIn1 + count;
521 py = pSrc2;
522
523 /* Decrement the loop counter */
524 blkCnt--;
525 }
526 }
527 else
528 {
529 /* If the srcBLen is not a multiple of 4,
530 * the blockSize2 loop cannot be unrolled by 4 */
531 blkCnt = blockSize2;
532
533 while(blkCnt > 0u)
534 {
535 /* Accumulator is made zero for every iteration */
536 sum = 0;
537
538 /* srcBLen number of MACS should be performed */
539 k = srcBLen;
540
541 while(k > 0u)
542 {
543 /* Perform the multiply-accumulate */
544 sum += ((q31_t) * px++ * *py--);
545
546 /* Decrement the loop counter */
547 k--;
548 }
549
550 /* Store the result in the accumulator in the destination buffer. */
551 *pOut++ = (q15_t) (sum >> 15);
552
553 /* Increment the MAC count */
554 count++;
555
556 /* Update the inputA and inputB pointers for next MAC calculation */
557 px = pIn1 + count;
558 py = pSrc2;
559
560 /* Decrement the loop counter */
561 blkCnt--;
562 }
563 }
564
565
566 /* --------------------------
567 * Initializations of stage3
568 * -------------------------*/
569
570 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
571 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
572 * ....
573 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
574 * sum += x[srcALen-1] * y[srcBLen-1]
575 */
576
577 /* In this stage the MAC operations are decreased by 1 for every iteration.
578 The blockSize3 variable holds the number of MAC operations performed */
579
580 /* Working pointer of inputA */
581 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
582 px = pSrc1;
583
584 /* Working pointer of inputB */
585 pSrc2 = pIn2 + (srcBLen - 1u);
586 pIn2 = pSrc2 - 1u;
587 py = pIn2;
588
589 /* -------------------
590 * Stage3 process
591 * ------------------*/
592
593 /* For loop unrolling by 4, this stage is divided into two. */
594 /* First part of this stage computes the MAC operations greater than 4 */
595 /* Second part of this stage computes the MAC operations less than or equal to 4 */
596
597 /* The first part of the stage starts here */
598 j = blockSize3 >> 2u;
599
600 while((j > 0u) && (blockSize3 > 0u))
601 {
602 /* Accumulator is made zero for every iteration */
603 sum = 0;
604
605 /* Apply loop unrolling and compute 4 MACs simultaneously. */
606 k = blockSize3 >> 2u;
607
608 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
609 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
610 while(k > 0u)
611 {
612 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
613 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
614 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
615 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
616 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
617 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
618
619 /* Decrement the loop counter */
620 k--;
621 }
622
623 /* For the next MAC operations, the pointer py is used without SIMD
624 * So, py is incremented by 1 */
625 py = py + 1u;
626
627 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
628 ** No loop unrolling is used. */
629 k = blockSize3 % 0x4u;
630
631 while(k > 0u)
632 {
633 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
634 sum = __SMLAD(*px++, *py--, sum);
635
636 /* Decrement the loop counter */
637 k--;
638 }
639
640 /* Store the result in the accumulator in the destination buffer. */
641 *pOut++ = (q15_t) (sum >> 15);
642
643 /* Update the inputA and inputB pointers for next MAC calculation */
644 px = ++pSrc1;
645 py = pIn2;
646
647 /* Decrement the loop counter */
648 blockSize3--;
649
650 j--;
651 }
652
653 /* The second part of the stage starts here */
654 /* SIMD is not used for the next MAC operations,
655 * so pointer py is updated to read only one sample at a time */
656 py = py + 1u;
657
658 while(blockSize3 > 0u)
659 {
660 /* Accumulator is made zero for every iteration */
661 sum = 0;
662
663 /* Apply loop unrolling and compute 4 MACs simultaneously. */
664 k = blockSize3;
665
666 while(k > 0u)
667 {
668 /* Perform the multiply-accumulates */
669 /* sum += x[srcALen-1] * y[srcBLen-1] */
670 sum = __SMLAD(*px++, *py--, sum);
671
672 /* Decrement the loop counter */
673 k--;
674 }
675
676 /* Store the result in the accumulator in the destination buffer. */
677 *pOut++ = (q15_t) (sum >> 15);
678
679 /* Update the inputA and inputB pointers for next MAC calculation */
680 px = ++pSrc1;
681 py = pSrc2;
682
683 /* Decrement the loop counter */
684 blockSize3--;
685 }
686
687 #else
688 q15_t *pIn1; /* inputA pointer */
689 q15_t *pIn2; /* inputB pointer */
690 q15_t *pOut = pDst; /* output pointer */
691 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
692 q15_t *px; /* Intermediate inputA pointer */
693 q15_t *py; /* Intermediate inputB pointer */
694 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
695 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
696 uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
697 q15_t a, b;
698
699 /* The algorithm implementation is based on the lengths of the inputs. */
700 /* srcB is always made to slide across srcA. */
701 /* So srcBLen is always considered as shorter or equal to srcALen */
702 if(srcALen >= srcBLen)
703 {
704 /* Initialization of inputA pointer */
705 pIn1 = pSrcA;
706
707 /* Initialization of inputB pointer */
708 pIn2 = pSrcB;
709 }
710 else
711 {
712 /* Initialization of inputA pointer */
713 pIn1 = pSrcB;
714
715 /* Initialization of inputB pointer */
716 pIn2 = pSrcA;
717
718 /* srcBLen is always considered as shorter or equal to srcALen */
719 j = srcBLen;
720 srcBLen = srcALen;
721 srcALen = j;
722 }
723
724 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
725 /* The function is internally
726 * divided into three stages according to the number of multiplications that has to be
727 * taken place between inputA samples and inputB samples. In the first stage of the
728 * algorithm, the multiplications increase by one for every iteration.
729 * In the second stage of the algorithm, srcBLen number of multiplications are done.
730 * In the third stage of the algorithm, the multiplications decrease by one
731 * for every iteration. */
732
733 /* The algorithm is implemented in three stages.
734 The loop counters of each stage is initiated here. */
735 blockSize1 = srcBLen - 1u;
736 blockSize2 = srcALen - (srcBLen - 1u);
737 blockSize3 = blockSize1;
738
739 /* --------------------------
740 * Initializations of stage1
741 * -------------------------*/
742
743 /* sum = x[0] * y[0]
744 * sum = x[0] * y[1] + x[1] * y[0]
745 * ....
746 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
747 */
748
749 /* In this stage the MAC operations are increased by 1 for every iteration.
750 The count variable holds the number of MAC operations performed */
751 count = 1u;
752
753 /* Working pointer of inputA */
754 px = pIn1;
755
756 /* Working pointer of inputB */
757 py = pIn2;
758
759
760 /* ------------------------
761 * Stage1 process
762 * ----------------------*/
763
764 /* For loop unrolling by 4, this stage is divided into two. */
765 /* First part of this stage computes the MAC operations less than 4 */
766 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
767
768 /* The first part of the stage starts here */
769 while((count < 4u) && (blockSize1 > 0u))
770 {
771 /* Accumulator is made zero for every iteration */
772 sum = 0;
773
774 /* Loop over number of MAC operations between
775 * inputA samples and inputB samples */
776 k = count;
777
778 while(k > 0u)
779 {
780 /* Perform the multiply-accumulates */
781 sum += ((q31_t) * px++ * *py--);
782
783 /* Decrement the loop counter */
784 k--;
785 }
786
787 /* Store the result in the accumulator in the destination buffer. */
788 *pOut++ = (q15_t) (sum >> 15);
789
790 /* Update the inputA and inputB pointers for next MAC calculation */
791 py = pIn2 + count;
792 px = pIn1;
793
794 /* Increment the MAC count */
795 count++;
796
797 /* Decrement the loop counter */
798 blockSize1--;
799 }
800
801 /* The second part of the stage starts here */
802 /* The internal loop, over count, is unrolled by 4 */
803 /* To, read the last two inputB samples using SIMD:
804 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
805 py = py - 1;
806
807 while(blockSize1 > 0u)
808 {
809 /* Accumulator is made zero for every iteration */
810 sum = 0;
811
812 /* Apply loop unrolling and compute 4 MACs simultaneously. */
813 k = count >> 2u;
814
815 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
816 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
817 py++;
818
819 while(k > 0u)
820 {
821 /* Perform the multiply-accumulates */
822 sum += ((q31_t) * px++ * *py--);
823 sum += ((q31_t) * px++ * *py--);
824 sum += ((q31_t) * px++ * *py--);
825 sum += ((q31_t) * px++ * *py--);
826
827 /* Decrement the loop counter */
828 k--;
829 }
830
831 /* If the count is not a multiple of 4, compute any remaining MACs here.
832 ** No loop unrolling is used. */
833 k = count % 0x4u;
834
835 while(k > 0u)
836 {
837 /* Perform the multiply-accumulates */
838 sum += ((q31_t) * px++ * *py--);
839
840 /* Decrement the loop counter */
841 k--;
842 }
843
844 /* Store the result in the accumulator in the destination buffer. */
845 *pOut++ = (q15_t) (sum >> 15);
846
847 /* Update the inputA and inputB pointers for next MAC calculation */
848 py = pIn2 + (count - 1u);
849 px = pIn1;
850
851 /* Increment the MAC count */
852 count++;
853
854 /* Decrement the loop counter */
855 blockSize1--;
856 }
857
858 /* --------------------------
859 * Initializations of stage2
860 * ------------------------*/
861
862 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
863 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
864 * ....
865 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
866 */
867
868 /* Working pointer of inputA */
869 px = pIn1;
870
871 /* Working pointer of inputB */
872 pSrc2 = pIn2 + (srcBLen - 1u);
873 py = pSrc2;
874
875 /* count is the index by which the pointer pIn1 to be incremented */
876 count = 0u;
877
878
879 /* --------------------
880 * Stage2 process
881 * -------------------*/
882
883 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
884 * So, to loop unroll over blockSize2,
885 * srcBLen should be greater than or equal to 4 */
886 if(srcBLen >= 4u)
887 {
888 /* Loop unroll over blockSize2, by 4 */
889 blkCnt = blockSize2 >> 2u;
890
891 while(blkCnt > 0u)
892 {
893 py = py - 1u;
894
895 /* Set all accumulators to zero */
896 acc0 = 0;
897 acc1 = 0;
898 acc2 = 0;
899 acc3 = 0;
900
901 /* read x[0], x[1] samples */
902 a = *px++;
903 b = *px++;
904
905 #ifndef ARM_MATH_BIG_ENDIAN
906
907 x0 = __PKHBT(a, b, 16);
908 a = *px;
909 x1 = __PKHBT(b, a, 16);
910
911 #else
912
913 x0 = __PKHBT(b, a, 16);
914 a = *px;
915 x1 = __PKHBT(a, b, 16);
916
917 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
918
919 /* Apply loop unrolling and compute 4 MACs simultaneously. */
920 k = srcBLen >> 2u;
921
922 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
923 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
924 do
925 {
926 /* Read the last two inputB samples using SIMD:
927 * y[srcBLen - 1] and y[srcBLen - 2] */
928 a = *py;
929 b = *(py+1);
930 py -= 2;
931
932 #ifndef ARM_MATH_BIG_ENDIAN
933
934 c0 = __PKHBT(a, b, 16);
935
936 #else
937
938 c0 = __PKHBT(b, a, 16);;
939
940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
941
942 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
943 acc0 = __SMLADX(x0, c0, acc0);
944
945 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
946 acc1 = __SMLADX(x1, c0, acc1);
947
948 a = *px;
949 b = *(px + 1);
950
951 #ifndef ARM_MATH_BIG_ENDIAN
952
953 x2 = __PKHBT(a, b, 16);
954 a = *(px + 2);
955 x3 = __PKHBT(b, a, 16);
956
957 #else
958
959 x2 = __PKHBT(b, a, 16);
960 a = *(px + 2);
961 x3 = __PKHBT(a, b, 16);
962
963 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
964
965 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
966 acc2 = __SMLADX(x2, c0, acc2);
967
968 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
969 acc3 = __SMLADX(x3, c0, acc3);
970
971 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
972 a = *py;
973 b = *(py+1);
974 py -= 2;
975
976 #ifndef ARM_MATH_BIG_ENDIAN
977
978 c0 = __PKHBT(a, b, 16);
979
980 #else
981
982 c0 = __PKHBT(b, a, 16);;
983
984 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
985
986 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
987 acc0 = __SMLADX(x2, c0, acc0);
988
989 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
990 acc1 = __SMLADX(x3, c0, acc1);
991
992 /* Read x[4], x[5], x[6] */
993 a = *(px + 2);
994 b = *(px + 3);
995
996 #ifndef ARM_MATH_BIG_ENDIAN
997
998 x0 = __PKHBT(a, b, 16);
999 a = *(px + 4);
1000 x1 = __PKHBT(b, a, 16);
1001
1002 #else
1003
1004 x0 = __PKHBT(b, a, 16);
1005 a = *(px + 4);
1006 x1 = __PKHBT(a, b, 16);
1007
1008 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1009
1010 px += 4u;
1011
1012 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
1013 acc2 = __SMLADX(x0, c0, acc2);
1014
1015 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
1016 acc3 = __SMLADX(x1, c0, acc3);
1017
1018 } while(--k);
1019
1020 /* For the next MAC operations, SIMD is not used
1021 * So, the 16 bit pointer if inputB, py is updated */
1022
1023 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1024 ** No loop unrolling is used. */
1025 k = srcBLen % 0x4u;
1026
1027 if(k == 1u)
1028 {
1029 /* Read y[srcBLen - 5] */
1030 c0 = *(py+1);
1031
1032 #ifdef ARM_MATH_BIG_ENDIAN
1033
1034 c0 = c0 << 16u;
1035
1036 #else
1037
1038 c0 = c0 & 0x0000FFFF;
1039
1040 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1041
1042 /* Read x[7] */
1043 a = *px;
1044 b = *(px+1);
1045 px++;
1046
1047 #ifndef ARM_MATH_BIG_ENDIAN
1048
1049 x3 = __PKHBT(a, b, 16);
1050
1051 #else
1052
1053 x3 = __PKHBT(b, a, 16);;
1054
1055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1056
1057
1058 /* Perform the multiply-accumulates */
1059 acc0 = __SMLAD(x0, c0, acc0);
1060 acc1 = __SMLAD(x1, c0, acc1);
1061 acc2 = __SMLADX(x1, c0, acc2);
1062 acc3 = __SMLADX(x3, c0, acc3);
1063 }
1064
1065 if(k == 2u)
1066 {
1067 /* Read y[srcBLen - 5], y[srcBLen - 6] */
1068 a = *py;
1069 b = *(py+1);
1070
1071 #ifndef ARM_MATH_BIG_ENDIAN
1072
1073 c0 = __PKHBT(a, b, 16);
1074
1075 #else
1076
1077 c0 = __PKHBT(b, a, 16);;
1078
1079 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1080
1081 /* Read x[7], x[8], x[9] */
1082 a = *px;
1083 b = *(px + 1);
1084
1085 #ifndef ARM_MATH_BIG_ENDIAN
1086
1087 x3 = __PKHBT(a, b, 16);
1088 a = *(px + 2);
1089 x2 = __PKHBT(b, a, 16);
1090
1091 #else
1092
1093 x3 = __PKHBT(b, a, 16);
1094 a = *(px + 2);
1095 x2 = __PKHBT(a, b, 16);
1096
1097 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1098 px += 2u;
1099
1100 /* Perform the multiply-accumulates */
1101 acc0 = __SMLADX(x0, c0, acc0);
1102 acc1 = __SMLADX(x1, c0, acc1);
1103 acc2 = __SMLADX(x3, c0, acc2);
1104 acc3 = __SMLADX(x2, c0, acc3);
1105 }
1106
1107 if(k == 3u)
1108 {
1109 /* Read y[srcBLen - 5], y[srcBLen - 6] */
1110 a = *py;
1111 b = *(py+1);
1112
1113 #ifndef ARM_MATH_BIG_ENDIAN
1114
1115 c0 = __PKHBT(a, b, 16);
1116
1117 #else
1118
1119 c0 = __PKHBT(b, a, 16);;
1120
1121 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1122
1123 /* Read x[7], x[8], x[9] */
1124 a = *px;
1125 b = *(px + 1);
1126
1127 #ifndef ARM_MATH_BIG_ENDIAN
1128
1129 x3 = __PKHBT(a, b, 16);
1130 a = *(px + 2);
1131 x2 = __PKHBT(b, a, 16);
1132
1133 #else
1134
1135 x3 = __PKHBT(b, a, 16);
1136 a = *(px + 2);
1137 x2 = __PKHBT(a, b, 16);
1138
1139 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1140
1141 /* Perform the multiply-accumulates */
1142 acc0 = __SMLADX(x0, c0, acc0);
1143 acc1 = __SMLADX(x1, c0, acc1);
1144 acc2 = __SMLADX(x3, c0, acc2);
1145 acc3 = __SMLADX(x2, c0, acc3);
1146
1147 /* Read y[srcBLen - 7] */
1148 c0 = *(py-1);
1149 #ifdef ARM_MATH_BIG_ENDIAN
1150
1151 c0 = c0 << 16u;
1152 #else
1153
1154 c0 = c0 & 0x0000FFFF;
1155 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1156
1157 /* Read x[10] */
1158 a = *(px+2);
1159 b = *(px+3);
1160
1161 #ifndef ARM_MATH_BIG_ENDIAN
1162
1163 x3 = __PKHBT(a, b, 16);
1164
1165 #else
1166
1167 x3 = __PKHBT(b, a, 16);;
1168
1169 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1170
1171 px += 3u;
1172
1173 /* Perform the multiply-accumulates */
1174 acc0 = __SMLADX(x1, c0, acc0);
1175 acc1 = __SMLAD(x2, c0, acc1);
1176 acc2 = __SMLADX(x2, c0, acc2);
1177 acc3 = __SMLADX(x3, c0, acc3);
1178 }
1179
1180 /* Store the results in the accumulators in the destination buffer. */
1181 *pOut++ = (q15_t)(acc0 >> 15);
1182 *pOut++ = (q15_t)(acc1 >> 15);
1183 *pOut++ = (q15_t)(acc2 >> 15);
1184 *pOut++ = (q15_t)(acc3 >> 15);
1185
1186 /* Increment the pointer pIn1 index, count by 4 */
1187 count += 4u;
1188
1189 /* Update the inputA and inputB pointers for next MAC calculation */
1190 px = pIn1 + count;
1191 py = pSrc2;
1192
1193 /* Decrement the loop counter */
1194 blkCnt--;
1195 }
1196
1197 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1198 ** No loop unrolling is used. */
1199 blkCnt = blockSize2 % 0x4u;
1200
1201 while(blkCnt > 0u)
1202 {
1203 /* Accumulator is made zero for every iteration */
1204 sum = 0;
1205
1206 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1207 k = srcBLen >> 2u;
1208
1209 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1210 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1211 while(k > 0u)
1212 {
1213 /* Perform the multiply-accumulates */
1214 sum += ((q31_t) * px++ * *py--);
1215 sum += ((q31_t) * px++ * *py--);
1216 sum += ((q31_t) * px++ * *py--);
1217 sum += ((q31_t) * px++ * *py--);
1218
1219 /* Decrement the loop counter */
1220 k--;
1221 }
1222
1223 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1224 ** No loop unrolling is used. */
1225 k = srcBLen % 0x4u;
1226
1227 while(k > 0u)
1228 {
1229 /* Perform the multiply-accumulates */
1230 sum += ((q31_t) * px++ * *py--);
1231
1232 /* Decrement the loop counter */
1233 k--;
1234 }
1235
1236 /* Store the result in the accumulator in the destination buffer. */
1237 *pOut++ = (q15_t) (sum >> 15);
1238
1239 /* Increment the pointer pIn1 index, count by 1 */
1240 count++;
1241
1242 /* Update the inputA and inputB pointers for next MAC calculation */
1243 px = pIn1 + count;
1244 py = pSrc2;
1245
1246 /* Decrement the loop counter */
1247 blkCnt--;
1248 }
1249 }
1250 else
1251 {
1252 /* If the srcBLen is not a multiple of 4,
1253 * the blockSize2 loop cannot be unrolled by 4 */
1254 blkCnt = blockSize2;
1255
1256 while(blkCnt > 0u)
1257 {
1258 /* Accumulator is made zero for every iteration */
1259 sum = 0;
1260
1261 /* srcBLen number of MACS should be performed */
1262 k = srcBLen;
1263
1264 while(k > 0u)
1265 {
1266 /* Perform the multiply-accumulate */
1267 sum += ((q31_t) * px++ * *py--);
1268
1269 /* Decrement the loop counter */
1270 k--;
1271 }
1272
1273 /* Store the result in the accumulator in the destination buffer. */
1274 *pOut++ = (q15_t) (sum >> 15);
1275
1276 /* Increment the MAC count */
1277 count++;
1278
1279 /* Update the inputA and inputB pointers for next MAC calculation */
1280 px = pIn1 + count;
1281 py = pSrc2;
1282
1283 /* Decrement the loop counter */
1284 blkCnt--;
1285 }
1286 }
1287
1288
1289 /* --------------------------
1290 * Initializations of stage3
1291 * -------------------------*/
1292
1293 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
1294 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
1295 * ....
1296 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
1297 * sum += x[srcALen-1] * y[srcBLen-1]
1298 */
1299
1300 /* In this stage the MAC operations are decreased by 1 for every iteration.
1301 The blockSize3 variable holds the number of MAC operations performed */
1302
1303 /* Working pointer of inputA */
1304 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1305 px = pSrc1;
1306
1307 /* Working pointer of inputB */
1308 pSrc2 = pIn2 + (srcBLen - 1u);
1309 pIn2 = pSrc2 - 1u;
1310 py = pIn2;
1311
1312 /* -------------------
1313 * Stage3 process
1314 * ------------------*/
1315
1316 /* For loop unrolling by 4, this stage is divided into two. */
1317 /* First part of this stage computes the MAC operations greater than 4 */
1318 /* Second part of this stage computes the MAC operations less than or equal to 4 */
1319
1320 /* The first part of the stage starts here */
1321 j = blockSize3 >> 2u;
1322
1323 while((j > 0u) && (blockSize3 > 0u))
1324 {
1325 /* Accumulator is made zero for every iteration */
1326 sum = 0;
1327
1328 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1329 k = blockSize3 >> 2u;
1330
1331 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1332 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1333 py++;
1334
1335 while(k > 0u)
1336 {
1337 sum += ((q31_t) * px++ * *py--);
1338 sum += ((q31_t) * px++ * *py--);
1339 sum += ((q31_t) * px++ * *py--);
1340 sum += ((q31_t) * px++ * *py--);
1341 /* Decrement the loop counter */
1342 k--;
1343 }
1344
1345 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
1346 ** No loop unrolling is used. */
1347 k = blockSize3 % 0x4u;
1348
1349 while(k > 0u)
1350 {
1351 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
1352 sum += ((q31_t) * px++ * *py--);
1353
1354 /* Decrement the loop counter */
1355 k--;
1356 }
1357
1358 /* Store the result in the accumulator in the destination buffer. */
1359 *pOut++ = (q15_t) (sum >> 15);
1360
1361 /* Update the inputA and inputB pointers for next MAC calculation */
1362 px = ++pSrc1;
1363 py = pIn2;
1364
1365 /* Decrement the loop counter */
1366 blockSize3--;
1367
1368 j--;
1369 }
1370
1371 /* The second part of the stage starts here */
1372 /* SIMD is not used for the next MAC operations,
1373 * so pointer py is updated to read only one sample at a time */
1374 py = py + 1u;
1375
1376 while(blockSize3 > 0u)
1377 {
1378 /* Accumulator is made zero for every iteration */
1379 sum = 0;
1380
1381 /* blockSize3 MACs are performed for this output sample, one per loop iteration (no loop unrolling). */
1382 k = blockSize3;
1383
1384 while(k > 0u)
1385 {
1386 /* Perform the multiply-accumulates */
1387 /* sum += x[srcALen-1] * y[srcBLen-1] */
1388 sum += ((q31_t) * px++ * *py--);
1389
1390 /* Decrement the loop counter */
1391 k--;
1392 }
1393
1394 /* Store the result in the accumulator in the destination buffer. */
1395 *pOut++ = (q15_t) (sum >> 15);
1396
1397 /* Update the inputA and inputB pointers for next MAC calculation */
1398 px = ++pSrc1;
1399 py = pSrc2;
1400
1401 /* Decrement the loop counter */
1402 blockSize3--;
1403 }
1404
1405 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1406 }
1407
1408 /**
1409 * @} end of Conv group
1410 */
Imprint / Impressum