]>
git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_fast_q15.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_fast_q15.c
10 * Description: Fast Q15 Convolution.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
53 * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
61 * <b>Scaling and Overflow Behavior:</b>
64 * This fast version uses a 32-bit accumulator with 2.30 format.
65 * The accumulator maintains full precision of the intermediate multiplication results
66 * but provides only a single guard bit. There is no saturation on intermediate additions.
67 * Thus, if the accumulator overflows it wraps around and distorts the result.
68 * The input signals should be scaled down to avoid intermediate overflows.
69 * Scale down the inputs by log2(min(srcALen, srcBLen)) bits (log2 denotes the base-2 logarithm) to avoid overflows,
70 * since at most min(srcALen, srcBLen) additions are carried out internally.
71 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
74 * See <code>arm_conv_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
77 void arm_conv_fast_q15(
84 #ifndef UNALIGNED_SUPPORT_DISABLE
85 q15_t
*pIn1
; /* inputA pointer */
86 q15_t
*pIn2
; /* inputB pointer */
87 q15_t
*pOut
= pDst
; /* output pointer */
88 q31_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulator */
89 q15_t
*px
; /* Intermediate inputA pointer */
90 q15_t
*py
; /* Intermediate inputB pointer */
91 q15_t
*pSrc1
, *pSrc2
; /* Intermediate pointers */
92 q31_t x0
, x1
, x2
, x3
, c0
; /* Temporary variables to hold state and coefficient values */
93 uint32_t blockSize1
, blockSize2
, blockSize3
, j
, k
, count
, blkCnt
; /* loop counter */
95 /* The algorithm implementation is based on the lengths of the inputs. */
96 /* srcB is always made to slide across srcA. */
97 /* So srcBLen is always considered as shorter or equal to srcALen */
98 if(srcALen
>= srcBLen
)
100 /* Initialization of inputA pointer */
103 /* Initialization of inputB pointer */
108 /* Initialization of inputA pointer */
111 /* Initialization of inputB pointer */
114 /* srcBLen is always considered as shorter or equal to srcALen */
120 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
121 /* The function is internally
122 * divided into three stages according to the number of multiplications that have to
123 * take place between inputA samples and inputB samples. In the first stage of the
124 * algorithm, the multiplications increase by one for every iteration.
125 * In the second stage of the algorithm, srcBLen number of multiplications are done.
126 * In the third stage of the algorithm, the multiplications decrease by one
127 * for every iteration. */
129 /* The algorithm is implemented in three stages.
130 The loop counters of each stage are initialized here. */
131 blockSize1
= srcBLen
- 1u;
132 blockSize2
= srcALen
- (srcBLen
- 1u);
133 blockSize3
= blockSize1
;
135 /* --------------------------
136 * Initializations of stage1
137 * -------------------------*/
140 * sum = x[0] * y[1] + x[1] * y[0]
142 * sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
145 /* In this stage the MAC operations are increased by 1 for every iteration.
146 The count variable holds the number of MAC operations performed */
149 /* Working pointer of inputA */
152 /* Working pointer of inputB */
156 /* ------------------------
158 * ----------------------*/
160 /* For loop unrolling by 4, this stage is divided into two. */
161 /* First part of this stage computes the MAC operations less than 4 */
162 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
164 /* The first part of the stage starts here */
165 while((count
< 4u) && (blockSize1
> 0u))
167 /* Accumulator is made zero for every iteration */
170 /* Loop over number of MAC operations between
171 * inputA samples and inputB samples */
176 /* Perform the multiply-accumulates */
177 sum
= __SMLAD(*px
++, *py
--, sum
);
179 /* Decrement the loop counter */
183 /* Store the result in the accumulator in the destination buffer. */
184 *pOut
++ = (q15_t
) (sum
>> 15);
186 /* Update the inputA and inputB pointers for next MAC calculation */
190 /* Increment the MAC count */
193 /* Decrement the loop counter */
197 /* The second part of the stage starts here */
198 /* The internal loop, over count, is unrolled by 4 */
199 /* To read the last two inputB samples using SIMD
200 * (the y[srcBLen] and y[srcBLen-1] coefficients), py is decremented by 1 */
203 while(blockSize1
> 0u)
205 /* Accumulator is made zero for every iteration */
208 /* Apply loop unrolling and compute 4 MACs simultaneously. */
211 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
212 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
215 /* Perform the multiply-accumulates */
216 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
217 sum
= __SMLADX(*__SIMD32(px
)++, *__SIMD32(py
)--, sum
);
218 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
219 sum
= __SMLADX(*__SIMD32(px
)++, *__SIMD32(py
)--, sum
);
221 /* Decrement the loop counter */
225 /* For the next MAC operations, the pointer py is used without SIMD
226 * So, py is incremented by 1 */
229 /* If the count is not a multiple of 4, compute any remaining MACs here.
230 ** No loop unrolling is used. */
235 /* Perform the multiply-accumulates */
236 sum
= __SMLAD(*px
++, *py
--, sum
);
238 /* Decrement the loop counter */
242 /* Store the result in the accumulator in the destination buffer. */
243 *pOut
++ = (q15_t
) (sum
>> 15);
245 /* Update the inputA and inputB pointers for next MAC calculation */
246 py
= pIn2
+ (count
- 1u);
249 /* Increment the MAC count */
252 /* Decrement the loop counter */
256 /* --------------------------
257 * Initializations of stage2
258 * ------------------------*/
260 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
261 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
263 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
266 /* Working pointer of inputA */
269 /* Working pointer of inputB */
270 pSrc2
= pIn2
+ (srcBLen
- 1u);
273 /* count is the index by which the pointer pIn1 is to be incremented */
277 /* --------------------
279 * -------------------*/
281 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
282 * So, to loop unroll over blockSize2,
283 * srcBLen should be greater than or equal to 4 */
286 /* Loop unroll over blockSize2, by 4 */
287 blkCnt
= blockSize2
>> 2u;
293 /* Set all accumulators to zero */
300 /* read x[0], x[1] samples */
302 /* read x[1], x[2] samples */
303 x1
= _SIMD32_OFFSET(px
+1);
307 /* Apply loop unrolling and compute 4 MACs simultaneously. */
310 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
311 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
314 /* Read the last two inputB samples using SIMD:
315 * y[srcBLen - 1] and y[srcBLen - 2] */
316 c0
= *__SIMD32(py
)--;
318 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
319 acc0
= __SMLADX(x0
, c0
, acc0
);
321 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
322 acc1
= __SMLADX(x1
, c0
, acc1
);
324 /* Read x[2], x[3] */
327 /* Read x[3], x[4] */
328 x3
= _SIMD32_OFFSET(px
+1);
330 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
331 acc2
= __SMLADX(x2
, c0
, acc2
);
333 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
334 acc3
= __SMLADX(x3
, c0
, acc3
);
336 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
337 c0
= *__SIMD32(py
)--;
339 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
340 acc0
= __SMLADX(x2
, c0
, acc0
);
342 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
343 acc1
= __SMLADX(x3
, c0
, acc1
);
345 /* Read x[4], x[5] */
346 x0
= _SIMD32_OFFSET(px
+2);
348 /* Read x[5], x[6] */
349 x1
= _SIMD32_OFFSET(px
+3);
352 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
353 acc2
= __SMLADX(x0
, c0
, acc2
);
355 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
356 acc3
= __SMLADX(x1
, c0
, acc3
);
360 /* For the next MAC operations, SIMD is not used,
361 * so the 16-bit pointer of inputB, py, is updated */
363 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
364 ** No loop unrolling is used. */
369 /* Read y[srcBLen - 5] */
372 #ifdef ARM_MATH_BIG_ENDIAN
378 c0
= c0
& 0x0000FFFF;
380 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
386 /* Perform the multiply-accumulates */
387 acc0
= __SMLAD(x0
, c0
, acc0
);
388 acc1
= __SMLAD(x1
, c0
, acc1
);
389 acc2
= __SMLADX(x1
, c0
, acc2
);
390 acc3
= __SMLADX(x3
, c0
, acc3
);
395 /* Read y[srcBLen - 5], y[srcBLen - 6] */
396 c0
= _SIMD32_OFFSET(py
);
398 /* Read x[7], x[8] */
402 x2
= _SIMD32_OFFSET(px
+1);
405 /* Perform the multiply-accumulates */
406 acc0
= __SMLADX(x0
, c0
, acc0
);
407 acc1
= __SMLADX(x1
, c0
, acc1
);
408 acc2
= __SMLADX(x3
, c0
, acc2
);
409 acc3
= __SMLADX(x2
, c0
, acc3
);
414 /* Read y[srcBLen - 5], y[srcBLen - 6] */
415 c0
= _SIMD32_OFFSET(py
);
417 /* Read x[7], x[8] */
421 x2
= _SIMD32_OFFSET(px
+1);
423 /* Perform the multiply-accumulates */
424 acc0
= __SMLADX(x0
, c0
, acc0
);
425 acc1
= __SMLADX(x1
, c0
, acc1
);
426 acc2
= __SMLADX(x3
, c0
, acc2
);
427 acc3
= __SMLADX(x2
, c0
, acc3
);
429 /* Read y[srcBLen - 7] */
431 #ifdef ARM_MATH_BIG_ENDIAN
436 c0
= c0
& 0x0000FFFF;
437 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
440 x3
= _SIMD32_OFFSET(px
+2);
443 /* Perform the multiply-accumulates */
444 acc0
= __SMLADX(x1
, c0
, acc0
);
445 acc1
= __SMLAD(x2
, c0
, acc1
);
446 acc2
= __SMLADX(x2
, c0
, acc2
);
447 acc3
= __SMLADX(x3
, c0
, acc3
);
450 /* Store the results in the accumulators in the destination buffer. */
451 #ifndef ARM_MATH_BIG_ENDIAN
453 *__SIMD32(pOut
)++ = __PKHBT((acc0
>> 15), (acc1
>> 15), 16);
454 *__SIMD32(pOut
)++ = __PKHBT((acc2
>> 15), (acc3
>> 15), 16);
458 *__SIMD32(pOut
)++ = __PKHBT((acc1
>> 15), (acc0
>> 15), 16);
459 *__SIMD32(pOut
)++ = __PKHBT((acc3
>> 15), (acc2
>> 15), 16);
461 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
463 /* Increment the pointer pIn1 index, count by 4 */
466 /* Update the inputA and inputB pointers for next MAC calculation */
470 /* Decrement the loop counter */
474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
475 ** No loop unrolling is used. */
476 blkCnt
= blockSize2
% 0x4u
;
480 /* Accumulator is made zero for every iteration */
483 /* Apply loop unrolling and compute 4 MACs simultaneously. */
486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
487 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
490 /* Perform the multiply-accumulates */
491 sum
+= ((q31_t
) * px
++ * *py
--);
492 sum
+= ((q31_t
) * px
++ * *py
--);
493 sum
+= ((q31_t
) * px
++ * *py
--);
494 sum
+= ((q31_t
) * px
++ * *py
--);
496 /* Decrement the loop counter */
500 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
501 ** No loop unrolling is used. */
506 /* Perform the multiply-accumulates */
507 sum
+= ((q31_t
) * px
++ * *py
--);
509 /* Decrement the loop counter */
513 /* Store the result in the accumulator in the destination buffer. */
514 *pOut
++ = (q15_t
) (sum
>> 15);
516 /* Increment the pointer pIn1 index, count by 1 */
519 /* Update the inputA and inputB pointers for next MAC calculation */
523 /* Decrement the loop counter */
529 /* If srcBLen is less than 4,
530 * the blockSize2 loop cannot be unrolled by 4 */
535 /* Accumulator is made zero for every iteration */
538 /* srcBLen number of MACS should be performed */
543 /* Perform the multiply-accumulate */
544 sum
+= ((q31_t
) * px
++ * *py
--);
546 /* Decrement the loop counter */
550 /* Store the result in the accumulator in the destination buffer. */
551 *pOut
++ = (q15_t
) (sum
>> 15);
553 /* Increment the MAC count */
556 /* Update the inputA and inputB pointers for next MAC calculation */
560 /* Decrement the loop counter */
566 /* --------------------------
567 * Initializations of stage3
568 * -------------------------*/
570 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
571 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
573 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
574 * sum += x[srcALen-1] * y[srcBLen-1]
577 /* In this stage the MAC operations are decreased by 1 for every iteration.
578 The blockSize3 variable holds the number of MAC operations performed */
580 /* Working pointer of inputA */
581 pSrc1
= (pIn1
+ srcALen
) - (srcBLen
- 1u);
584 /* Working pointer of inputB */
585 pSrc2
= pIn2
+ (srcBLen
- 1u);
589 /* -------------------
591 * ------------------*/
593 /* For loop unrolling by 4, this stage is divided into two. */
594 /* First part of this stage computes the MAC operations greater than 4 */
595 /* Second part of this stage computes the MAC operations less than or equal to 4 */
597 /* The first part of the stage starts here */
598 j
= blockSize3
>> 2u;
600 while((j
> 0u) && (blockSize3
> 0u))
602 /* Accumulator is made zero for every iteration */
605 /* Apply loop unrolling and compute 4 MACs simultaneously. */
606 k
= blockSize3
>> 2u;
608 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
609 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
612 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
613 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
614 sum
= __SMLADX(*__SIMD32(px
)++, *__SIMD32(py
)--, sum
);
615 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
616 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
617 sum
= __SMLADX(*__SIMD32(px
)++, *__SIMD32(py
)--, sum
);
619 /* Decrement the loop counter */
623 /* For the next MAC operations, the pointer py is used without SIMD
624 * So, py is incremented by 1 */
627 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
628 ** No loop unrolling is used. */
629 k
= blockSize3
% 0x4u
;
633 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
634 sum
= __SMLAD(*px
++, *py
--, sum
);
636 /* Decrement the loop counter */
640 /* Store the result in the accumulator in the destination buffer. */
641 *pOut
++ = (q15_t
) (sum
>> 15);
643 /* Update the inputA and inputB pointers for next MAC calculation */
647 /* Decrement the loop counter */
653 /* The second part of the stage starts here */
654 /* SIMD is not used for the next MAC operations,
655 * so pointer py is updated to read only one sample at a time */
658 while(blockSize3
> 0u)
660 /* Accumulator is made zero for every iteration */
663 /* Apply loop unrolling and compute 4 MACs simultaneously. */
668 /* Perform the multiply-accumulates */
669 /* sum += x[srcALen-1] * y[srcBLen-1] */
670 sum
= __SMLAD(*px
++, *py
--, sum
);
672 /* Decrement the loop counter */
676 /* Store the result in the accumulator in the destination buffer. */
677 *pOut
++ = (q15_t
) (sum
>> 15);
679 /* Update the inputA and inputB pointers for next MAC calculation */
683 /* Decrement the loop counter */
688 q15_t
*pIn1
; /* inputA pointer */
689 q15_t
*pIn2
; /* inputB pointer */
690 q15_t
*pOut
= pDst
; /* output pointer */
691 q31_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulator */
692 q15_t
*px
; /* Intermediate inputA pointer */
693 q15_t
*py
; /* Intermediate inputB pointer */
694 q15_t
*pSrc1
, *pSrc2
; /* Intermediate pointers */
695 q31_t x0
, x1
, x2
, x3
, c0
; /* Temporary variables to hold state and coefficient values */
696 uint32_t blockSize1
, blockSize2
, blockSize3
, j
, k
, count
, blkCnt
; /* loop counter */
699 /* The algorithm implementation is based on the lengths of the inputs. */
700 /* srcB is always made to slide across srcA. */
701 /* So srcBLen is always considered as shorter or equal to srcALen */
702 if(srcALen
>= srcBLen
)
704 /* Initialization of inputA pointer */
707 /* Initialization of inputB pointer */
712 /* Initialization of inputA pointer */
715 /* Initialization of inputB pointer */
718 /* srcBLen is always considered as shorter or equal to srcALen */
724 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
725 /* The function is internally
726 * divided into three stages according to the number of multiplications that have to
727 * take place between inputA samples and inputB samples. In the first stage of the
728 * algorithm, the multiplications increase by one for every iteration.
729 * In the second stage of the algorithm, srcBLen number of multiplications are done.
730 * In the third stage of the algorithm, the multiplications decrease by one
731 * for every iteration. */
733 /* The algorithm is implemented in three stages.
734 The loop counters of each stage are initialized here. */
735 blockSize1
= srcBLen
- 1u;
736 blockSize2
= srcALen
- (srcBLen
- 1u);
737 blockSize3
= blockSize1
;
739 /* --------------------------
740 * Initializations of stage1
741 * -------------------------*/
744 * sum = x[0] * y[1] + x[1] * y[0]
746 * sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
749 /* In this stage the MAC operations are increased by 1 for every iteration.
750 The count variable holds the number of MAC operations performed */
753 /* Working pointer of inputA */
756 /* Working pointer of inputB */
760 /* ------------------------
762 * ----------------------*/
764 /* For loop unrolling by 4, this stage is divided into two. */
765 /* First part of this stage computes the MAC operations less than 4 */
766 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
768 /* The first part of the stage starts here */
769 while((count
< 4u) && (blockSize1
> 0u))
771 /* Accumulator is made zero for every iteration */
774 /* Loop over number of MAC operations between
775 * inputA samples and inputB samples */
780 /* Perform the multiply-accumulates */
781 sum
+= ((q31_t
) * px
++ * *py
--);
783 /* Decrement the loop counter */
787 /* Store the result in the accumulator in the destination buffer. */
788 *pOut
++ = (q15_t
) (sum
>> 15);
790 /* Update the inputA and inputB pointers for next MAC calculation */
794 /* Increment the MAC count */
797 /* Decrement the loop counter */
801 /* The second part of the stage starts here */
802 /* The internal loop, over count, is unrolled by 4 */
803 /* To read the last two inputB samples using SIMD
804 * (the y[srcBLen] and y[srcBLen-1] coefficients), py is decremented by 1 */
807 while(blockSize1
> 0u)
809 /* Accumulator is made zero for every iteration */
812 /* Apply loop unrolling and compute 4 MACs simultaneously. */
815 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
816 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
821 /* Perform the multiply-accumulates */
822 sum
+= ((q31_t
) * px
++ * *py
--);
823 sum
+= ((q31_t
) * px
++ * *py
--);
824 sum
+= ((q31_t
) * px
++ * *py
--);
825 sum
+= ((q31_t
) * px
++ * *py
--);
827 /* Decrement the loop counter */
831 /* If the count is not a multiple of 4, compute any remaining MACs here.
832 ** No loop unrolling is used. */
837 /* Perform the multiply-accumulates */
838 sum
+= ((q31_t
) * px
++ * *py
--);
840 /* Decrement the loop counter */
844 /* Store the result in the accumulator in the destination buffer. */
845 *pOut
++ = (q15_t
) (sum
>> 15);
847 /* Update the inputA and inputB pointers for next MAC calculation */
848 py
= pIn2
+ (count
- 1u);
851 /* Increment the MAC count */
854 /* Decrement the loop counter */
858 /* --------------------------
859 * Initializations of stage2
860 * ------------------------*/
862 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
863 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
865 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
868 /* Working pointer of inputA */
871 /* Working pointer of inputB */
872 pSrc2
= pIn2
+ (srcBLen
- 1u);
875 /* count is the index by which the pointer pIn1 is to be incremented */
879 /* --------------------
881 * -------------------*/
883 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
884 * So, to loop unroll over blockSize2,
885 * srcBLen should be greater than or equal to 4 */
888 /* Loop unroll over blockSize2, by 4 */
889 blkCnt
= blockSize2
>> 2u;
895 /* Set all accumulators to zero */
901 /* read x[0], x[1] samples */
905 #ifndef ARM_MATH_BIG_ENDIAN
907 x0
= __PKHBT(a
, b
, 16);
909 x1
= __PKHBT(b
, a
, 16);
913 x0
= __PKHBT(b
, a
, 16);
915 x1
= __PKHBT(a
, b
, 16);
917 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
919 /* Apply loop unrolling and compute 4 MACs simultaneously. */
922 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
923 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
926 /* Read the last two inputB samples using SIMD:
927 * y[srcBLen - 1] and y[srcBLen - 2] */
932 #ifndef ARM_MATH_BIG_ENDIAN
934 c0
= __PKHBT(a
, b
, 16);
938 c0
= __PKHBT(b
, a
, 16);;
940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
942 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
943 acc0
= __SMLADX(x0
, c0
, acc0
);
945 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
946 acc1
= __SMLADX(x1
, c0
, acc1
);
951 #ifndef ARM_MATH_BIG_ENDIAN
953 x2
= __PKHBT(a
, b
, 16);
955 x3
= __PKHBT(b
, a
, 16);
959 x2
= __PKHBT(b
, a
, 16);
961 x3
= __PKHBT(a
, b
, 16);
963 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
965 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
966 acc2
= __SMLADX(x2
, c0
, acc2
);
968 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
969 acc3
= __SMLADX(x3
, c0
, acc3
);
971 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
976 #ifndef ARM_MATH_BIG_ENDIAN
978 c0
= __PKHBT(a
, b
, 16);
982 c0
= __PKHBT(b
, a
, 16);;
984 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
986 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
987 acc0
= __SMLADX(x2
, c0
, acc0
);
989 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
990 acc1
= __SMLADX(x3
, c0
, acc1
);
992 /* Read x[4], x[5], x[6] */
996 #ifndef ARM_MATH_BIG_ENDIAN
998 x0
= __PKHBT(a
, b
, 16);
1000 x1
= __PKHBT(b
, a
, 16);
1004 x0
= __PKHBT(b
, a
, 16);
1006 x1
= __PKHBT(a
, b
, 16);
1008 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1012 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
1013 acc2
= __SMLADX(x0
, c0
, acc2
);
1015 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
1016 acc3
= __SMLADX(x1
, c0
, acc3
);
1020 /* For the next MAC operations, SIMD is not used,
1021 * so the 16-bit pointer of inputB, py, is updated */
1023 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1024 ** No loop unrolling is used. */
1029 /* Read y[srcBLen - 5] */
1032 #ifdef ARM_MATH_BIG_ENDIAN
1038 c0
= c0
& 0x0000FFFF;
1040 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1047 #ifndef ARM_MATH_BIG_ENDIAN
1049 x3
= __PKHBT(a
, b
, 16);
1053 x3
= __PKHBT(b
, a
, 16);;
1055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1058 /* Perform the multiply-accumulates */
1059 acc0
= __SMLAD(x0
, c0
, acc0
);
1060 acc1
= __SMLAD(x1
, c0
, acc1
);
1061 acc2
= __SMLADX(x1
, c0
, acc2
);
1062 acc3
= __SMLADX(x3
, c0
, acc3
);
1067 /* Read y[srcBLen - 5], y[srcBLen - 6] */
1071 #ifndef ARM_MATH_BIG_ENDIAN
1073 c0
= __PKHBT(a
, b
, 16);
1077 c0
= __PKHBT(b
, a
, 16);;
1079 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1081 /* Read x[7], x[8], x[9] */
1085 #ifndef ARM_MATH_BIG_ENDIAN
1087 x3
= __PKHBT(a
, b
, 16);
1089 x2
= __PKHBT(b
, a
, 16);
1093 x3
= __PKHBT(b
, a
, 16);
1095 x2
= __PKHBT(a
, b
, 16);
1097 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1100 /* Perform the multiply-accumulates */
1101 acc0
= __SMLADX(x0
, c0
, acc0
);
1102 acc1
= __SMLADX(x1
, c0
, acc1
);
1103 acc2
= __SMLADX(x3
, c0
, acc2
);
1104 acc3
= __SMLADX(x2
, c0
, acc3
);
1109 /* Read y[srcBLen - 5], y[srcBLen - 6] */
1113 #ifndef ARM_MATH_BIG_ENDIAN
1115 c0
= __PKHBT(a
, b
, 16);
1119 c0
= __PKHBT(b
, a
, 16);;
1121 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1123 /* Read x[7], x[8], x[9] */
1127 #ifndef ARM_MATH_BIG_ENDIAN
1129 x3
= __PKHBT(a
, b
, 16);
1131 x2
= __PKHBT(b
, a
, 16);
1135 x3
= __PKHBT(b
, a
, 16);
1137 x2
= __PKHBT(a
, b
, 16);
1139 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1141 /* Perform the multiply-accumulates */
1142 acc0
= __SMLADX(x0
, c0
, acc0
);
1143 acc1
= __SMLADX(x1
, c0
, acc1
);
1144 acc2
= __SMLADX(x3
, c0
, acc2
);
1145 acc3
= __SMLADX(x2
, c0
, acc3
);
1147 /* Read y[srcBLen - 7] */
1149 #ifdef ARM_MATH_BIG_ENDIAN
1154 c0
= c0
& 0x0000FFFF;
1155 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1161 #ifndef ARM_MATH_BIG_ENDIAN
1163 x3
= __PKHBT(a
, b
, 16);
1167 x3
= __PKHBT(b
, a
, 16);;
1169 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1173 /* Perform the multiply-accumulates */
1174 acc0
= __SMLADX(x1
, c0
, acc0
);
1175 acc1
= __SMLAD(x2
, c0
, acc1
);
1176 acc2
= __SMLADX(x2
, c0
, acc2
);
1177 acc3
= __SMLADX(x3
, c0
, acc3
);
1180 /* Store the results in the accumulators in the destination buffer. */
1181 *pOut
++ = (q15_t
)(acc0
>> 15);
1182 *pOut
++ = (q15_t
)(acc1
>> 15);
1183 *pOut
++ = (q15_t
)(acc2
>> 15);
1184 *pOut
++ = (q15_t
)(acc3
>> 15);
1186 /* Increment the pointer pIn1 index, count by 4 */
1189 /* Update the inputA and inputB pointers for next MAC calculation */
1193 /* Decrement the loop counter */
1197 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1198 ** No loop unrolling is used. */
1199 blkCnt
= blockSize2
% 0x4u
;
1203 /* Accumulator is made zero for every iteration */
1206 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1209 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1210 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1213 /* Perform the multiply-accumulates */
1214 sum
+= ((q31_t
) * px
++ * *py
--);
1215 sum
+= ((q31_t
) * px
++ * *py
--);
1216 sum
+= ((q31_t
) * px
++ * *py
--);
1217 sum
+= ((q31_t
) * px
++ * *py
--);
1219 /* Decrement the loop counter */
1223 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1224 ** No loop unrolling is used. */
1229 /* Perform the multiply-accumulates */
1230 sum
+= ((q31_t
) * px
++ * *py
--);
1232 /* Decrement the loop counter */
1236 /* Store the result in the accumulator in the destination buffer. */
1237 *pOut
++ = (q15_t
) (sum
>> 15);
1239 /* Increment the pointer pIn1 index, count by 1 */
1242 /* Update the inputA and inputB pointers for next MAC calculation */
1246 /* Decrement the loop counter */
1252 /* If srcBLen is less than 4,
1253 * the blockSize2 loop cannot be unrolled by 4 */
1254 blkCnt
= blockSize2
;
1258 /* Accumulator is made zero for every iteration */
1261 /* srcBLen number of MACS should be performed */
1266 /* Perform the multiply-accumulate */
1267 sum
+= ((q31_t
) * px
++ * *py
--);
1269 /* Decrement the loop counter */
1273 /* Store the result in the accumulator in the destination buffer. */
1274 *pOut
++ = (q15_t
) (sum
>> 15);
1276 /* Increment the MAC count */
1279 /* Update the inputA and inputB pointers for next MAC calculation */
1283 /* Decrement the loop counter */
1289 /* --------------------------
1290 * Initializations of stage3
1291 * -------------------------*/
1293 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
1294 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
1296 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
1297 * sum += x[srcALen-1] * y[srcBLen-1]
1300 /* In this stage the MAC operations are decreased by 1 for every iteration.
1301 The blockSize3 variable holds the number of MAC operations performed */
1303 /* Working pointer of inputA */
1304 pSrc1
= (pIn1
+ srcALen
) - (srcBLen
- 1u);
1307 /* Working pointer of inputB */
1308 pSrc2
= pIn2
+ (srcBLen
- 1u);
1312 /* -------------------
1314 * ------------------*/
1316 /* For loop unrolling by 4, this stage is divided into two. */
1317 /* First part of this stage computes the MAC operations greater than 4 */
1318 /* Second part of this stage computes the MAC operations less than or equal to 4 */
1320 /* The first part of the stage starts here */
1321 j
= blockSize3
>> 2u;
1323 while((j
> 0u) && (blockSize3
> 0u))
1325 /* Accumulator is made zero for every iteration */
1328 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1329 k
= blockSize3
>> 2u;
1331 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1332 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1337 sum
+= ((q31_t
) * px
++ * *py
--);
1338 sum
+= ((q31_t
) * px
++ * *py
--);
1339 sum
+= ((q31_t
) * px
++ * *py
--);
1340 sum
+= ((q31_t
) * px
++ * *py
--);
1341 /* Decrement the loop counter */
1345 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
1346 ** No loop unrolling is used. */
1347 k
= blockSize3
% 0x4u
;
1351 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
1352 sum
+= ((q31_t
) * px
++ * *py
--);
1354 /* Decrement the loop counter */
1358 /* Store the result in the accumulator in the destination buffer. */
1359 *pOut
++ = (q15_t
) (sum
>> 15);
1361 /* Update the inputA and inputB pointers for next MAC calculation */
1365 /* Decrement the loop counter */
1371 /* The second part of the stage starts here */
1372 /* SIMD is not used for the next MAC operations,
1373 * so pointer py is updated to read only one sample at a time */
1376 while(blockSize3
> 0u)
1378 /* Accumulator is made zero for every iteration */
1381 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1386 /* Perform the multiply-accumulates */
1387 /* sum += x[srcALen-1] * y[srcBLen-1] */
1388 sum
+= ((q31_t
) * px
++ * *py
--);
1390 /* Decrement the loop counter */
1394 /* Store the result in the accumulator in the destination buffer. */
1395 *pOut
++ = (q15_t
) (sum
>> 15);
1397 /* Update the inputA and inputB pointers for next MAC calculation */
1401 /* Decrement the loop counter */
1405 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1409 * @} end of Conv group