]>
git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_fast_q15.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_fast_q15.c
10 * Description: Fast Q15 Correlation.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
53 * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
61 * <b>Scaling and Overflow Behavior:</b>
64 * This fast version uses a 32-bit accumulator with 2.30 format.
65 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
66 * There is no saturation on intermediate additions.
67 * Thus, if the accumulator overflows it wraps around and distorts the result.
68 * The input signals should be scaled down to avoid intermediate overflows.
69 * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
70 * maximum of min(srcALen, srcBLen) number of additions is carried internally.
71 * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
74 * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
77 void arm_correlate_fast_q15 (
84 #ifndef UNALIGNED_SUPPORT_DISABLE
86 q15_t
* pIn1
; /* inputA pointer */
87 q15_t
* pIn2
; /* inputB pointer */
88 q15_t
* pOut
= pDst
; /* output pointer */
89 q31_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulators */
90 q15_t
* px
; /* Intermediate inputA pointer */
91 q15_t
* py
; /* Intermediate inputB pointer */
92 q15_t
* pSrc1
; /* Intermediate pointers */
93 q31_t x0
, x1
, x2
, x3
, c0
; /* temporary variables for holding input and coefficient values */
94 uint32_t j
, k
= 0u , count
, blkCnt
, outBlockSize
, blockSize1
, blockSize2
, blockSize3
; /* loop counter */
95 int32_t inc
= 1 ; /* Destination address modifier */
98 /* The algorithm implementation is based on the lengths of the inputs. */
99 /* srcB is always made to slide across srcA. */
100 /* So srcBLen is always considered as shorter or equal to srcALen */
101 /* But CORR(x, y) is reverse of CORR(y, x) */
102 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
103 /* and the destination pointer modifier, inc is set to -1 */
104 /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs the same length */
105 /* But to improve the performance,
106 * we include zeroes in the output instead of zero padding either of the inputs */
107 /* If srcALen > srcBLen,
108 * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
109 /* If srcALen < srcBLen,
110 * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
111 if ( srcALen
>= srcBLen
)
113 /* Initialization of inputA pointer */
116 /* Initialization of inputB pointer */
119 /* Number of output samples is calculated */
120 outBlockSize
= ( 2u * srcALen
) - 1u ;
122 /* When srcALen > srcBLen, zero padding is done to srcB
123 * to make their lengths equal.
124 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
125 * number of output samples are made zero */
126 j
= outBlockSize
- ( srcALen
+ ( srcBLen
- 1u ));
128 /* Updating the pointer position to non zero value */
134 /* Initialization of inputA pointer */
137 /* Initialization of inputB pointer */
140 /* srcBLen is always considered as shorter or equal to srcALen */
145 /* CORR(x, y) = Reverse order(CORR(y, x)) */
146 /* Hence set the destination pointer to point to the last output sample */
147 pOut
= pDst
+ (( srcALen
+ srcBLen
) - 2u );
149 /* Destination address modifier is set to -1 */
154 /* The function is internally
155 * divided into three parts according to the number of multiplications that have to
156 * take place between inputA samples and inputB samples. In the first part of the
157 * algorithm, the multiplications increase by one for every iteration.
158 * In the second part of the algorithm, srcBLen number of multiplications are done.
159 * In the third part of the algorithm, the multiplications decrease by one
160 * for every iteration.*/
161 /* The algorithm is implemented in three stages.
162 * The loop counters of each stage are initialized here. */
163 blockSize1
= srcBLen
- 1u ;
164 blockSize2
= srcALen
- ( srcBLen
- 1u );
165 blockSize3
= blockSize1
;
167 /* --------------------------
168 * Initializations of stage1
169 * -------------------------*/
171 /* sum = x[0] * y[srcBlen - 1]
172 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
174 * sum = x[0] * y[1] + x[1] * y[2] +...+ x[srcBLen - 2] * y[srcBLen - 1]
177 /* In this stage the MAC operations are increased by 1 for every iteration.
178 The count variable holds the number of MAC operations performed */
181 /* Working pointer of inputA */
184 /* Working pointer of inputB */
185 pSrc1
= pIn2
+ ( srcBLen
- 1u );
188 /* ------------------------
190 * ----------------------*/
192 /* The first loop starts here */
193 while ( blockSize1
> 0u )
195 /* Accumulator is made zero for every iteration */
198 /* Apply loop unrolling and compute 4 MACs simultaneously. */
201 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
202 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
205 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
206 sum
= __SMLAD (* __SIMD32 ( px
)++, * __SIMD32 ( py
)++, sum
);
207 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
208 sum
= __SMLAD (* __SIMD32 ( px
)++, * __SIMD32 ( py
)++, sum
);
210 /* Decrement the loop counter */
214 /* If the count is not a multiple of 4, compute any remaining MACs here.
215 ** No loop unrolling is used. */
220 /* Perform the multiply-accumulates */
221 /* x[0] * y[srcBLen - 1] */
222 sum
= __SMLAD (* px
++, * py
++, sum
);
224 /* Decrement the loop counter */
228 /* Store the result in the accumulator in the destination buffer. */
229 * pOut
= ( q15_t
) ( sum
>> 15 );
230 /* Destination pointer is updated according to the address modifier, inc */
233 /* Update the inputA and inputB pointers for next MAC calculation */
237 /* Increment the MAC count */
240 /* Decrement the loop counter */
244 /* --------------------------
245 * Initializations of stage2
246 * ------------------------*/
248 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
249 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
251 * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
254 /* Working pointer of inputA */
257 /* Working pointer of inputB */
260 /* count is the index by which the pointer pIn1 is to be incremented */
263 /* -------------------
265 * ------------------*/
267 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
268 * So, to loop unroll over blockSize2,
269 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
272 /* Loop unroll over blockSize2, by 4 */
273 blkCnt
= blockSize2
>> 2u ;
277 /* Set all accumulators to zero */
283 /* read x[0], x[1] samples */
285 /* read x[1], x[2] samples */
286 x1
= _SIMD32_OFFSET ( px
+ 1 );
289 /* Apply loop unrolling and compute 4 MACs simultaneously. */
292 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
293 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
296 /* Read the first two inputB samples using SIMD:
298 c0
= * __SIMD32 ( py
)++;
300 /* acc0 += x[0] * y[0] + x[1] * y[1] */
301 acc0
= __SMLAD ( x0
, c0
, acc0
);
303 /* acc1 += x[1] * y[0] + x[2] * y[1] */
304 acc1
= __SMLAD ( x1
, c0
, acc1
);
306 /* Read x[2], x[3] */
309 /* Read x[3], x[4] */
310 x3
= _SIMD32_OFFSET ( px
+ 1 );
312 /* acc2 += x[2] * y[0] + x[3] * y[1] */
313 acc2
= __SMLAD ( x2
, c0
, acc2
);
315 /* acc3 += x[3] * y[0] + x[4] * y[1] */
316 acc3
= __SMLAD ( x3
, c0
, acc3
);
318 /* Read y[2] and y[3] */
319 c0
= * __SIMD32 ( py
)++;
321 /* acc0 += x[2] * y[2] + x[3] * y[3] */
322 acc0
= __SMLAD ( x2
, c0
, acc0
);
324 /* acc1 += x[3] * y[2] + x[4] * y[3] */
325 acc1
= __SMLAD ( x3
, c0
, acc1
);
327 /* Read x[4], x[5] */
328 x0
= _SIMD32_OFFSET ( px
+ 2 );
330 /* Read x[5], x[6] */
331 x1
= _SIMD32_OFFSET ( px
+ 3 );
334 /* acc2 += x[4] * y[2] + x[5] * y[3] */
335 acc2
= __SMLAD ( x0
, c0
, acc2
);
337 /* acc3 += x[5] * y[2] + x[6] * y[3] */
338 acc3
= __SMLAD ( x1
, c0
, acc3
);
342 /* For the next MAC operations, SIMD is not used,
343 * so the 16-bit pointer of inputB, py, is updated */
345 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
346 ** No loop unrolling is used. */
353 #ifdef ARM_MATH_BIG_ENDIAN
359 c0
= c0
& 0x0000FFFF ;
361 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
367 /* Perform the multiply-accumulates */
368 acc0
= __SMLAD ( x0
, c0
, acc0
);
369 acc1
= __SMLAD ( x1
, c0
, acc1
);
370 acc2
= __SMLADX ( x1
, c0
, acc2
);
371 acc3
= __SMLADX ( x3
, c0
, acc3
);
376 /* Read y[4], y[5] */
379 /* Read x[7], x[8] */
383 x2
= _SIMD32_OFFSET ( px
+ 1 );
386 /* Perform the multiply-accumulates */
387 acc0
= __SMLAD ( x0
, c0
, acc0
);
388 acc1
= __SMLAD ( x1
, c0
, acc1
);
389 acc2
= __SMLAD ( x3
, c0
, acc2
);
390 acc3
= __SMLAD ( x2
, c0
, acc3
);
395 /* Read y[4], y[5] */
396 c0
= * __SIMD32 ( py
)++;
398 /* Read x[7], x[8] */
402 x2
= _SIMD32_OFFSET ( px
+ 1 );
404 /* Perform the multiply-accumulates */
405 acc0
= __SMLAD ( x0
, c0
, acc0
);
406 acc1
= __SMLAD ( x1
, c0
, acc1
);
407 acc2
= __SMLAD ( x3
, c0
, acc2
);
408 acc3
= __SMLAD ( x2
, c0
, acc3
);
412 #ifdef ARM_MATH_BIG_ENDIAN
417 c0
= c0
& 0x0000FFFF ;
418 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
421 x3
= _SIMD32_OFFSET ( px
+ 2 );
424 /* Perform the multiply-accumulates */
425 acc0
= __SMLADX ( x1
, c0
, acc0
);
426 acc1
= __SMLAD ( x2
, c0
, acc1
);
427 acc2
= __SMLADX ( x2
, c0
, acc2
);
428 acc3
= __SMLADX ( x3
, c0
, acc3
);
431 /* Store the result in the accumulator in the destination buffer. */
432 * pOut
= ( q15_t
) ( acc0
>> 15 );
433 /* Destination pointer is updated according to the address modifier, inc */
436 * pOut
= ( q15_t
) ( acc1
>> 15 );
439 * pOut
= ( q15_t
) ( acc2
>> 15 );
442 * pOut
= ( q15_t
) ( acc3
>> 15 );
445 /* Increment the pointer pIn1 index, count by 1 */
448 /* Update the inputA and inputB pointers for next MAC calculation */
453 /* Decrement the loop counter */
457 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
458 ** No loop unrolling is used. */
459 blkCnt
= blockSize2
% 0x4 u
;
463 /* Accumulator is made zero for every iteration */
466 /* Apply loop unrolling and compute 4 MACs simultaneously. */
469 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
470 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
473 /* Perform the multiply-accumulates */
474 sum
+= (( q31_t
) * px
++ * * py
++);
475 sum
+= (( q31_t
) * px
++ * * py
++);
476 sum
+= (( q31_t
) * px
++ * * py
++);
477 sum
+= (( q31_t
) * px
++ * * py
++);
479 /* Decrement the loop counter */
483 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
484 ** No loop unrolling is used. */
489 /* Perform the multiply-accumulates */
490 sum
+= (( q31_t
) * px
++ * * py
++);
492 /* Decrement the loop counter */
496 /* Store the result in the accumulator in the destination buffer. */
497 * pOut
= ( q15_t
) ( sum
>> 15 );
498 /* Destination pointer is updated according to the address modifier, inc */
501 /* Increment the pointer pIn1 index, count by 1 */
504 /* Update the inputA and inputB pointers for next MAC calculation */
508 /* Decrement the loop counter */
514 /* If srcBLen is less than 4,
515 * the blockSize2 loop cannot be unrolled by 4 */
520 /* Accumulator is made zero for every iteration */
523 /* Loop over srcBLen */
528 /* Perform the multiply-accumulate */
529 sum
+= (( q31_t
) * px
++ * * py
++);
531 /* Decrement the loop counter */
535 /* Store the result in the accumulator in the destination buffer. */
536 * pOut
= ( q15_t
) ( sum
>> 15 );
537 /* Destination pointer is updated according to the address modifier, inc */
540 /* Increment the MAC count */
543 /* Update the inputA and inputB pointers for next MAC calculation */
547 /* Decrement the loop counter */
552 /* --------------------------
553 * Initializations of stage3
554 * -------------------------*/
556 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
557 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
559 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
560 * sum += x[srcALen-1] * y[0]
563 /* In this stage the MAC operations are decreased by 1 for every iteration.
564 The count variable holds the number of MAC operations performed */
565 count
= srcBLen
- 1u ;
567 /* Working pointer of inputA */
568 pSrc1
= ( pIn1
+ srcALen
) - ( srcBLen
- 1u );
571 /* Working pointer of inputB */
574 /* -------------------
576 * ------------------*/
578 while ( blockSize3
> 0u )
580 /* Accumulator is made zero for every iteration */
583 /* Apply loop unrolling and compute 4 MACs simultaneously. */
586 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
587 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
590 /* Perform the multiply-accumulates */
591 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
592 sum
= __SMLAD (* __SIMD32 ( px
)++, * __SIMD32 ( py
)++, sum
);
593 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
594 sum
= __SMLAD (* __SIMD32 ( px
)++, * __SIMD32 ( py
)++, sum
);
596 /* Decrement the loop counter */
600 /* If the count is not a multiple of 4, compute any remaining MACs here.
601 ** No loop unrolling is used. */
606 /* Perform the multiply-accumulates */
607 sum
= __SMLAD (* px
++, * py
++, sum
);
609 /* Decrement the loop counter */
613 /* Store the result in the accumulator in the destination buffer. */
614 * pOut
= ( q15_t
) ( sum
>> 15 );
615 /* Destination pointer is updated according to the address modifier, inc */
618 /* Update the inputA and inputB pointers for next MAC calculation */
622 /* Decrement the MAC count */
625 /* Decrement the loop counter */
631 q15_t
* pIn1
; /* inputA pointer */
632 q15_t
* pIn2
; /* inputB pointer */
633 q15_t
* pOut
= pDst
; /* output pointer */
634 q31_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulators */
635 q15_t
* px
; /* Intermediate inputA pointer */
636 q15_t
* py
; /* Intermediate inputB pointer */
637 q15_t
* pSrc1
; /* Intermediate pointers */
638 q31_t x0
, x1
, x2
, x3
, c0
; /* temporary variables for holding input and coefficient values */
639 uint32_t j
, k
= 0u , count
, blkCnt
, outBlockSize
, blockSize1
, blockSize2
, blockSize3
; /* loop counter */
640 int32_t inc
= 1 ; /* Destination address modifier */
644 /* The algorithm implementation is based on the lengths of the inputs. */
645 /* srcB is always made to slide across srcA. */
646 /* So srcBLen is always considered as shorter or equal to srcALen */
647 /* But CORR(x, y) is reverse of CORR(y, x) */
648 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
649 /* and the destination pointer modifier, inc is set to -1 */
650 /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs the same length */
651 /* But to improve the performance,
652 * we include zeroes in the output instead of zero padding either of the inputs */
653 /* If srcALen > srcBLen,
654 * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
655 /* If srcALen < srcBLen,
656 * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
657 if ( srcALen
>= srcBLen
)
659 /* Initialization of inputA pointer */
662 /* Initialization of inputB pointer */
665 /* Number of output samples is calculated */
666 outBlockSize
= ( 2u * srcALen
) - 1u ;
668 /* When srcALen > srcBLen, zero padding is done to srcB
669 * to make their lengths equal.
670 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
671 * number of output samples are made zero */
672 j
= outBlockSize
- ( srcALen
+ ( srcBLen
- 1u ));
674 /* Updating the pointer position to non zero value */
680 /* Initialization of inputA pointer */
683 /* Initialization of inputB pointer */
686 /* srcBLen is always considered as shorter or equal to srcALen */
691 /* CORR(x, y) = Reverse order(CORR(y, x)) */
692 /* Hence set the destination pointer to point to the last output sample */
693 pOut
= pDst
+ (( srcALen
+ srcBLen
) - 2u );
695 /* Destination address modifier is set to -1 */
700 /* The function is internally
701 * divided into three parts according to the number of multiplications that have to
702 * take place between inputA samples and inputB samples. In the first part of the
703 * algorithm, the multiplications increase by one for every iteration.
704 * In the second part of the algorithm, srcBLen number of multiplications are done.
705 * In the third part of the algorithm, the multiplications decrease by one
706 * for every iteration.*/
707 /* The algorithm is implemented in three stages.
708 * The loop counters of each stage are initialized here. */
709 blockSize1
= srcBLen
- 1u ;
710 blockSize2
= srcALen
- ( srcBLen
- 1u );
711 blockSize3
= blockSize1
;
713 /* --------------------------
714 * Initializations of stage1
715 * -------------------------*/
717 /* sum = x[0] * y[srcBlen - 1]
718 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
720 * sum = x[0] * y[1] + x[1] * y[2] +...+ x[srcBLen - 2] * y[srcBLen - 1]
723 /* In this stage the MAC operations are increased by 1 for every iteration.
724 The count variable holds the number of MAC operations performed */
727 /* Working pointer of inputA */
730 /* Working pointer of inputB */
731 pSrc1
= pIn2
+ ( srcBLen
- 1u );
734 /* ------------------------
736 * ----------------------*/
738 /* The first loop starts here */
739 while ( blockSize1
> 0u )
741 /* Accumulator is made zero for every iteration */
744 /* Apply loop unrolling and compute 4 MACs simultaneously. */
747 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
748 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
751 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
752 sum
+= (( q31_t
) * px
++ * * py
++);
753 sum
+= (( q31_t
) * px
++ * * py
++);
754 sum
+= (( q31_t
) * px
++ * * py
++);
755 sum
+= (( q31_t
) * px
++ * * py
++);
757 /* Decrement the loop counter */
761 /* If the count is not a multiple of 4, compute any remaining MACs here.
762 ** No loop unrolling is used. */
767 /* Perform the multiply-accumulates */
768 /* x[0] * y[srcBLen - 1] */
769 sum
+= (( q31_t
) * px
++ * * py
++);
771 /* Decrement the loop counter */
775 /* Store the result in the accumulator in the destination buffer. */
776 * pOut
= ( q15_t
) ( sum
>> 15 );
777 /* Destination pointer is updated according to the address modifier, inc */
780 /* Update the inputA and inputB pointers for next MAC calculation */
784 /* Increment the MAC count */
787 /* Decrement the loop counter */
791 /* --------------------------
792 * Initializations of stage2
793 * ------------------------*/
795 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
796 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
798 * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
801 /* Working pointer of inputA */
804 /* Working pointer of inputB */
807 /* count is the index by which the pointer pIn1 is to be incremented */
810 /* -------------------
812 * ------------------*/
814 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
815 * So, to loop unroll over blockSize2,
816 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
819 /* Loop unroll over blockSize2, by 4 */
820 blkCnt
= blockSize2
>> 2u ;
824 /* Set all accumulators to zero */
830 /* read x[0], x[1], x[2] samples */
834 #ifndef ARM_MATH_BIG_ENDIAN
836 x0
= __PKHBT ( a
, b
, 16 );
838 x1
= __PKHBT ( b
, a
, 16 );
842 x0
= __PKHBT ( b
, a
, 16 );
844 x1
= __PKHBT ( a
, b
, 16 );
846 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
850 /* Apply loop unrolling and compute 4 MACs simultaneously. */
853 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
854 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
857 /* Read the first two inputB samples using SIMD:
862 #ifndef ARM_MATH_BIG_ENDIAN
864 c0
= __PKHBT ( a
, b
, 16 );
868 c0
= __PKHBT ( b
, a
, 16 );
870 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
872 /* acc0 += x[0] * y[0] + x[1] * y[1] */
873 acc0
= __SMLAD ( x0
, c0
, acc0
);
875 /* acc1 += x[1] * y[0] + x[2] * y[1] */
876 acc1
= __SMLAD ( x1
, c0
, acc1
);
878 /* Read x[2], x[3], x[4] */
882 #ifndef ARM_MATH_BIG_ENDIAN
884 x2
= __PKHBT ( a
, b
, 16 );
886 x3
= __PKHBT ( b
, a
, 16 );
890 x2
= __PKHBT ( b
, a
, 16 );
892 x3
= __PKHBT ( a
, b
, 16 );
894 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
896 /* acc2 += x[2] * y[0] + x[3] * y[1] */
897 acc2
= __SMLAD ( x2
, c0
, acc2
);
899 /* acc3 += x[3] * y[0] + x[4] * y[1] */
900 acc3
= __SMLAD ( x3
, c0
, acc3
);
902 /* Read y[2] and y[3] */
908 #ifndef ARM_MATH_BIG_ENDIAN
910 c0
= __PKHBT ( a
, b
, 16 );
914 c0
= __PKHBT ( b
, a
, 16 );
916 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
918 /* acc0 += x[2] * y[2] + x[3] * y[3] */
919 acc0
= __SMLAD ( x2
, c0
, acc0
);
921 /* acc1 += x[3] * y[2] + x[4] * y[3] */
922 acc1
= __SMLAD ( x3
, c0
, acc1
);
924 /* Read x[4], x[5], x[6] */
928 #ifndef ARM_MATH_BIG_ENDIAN
930 x0
= __PKHBT ( a
, b
, 16 );
932 x1
= __PKHBT ( b
, a
, 16 );
936 x0
= __PKHBT ( b
, a
, 16 );
938 x1
= __PKHBT ( a
, b
, 16 );
940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
944 /* acc2 += x[4] * y[2] + x[5] * y[3] */
945 acc2
= __SMLAD ( x0
, c0
, acc2
);
947 /* acc3 += x[5] * y[2] + x[6] * y[3] */
948 acc3
= __SMLAD ( x1
, c0
, acc3
);
952 /* For the next MAC operations, SIMD is not used,
953 * so the 16-bit pointer of inputB, py, is updated */
955 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
956 ** No loop unrolling is used. */
963 #ifdef ARM_MATH_BIG_ENDIAN
969 c0
= c0
& 0x0000FFFF ;
971 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
979 #ifndef ARM_MATH_BIG_ENDIAN
981 x3
= __PKHBT ( a
, b
, 16 );
985 x3
= __PKHBT ( b
, a
, 16 );
987 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
991 /* Perform the multiply-accumulates */
992 acc0
= __SMLAD ( x0
, c0
, acc0
);
993 acc1
= __SMLAD ( x1
, c0
, acc1
);
994 acc2
= __SMLADX ( x1
, c0
, acc2
);
995 acc3
= __SMLADX ( x3
, c0
, acc3
);
1000 /* Read y[4], y[5] */
1004 #ifndef ARM_MATH_BIG_ENDIAN
1006 c0
= __PKHBT ( a
, b
, 16 );
1010 c0
= __PKHBT ( b
, a
, 16 );
1012 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1014 /* Read x[7], x[8], x[9] */
1018 #ifndef ARM_MATH_BIG_ENDIAN
1020 x3
= __PKHBT ( a
, b
, 16 );
1022 x2
= __PKHBT ( b
, a
, 16 );
1026 x3
= __PKHBT ( b
, a
, 16 );
1028 x2
= __PKHBT ( a
, b
, 16 );
1030 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1034 /* Perform the multiply-accumulates */
1035 acc0
= __SMLAD ( x0
, c0
, acc0
);
1036 acc1
= __SMLAD ( x1
, c0
, acc1
);
1037 acc2
= __SMLAD ( x3
, c0
, acc2
);
1038 acc3
= __SMLAD ( x2
, c0
, acc3
);
1043 /* Read y[4], y[5] */
1047 #ifndef ARM_MATH_BIG_ENDIAN
1049 c0
= __PKHBT ( a
, b
, 16 );
1053 c0
= __PKHBT ( b
, a
, 16 );
1055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1059 /* Read x[7], x[8], x[9] */
1063 #ifndef ARM_MATH_BIG_ENDIAN
1065 x3
= __PKHBT ( a
, b
, 16 );
1067 x2
= __PKHBT ( b
, a
, 16 );
1071 x3
= __PKHBT ( b
, a
, 16 );
1073 x2
= __PKHBT ( a
, b
, 16 );
1075 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1077 /* Perform the multiply-accumulates */
1078 acc0
= __SMLAD ( x0
, c0
, acc0
);
1079 acc1
= __SMLAD ( x1
, c0
, acc1
);
1080 acc2
= __SMLAD ( x3
, c0
, acc2
);
1081 acc3
= __SMLAD ( x2
, c0
, acc3
);
1085 #ifdef ARM_MATH_BIG_ENDIAN
1090 c0
= c0
& 0x0000FFFF ;
1091 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1096 #ifndef ARM_MATH_BIG_ENDIAN
1098 x3
= __PKHBT ( a
, b
, 16 );
1102 x3
= __PKHBT ( b
, a
, 16 );
1104 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1108 /* Perform the multiply-accumulates */
1109 acc0
= __SMLADX ( x1
, c0
, acc0
);
1110 acc1
= __SMLAD ( x2
, c0
, acc1
);
1111 acc2
= __SMLADX ( x2
, c0
, acc2
);
1112 acc3
= __SMLADX ( x3
, c0
, acc3
);
1115 /* Store the result in the accumulator in the destination buffer. */
1116 * pOut
= ( q15_t
) ( acc0
>> 15 );
1117 /* Destination pointer is updated according to the address modifier, inc */
1120 * pOut
= ( q15_t
) ( acc1
>> 15 );
1123 * pOut
= ( q15_t
) ( acc2
>> 15 );
1126 * pOut
= ( q15_t
) ( acc3
>> 15 );
1129 /* Increment the pointer pIn1 index, count by 1 */
1132 /* Update the inputA and inputB pointers for next MAC calculation */
1137 /* Decrement the loop counter */
1141 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1142 ** No loop unrolling is used. */
1143 blkCnt
= blockSize2
% 0x4 u
;
1147 /* Accumulator is made zero for every iteration */
1150 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1153 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1154 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1157 /* Perform the multiply-accumulates */
1158 sum
+= (( q31_t
) * px
++ * * py
++);
1159 sum
+= (( q31_t
) * px
++ * * py
++);
1160 sum
+= (( q31_t
) * px
++ * * py
++);
1161 sum
+= (( q31_t
) * px
++ * * py
++);
1163 /* Decrement the loop counter */
1167 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1168 ** No loop unrolling is used. */
1173 /* Perform the multiply-accumulates */
1174 sum
+= (( q31_t
) * px
++ * * py
++);
1176 /* Decrement the loop counter */
1180 /* Store the result in the accumulator in the destination buffer. */
1181 * pOut
= ( q15_t
) ( sum
>> 15 );
1182 /* Destination pointer is updated according to the address modifier, inc */
1185 /* Increment the pointer pIn1 index, count by 1 */
1188 /* Update the inputA and inputB pointers for next MAC calculation */
1192 /* Decrement the loop counter */
1198 /* If srcBLen is less than 4,
1199 * the blockSize2 loop cannot be unrolled by 4 */
1200 blkCnt
= blockSize2
;
1204 /* Accumulator is made zero for every iteration */
1207 /* Loop over srcBLen */
1212 /* Perform the multiply-accumulate */
1213 sum
+= (( q31_t
) * px
++ * * py
++);
1215 /* Decrement the loop counter */
1219 /* Store the result in the accumulator in the destination buffer. */
1220 * pOut
= ( q15_t
) ( sum
>> 15 );
1221 /* Destination pointer is updated according to the address modifier, inc */
1224 /* Increment the MAC count */
1227 /* Update the inputA and inputB pointers for next MAC calculation */
1231 /* Decrement the loop counter */
1236 /* --------------------------
1237 * Initializations of stage3
1238 * -------------------------*/
1240 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
1241 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
1243 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
1244 * sum += x[srcALen-1] * y[0]
1247 /* In this stage the MAC operations are decreased by 1 for every iteration.
1248 The count variable holds the number of MAC operations performed */
1249 count
= srcBLen
- 1u ;
1251 /* Working pointer of inputA */
1252 pSrc1
= ( pIn1
+ srcALen
) - ( srcBLen
- 1u );
1255 /* Working pointer of inputB */
1258 /* -------------------
1260 * ------------------*/
1262 while ( blockSize3
> 0u )
1264 /* Accumulator is made zero for every iteration */
1267 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1270 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1271 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1274 /* Perform the multiply-accumulates */
1275 sum
+= (( q31_t
) * px
++ * * py
++);
1276 sum
+= (( q31_t
) * px
++ * * py
++);
1277 sum
+= (( q31_t
) * px
++ * * py
++);
1278 sum
+= (( q31_t
) * px
++ * * py
++);
1280 /* Decrement the loop counter */
1284 /* If the count is not a multiple of 4, compute any remaining MACs here.
1285 ** No loop unrolling is used. */
1290 /* Perform the multiply-accumulates */
1291 sum
+= (( q31_t
) * px
++ * * py
++);
1293 /* Decrement the loop counter */
1297 /* Store the result in the accumulator in the destination buffer. */
1298 * pOut
= ( q15_t
) ( sum
>> 15 );
1299 /* Destination pointer is updated according to the address modifier, inc */
1302 /* Update the inputA and inputB pointers for next MAC calculation */
1306 /* Decrement the MAC count */
1309 /* Decrement the loop counter */
1313 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1318 * @} end of Corr group