]>
git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_fast_q15.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_fast_q15.c
10 * Description: Fast Q15 Partial convolution.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
48 * @addtogroup PartialConv
53 * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written.
59 * @param[in] firstIndex is the first output sample to start with.
60 * @param[in] numPoints is the number of output points to be computed.
61 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
63 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
67 arm_status
arm_conv_partial_fast_q15 (
76 #ifndef UNALIGNED_SUPPORT_DISABLE
78 q15_t
* pIn1
; /* inputA pointer */
79 q15_t
* pIn2
; /* inputB pointer */
80 q15_t
* pOut
= pDst
; /* output pointer */
81 q31_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulator */
82 q15_t
* px
; /* Intermediate inputA pointer */
83 q15_t
* py
; /* Intermediate inputB pointer */
84 q15_t
* pSrc1
, * pSrc2
; /* Intermediate pointers */
85 q31_t x0
, x1
, x2
, x3
, c0
;
86 uint32_t j
, k
, count
, check
, blkCnt
;
87 int32_t blockSize1
, blockSize2
, blockSize3
; /* loop counters */
88 arm_status status
; /* status of Partial convolution */
90 /* Check for range of output samples to be calculated */
91 if (( firstIndex
+ numPoints
) > (( srcALen
+ ( srcBLen
- 1u ))))
93 /* Set status as ARM_MATH_ARGUMENT_ERROR */
94 status
= ARM_MATH_ARGUMENT_ERROR
;
99 /* The algorithm implementation is based on the lengths of the inputs. */
100 /* srcB is always made to slide across srcA. */
101 /* So srcBLen is always considered to be shorter than or equal to srcALen */
102 if ( srcALen
>= srcBLen
)
104 /* Initialization of inputA pointer */
107 /* Initialization of inputB pointer */
112 /* Initialization of inputA pointer */
115 /* Initialization of inputB pointer */
118 /* srcBLen is always considered to be shorter than or equal to srcALen */
124 /* Conditions to check which loopCounter holds
125 * the first and last indices of the output samples to be calculated. */
126 check
= firstIndex
+ numPoints
;
127 blockSize3
= (( int32_t ) check
- ( int32_t ) srcALen
);
128 blockSize3
= ( blockSize3
> 0 ) ? blockSize3
: 0 ;
129 blockSize1
= ((( int32_t ) srcBLen
- 1 ) - ( int32_t ) firstIndex
);
130 blockSize1
= ( blockSize1
> 0 ) ? (( check
> ( srcBLen
- 1u )) ? blockSize1
:
131 ( int32_t ) numPoints
) : 0 ;
132 blockSize2
= ( int32_t ) check
- (( blockSize3
+ blockSize1
) +
133 ( int32_t ) firstIndex
);
134 blockSize2
= ( blockSize2
> 0 ) ? blockSize2
: 0 ;
136 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
137 /* The function is internally
138 * divided into three stages according to the number of multiplications that have to
139 * take place between inputA samples and inputB samples. In the first stage of the
140 * algorithm, the multiplications increase by one for every iteration.
141 * In the second stage of the algorithm, srcBLen number of multiplications are done.
142 * In the third stage of the algorithm, the multiplications decrease by one
143 * for every iteration. */
145 /* Set the output pointer to point to the firstIndex
146 * of the output sample to be calculated. */
147 pOut
= pDst
+ firstIndex
;
149 /* --------------------------
150 * Initializations of stage1
151 * -------------------------*/
154 * sum = x[0] * y[1] + x[1] * y[0]
156 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
159 /* In this stage the MAC operations are increased by 1 for every iteration.
160 The count variable holds the number of MAC operations performed.
161 Since the partial convolution starts from firstIndex
162 Number of Macs to be performed is firstIndex + 1 */
163 count
= 1u + firstIndex
;
165 /* Working pointer of inputA */
168 /* Working pointer of inputB */
169 pSrc2
= pIn2
+ firstIndex
;
172 /* ------------------------
174 * ----------------------*/
176 /* For loop unrolling by 4, this stage is divided into two. */
177 /* First part of this stage computes the MAC operations less than 4 */
178 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
180 /* The first part of the stage starts here */
181 while (( count
< 4u ) && ( blockSize1
> 0 ))
183 /* Accumulator is made zero for every iteration */
186 /* Loop over number of MAC operations between
187 * inputA samples and inputB samples */
192 /* Perform the multiply-accumulates */
193 sum
= __SMLAD (* px
++, * py
--, sum
);
195 /* Decrement the loop counter */
199 /* Store the result in the accumulator in the destination buffer. */
200 * pOut
++ = ( q15_t
) ( sum
>> 15 );
202 /* Update the inputA and inputB pointers for next MAC calculation */
206 /* Increment the MAC count */
209 /* Decrement the loop counter */
213 /* The second part of the stage starts here */
214 /* The internal loop, over count, is unrolled by 4 */
215 /* To read the last two inputB samples using SIMD:
216 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
219 while ( blockSize1
> 0 )
221 /* Accumulator is made zero for every iteration */
224 /* Apply loop unrolling and compute 4 MACs simultaneously. */
227 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
228 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
231 /* Perform the multiply-accumulates */
232 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
233 sum
= __SMLADX (* __SIMD32 ( px
)++, * __SIMD32 ( py
)--, sum
);
234 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
235 sum
= __SMLADX (* __SIMD32 ( px
)++, * __SIMD32 ( py
)--, sum
);
237 /* Decrement the loop counter */
241 /* For the next MAC operations, the pointer py is used without SIMD
242 * So, py is incremented by 1 */
245 /* If the count is not a multiple of 4, compute any remaining MACs here.
246 ** No loop unrolling is used. */
251 /* Perform the multiply-accumulates */
252 sum
= __SMLAD (* px
++, * py
--, sum
);
254 /* Decrement the loop counter */
258 /* Store the result in the accumulator in the destination buffer. */
259 * pOut
++ = ( q15_t
) ( sum
>> 15 );
261 /* Update the inputA and inputB pointers for next MAC calculation */
265 /* Increment the MAC count */
268 /* Decrement the loop counter */
272 /* --------------------------
273 * Initializations of stage2
274 * ------------------------*/
276 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
277 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
279 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
282 /* Working pointer of inputA */
285 /* Working pointer of inputB */
286 pSrc2
= pIn2
+ ( srcBLen
- 1u );
289 /* count is the index by which the pointer pIn1 is to be incremented */
293 /* --------------------
295 * -------------------*/
297 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
298 * So, to loop unroll over blockSize2,
299 * srcBLen should be greater than or equal to 4 */
302 /* Loop unroll over blockSize2, by 4 */
303 blkCnt
= (( uint32_t ) blockSize2
>> 2u );
309 /* Set all accumulators to zero */
316 /* read x[0], x[1] samples */
318 /* read x[1], x[2] samples */
319 x1
= _SIMD32_OFFSET ( px
+ 1 );
323 /* Apply loop unrolling and compute 4 MACs simultaneously. */
326 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
327 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
330 /* Read the last two inputB samples using SIMD:
331 * y[srcBLen - 1] and y[srcBLen - 2] */
332 c0
= * __SIMD32 ( py
)--;
334 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
335 acc0
= __SMLADX ( x0
, c0
, acc0
);
337 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
338 acc1
= __SMLADX ( x1
, c0
, acc1
);
340 /* Read x[2], x[3] */
343 /* Read x[3], x[4] */
344 x3
= _SIMD32_OFFSET ( px
+ 1 );
346 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
347 acc2
= __SMLADX ( x2
, c0
, acc2
);
349 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
350 acc3
= __SMLADX ( x3
, c0
, acc3
);
352 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
353 c0
= * __SIMD32 ( py
)--;
355 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
356 acc0
= __SMLADX ( x2
, c0
, acc0
);
358 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
359 acc1
= __SMLADX ( x3
, c0
, acc1
);
361 /* Read x[4], x[5] */
362 x0
= _SIMD32_OFFSET ( px
+ 2 );
364 /* Read x[5], x[6] */
365 x1
= _SIMD32_OFFSET ( px
+ 3 );
368 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
369 acc2
= __SMLADX ( x0
, c0
, acc2
);
371 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
372 acc3
= __SMLADX ( x1
, c0
, acc3
);
376 /* For the next MAC operations, SIMD is not used
377 * So, the 16-bit pointer of inputB, py, is updated */
379 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
380 ** No loop unrolling is used. */
385 /* Read y[srcBLen - 5] */
387 #ifdef ARM_MATH_BIG_ENDIAN
393 c0
= c0
& 0x0000FFFF ;
395 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
401 /* Perform the multiply-accumulates */
402 acc0
= __SMLAD ( x0
, c0
, acc0
);
403 acc1
= __SMLAD ( x1
, c0
, acc1
);
404 acc2
= __SMLADX ( x1
, c0
, acc2
);
405 acc3
= __SMLADX ( x3
, c0
, acc3
);
410 /* Read y[srcBLen - 5], y[srcBLen - 6] */
411 c0
= _SIMD32_OFFSET ( py
);
413 /* Read x[7], x[8] */
417 x2
= _SIMD32_OFFSET ( px
+ 1 );
420 /* Perform the multiply-accumulates */
421 acc0
= __SMLADX ( x0
, c0
, acc0
);
422 acc1
= __SMLADX ( x1
, c0
, acc1
);
423 acc2
= __SMLADX ( x3
, c0
, acc2
);
424 acc3
= __SMLADX ( x2
, c0
, acc3
);
429 /* Read y[srcBLen - 5], y[srcBLen - 6] */
430 c0
= _SIMD32_OFFSET ( py
);
432 /* Read x[7], x[8] */
436 x2
= _SIMD32_OFFSET ( px
+ 1 );
438 /* Perform the multiply-accumulates */
439 acc0
= __SMLADX ( x0
, c0
, acc0
);
440 acc1
= __SMLADX ( x1
, c0
, acc1
);
441 acc2
= __SMLADX ( x3
, c0
, acc2
);
442 acc3
= __SMLADX ( x2
, c0
, acc3
);
445 #ifdef ARM_MATH_BIG_ENDIAN
450 c0
= c0
& 0x0000FFFF ;
451 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
454 x3
= _SIMD32_OFFSET ( px
+ 2 );
457 /* Perform the multiply-accumulates */
458 acc0
= __SMLADX ( x1
, c0
, acc0
);
459 acc1
= __SMLAD ( x2
, c0
, acc1
);
460 acc2
= __SMLADX ( x2
, c0
, acc2
);
461 acc3
= __SMLADX ( x3
, c0
, acc3
);
464 /* Store the results in the accumulators in the destination buffer. */
465 #ifndef ARM_MATH_BIG_ENDIAN
467 * __SIMD32 ( pOut
)++ = __PKHBT ( acc0
>> 15 , acc1
>> 15 , 16 );
468 * __SIMD32 ( pOut
)++ = __PKHBT ( acc2
>> 15 , acc3
>> 15 , 16 );
472 * __SIMD32 ( pOut
)++ = __PKHBT ( acc1
>> 15 , acc0
>> 15 , 16 );
473 * __SIMD32 ( pOut
)++ = __PKHBT ( acc3
>> 15 , acc2
>> 15 , 16 );
475 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
477 /* Increment the pointer pIn1 index, count by 4 */
480 /* Update the inputA and inputB pointers for next MAC calculation */
484 /* Decrement the loop counter */
488 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
489 ** No loop unrolling is used. */
490 blkCnt
= ( uint32_t ) blockSize2
% 0x4 u
;
494 /* Accumulator is made zero for every iteration */
497 /* Apply loop unrolling and compute 4 MACs simultaneously. */
500 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
501 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
504 /* Perform the multiply-accumulates */
505 sum
+= (( q31_t
) * px
++ * * py
--);
506 sum
+= (( q31_t
) * px
++ * * py
--);
507 sum
+= (( q31_t
) * px
++ * * py
--);
508 sum
+= (( q31_t
) * px
++ * * py
--);
510 /* Decrement the loop counter */
514 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
515 ** No loop unrolling is used. */
520 /* Perform the multiply-accumulates */
521 sum
+= (( q31_t
) * px
++ * * py
--);
523 /* Decrement the loop counter */
527 /* Store the result in the accumulator in the destination buffer. */
528 * pOut
++ = ( q15_t
) ( sum
>> 15 );
530 /* Increment the pointer pIn1 index, count by 1 */
533 /* Update the inputA and inputB pointers for next MAC calculation */
537 /* Decrement the loop counter */
543 /* If the srcBLen is less than 4,
544 * the blockSize2 loop cannot be unrolled by 4 */
545 blkCnt
= ( uint32_t ) blockSize2
;
549 /* Accumulator is made zero for every iteration */
552 /* srcBLen number of MACS should be performed */
557 /* Perform the multiply-accumulate */
558 sum
+= (( q31_t
) * px
++ * * py
--);
560 /* Decrement the loop counter */
564 /* Store the result in the accumulator in the destination buffer. */
565 * pOut
++ = ( q15_t
) ( sum
>> 15 );
567 /* Increment the MAC count */
570 /* Update the inputA and inputB pointers for next MAC calculation */
574 /* Decrement the loop counter */
580 /* --------------------------
581 * Initializations of stage3
582 * -------------------------*/
584 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
585 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
587 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
588 * sum += x[srcALen-1] * y[srcBLen-1]
591 /* In this stage the MAC operations are decreased by 1 for every iteration.
592 The count variable holds the number of MAC operations performed */
593 count
= srcBLen
- 1u ;
595 /* Working pointer of inputA */
596 pSrc1
= ( pIn1
+ srcALen
) - ( srcBLen
- 1u );
599 /* Working pointer of inputB */
600 pSrc2
= pIn2
+ ( srcBLen
- 1u );
604 /* -------------------
606 * ------------------*/
608 /* For loop unrolling by 4, this stage is divided into two. */
609 /* First part of this stage computes the MAC operations greater than 4 */
610 /* Second part of this stage computes the MAC operations less than or equal to 4 */
612 /* The first part of the stage starts here */
615 while (( j
> 0u ) && ( blockSize3
> 0 ))
617 /* Accumulator is made zero for every iteration */
620 /* Apply loop unrolling and compute 4 MACs simultaneously. */
623 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
624 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
627 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
628 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
629 sum
= __SMLADX (* __SIMD32 ( px
)++, * __SIMD32 ( py
)--, sum
);
630 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
631 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
632 sum
= __SMLADX (* __SIMD32 ( px
)++, * __SIMD32 ( py
)--, sum
);
634 /* Decrement the loop counter */
638 /* For the next MAC operations, the pointer py is used without SIMD
639 * So, py is incremented by 1 */
642 /* If the count is not a multiple of 4, compute any remaining MACs here.
643 ** No loop unrolling is used. */
648 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
649 sum
= __SMLAD (* px
++, * py
--, sum
);
651 /* Decrement the loop counter */
655 /* Store the result in the accumulator in the destination buffer. */
656 * pOut
++ = ( q15_t
) ( sum
>> 15 );
658 /* Update the inputA and inputB pointers for next MAC calculation */
662 /* Decrement the MAC count */
665 /* Decrement the loop counter */
671 /* The second part of the stage starts here */
672 /* SIMD is not used for the next MAC operations,
673 * so pointer py is updated to read only one sample at a time */
676 while ( blockSize3
> 0 )
678 /* Accumulator is made zero for every iteration */
681 /* Apply loop unrolling and compute 4 MACs simultaneously. */
686 /* Perform the multiply-accumulates */
687 /* sum += x[srcALen-1] * y[srcBLen-1] */
688 sum
= __SMLAD (* px
++, * py
--, sum
);
690 /* Decrement the loop counter */
694 /* Store the result in the accumulator in the destination buffer. */
695 * pOut
++ = ( q15_t
) ( sum
>> 15 );
697 /* Update the inputA and inputB pointers for next MAC calculation */
701 /* Decrement the MAC count */
704 /* Decrement the loop counter */
708 /* set status as ARM_MATH_SUCCESS */
709 status
= ARM_MATH_SUCCESS
;
712 /* Return to application */
717 q15_t
* pIn1
; /* inputA pointer */
718 q15_t
* pIn2
; /* inputB pointer */
719 q15_t
* pOut
= pDst
; /* output pointer */
720 q31_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulator */
721 q15_t
* px
; /* Intermediate inputA pointer */
722 q15_t
* py
; /* Intermediate inputB pointer */
723 q15_t
* pSrc1
, * pSrc2
; /* Intermediate pointers */
724 q31_t x0
, x1
, x2
, x3
, c0
;
725 uint32_t j
, k
, count
, check
, blkCnt
;
726 int32_t blockSize1
, blockSize2
, blockSize3
; /* loop counters */
727 arm_status status
; /* status of Partial convolution */
730 /* Check for range of output samples to be calculated */
731 if (( firstIndex
+ numPoints
) > (( srcALen
+ ( srcBLen
- 1u ))))
733 /* Set status as ARM_MATH_ARGUMENT_ERROR */
734 status
= ARM_MATH_ARGUMENT_ERROR
;
739 /* The algorithm implementation is based on the lengths of the inputs. */
740 /* srcB is always made to slide across srcA. */
741 /* So srcBLen is always considered to be shorter than or equal to srcALen */
742 if ( srcALen
>= srcBLen
)
744 /* Initialization of inputA pointer */
747 /* Initialization of inputB pointer */
752 /* Initialization of inputA pointer */
755 /* Initialization of inputB pointer */
758 /* srcBLen is always considered to be shorter than or equal to srcALen */
764 /* Conditions to check which loopCounter holds
765 * the first and last indices of the output samples to be calculated. */
766 check
= firstIndex
+ numPoints
;
767 blockSize3
= (( int32_t ) check
- ( int32_t ) srcALen
);
768 blockSize3
= ( blockSize3
> 0 ) ? blockSize3
: 0 ;
769 blockSize1
= ((( int32_t ) srcBLen
- 1 ) - ( int32_t ) firstIndex
);
770 blockSize1
= ( blockSize1
> 0 ) ? (( check
> ( srcBLen
- 1u )) ? blockSize1
:
771 ( int32_t ) numPoints
) : 0 ;
772 blockSize2
= ( int32_t ) check
- (( blockSize3
+ blockSize1
) +
773 ( int32_t ) firstIndex
);
774 blockSize2
= ( blockSize2
> 0 ) ? blockSize2
: 0 ;
776 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
777 /* The function is internally
778 * divided into three stages according to the number of multiplications that have to
779 * take place between inputA samples and inputB samples. In the first stage of the
780 * algorithm, the multiplications increase by one for every iteration.
781 * In the second stage of the algorithm, srcBLen number of multiplications are done.
782 * In the third stage of the algorithm, the multiplications decrease by one
783 * for every iteration. */
785 /* Set the output pointer to point to the firstIndex
786 * of the output sample to be calculated. */
787 pOut
= pDst
+ firstIndex
;
789 /* --------------------------
790 * Initializations of stage1
791 * -------------------------*/
794 * sum = x[0] * y[1] + x[1] * y[0]
796 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
799 /* In this stage the MAC operations are increased by 1 for every iteration.
800 The count variable holds the number of MAC operations performed.
801 Since the partial convolution starts from firstIndex
802 Number of Macs to be performed is firstIndex + 1 */
803 count
= 1u + firstIndex
;
805 /* Working pointer of inputA */
808 /* Working pointer of inputB */
809 pSrc2
= pIn2
+ firstIndex
;
812 /* ------------------------
814 * ----------------------*/
816 /* For loop unrolling by 4, this stage is divided into two. */
817 /* First part of this stage computes the MAC operations less than 4 */
818 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
820 /* The first part of the stage starts here */
821 while (( count
< 4u ) && ( blockSize1
> 0u ))
823 /* Accumulator is made zero for every iteration */
826 /* Loop over number of MAC operations between
827 * inputA samples and inputB samples */
832 /* Perform the multiply-accumulates */
833 sum
+= (( q31_t
) * px
++ * * py
--);
835 /* Decrement the loop counter */
839 /* Store the result in the accumulator in the destination buffer. */
840 * pOut
++ = ( q15_t
) ( sum
>> 15 );
842 /* Update the inputA and inputB pointers for next MAC calculation */
846 /* Increment the MAC count */
849 /* Decrement the loop counter */
853 /* The second part of the stage starts here */
854 /* The internal loop, over count, is unrolled by 4 */
855 /* To read the last two inputB samples using SIMD:
856 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
859 while ( blockSize1
> 0u )
861 /* Accumulator is made zero for every iteration */
864 /* Apply loop unrolling and compute 4 MACs simultaneously. */
867 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
868 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
873 /* Perform the multiply-accumulates */
874 sum
+= (( q31_t
) * px
++ * * py
--);
875 sum
+= (( q31_t
) * px
++ * * py
--);
876 sum
+= (( q31_t
) * px
++ * * py
--);
877 sum
+= (( q31_t
) * px
++ * * py
--);
879 /* Decrement the loop counter */
883 /* If the count is not a multiple of 4, compute any remaining MACs here.
884 ** No loop unrolling is used. */
889 /* Perform the multiply-accumulates */
890 sum
+= (( q31_t
) * px
++ * * py
--);
892 /* Decrement the loop counter */
896 /* Store the result in the accumulator in the destination buffer. */
897 * pOut
++ = ( q15_t
) ( sum
>> 15 );
899 /* Update the inputA and inputB pointers for next MAC calculation */
903 /* Increment the MAC count */
906 /* Decrement the loop counter */
910 /* --------------------------
911 * Initializations of stage2
912 * ------------------------*/
914 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
915 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
917 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
920 /* Working pointer of inputA */
923 /* Working pointer of inputB */
924 pSrc2
= pIn2
+ ( srcBLen
- 1u );
927 /* count is the index by which the pointer pIn1 is to be incremented */
931 /* --------------------
933 * -------------------*/
935 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
936 * So, to loop unroll over blockSize2,
937 * srcBLen should be greater than or equal to 4 */
940 /* Loop unroll over blockSize2, by 4 */
941 blkCnt
= (( uint32_t ) blockSize2
>> 2u );
947 /* Set all accumulators to zero */
953 /* read x[0], x[1] samples */
957 #ifndef ARM_MATH_BIG_ENDIAN
959 x0
= __PKHBT ( a
, b
, 16 );
961 x1
= __PKHBT ( b
, a
, 16 );
965 x0
= __PKHBT ( b
, a
, 16 );
967 x1
= __PKHBT ( a
, b
, 16 );
969 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
971 /* Apply loop unrolling and compute 4 MACs simultaneously. */
974 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
975 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
978 /* Read the last two inputB samples using SIMD:
979 * y[srcBLen - 1] and y[srcBLen - 2] */
984 #ifndef ARM_MATH_BIG_ENDIAN
986 c0
= __PKHBT ( a
, b
, 16 );
990 c0
= __PKHBT ( b
, a
, 16 );;
992 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
994 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
995 acc0
= __SMLADX ( x0
, c0
, acc0
);
997 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
998 acc1
= __SMLADX ( x1
, c0
, acc1
);
1003 #ifndef ARM_MATH_BIG_ENDIAN
1005 x2
= __PKHBT ( a
, b
, 16 );
1007 x3
= __PKHBT ( b
, a
, 16 );
1011 x2
= __PKHBT ( b
, a
, 16 );
1013 x3
= __PKHBT ( a
, b
, 16 );
1015 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1017 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
1018 acc2
= __SMLADX ( x2
, c0
, acc2
);
1020 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
1021 acc3
= __SMLADX ( x3
, c0
, acc3
);
1023 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
1028 #ifndef ARM_MATH_BIG_ENDIAN
1030 c0
= __PKHBT ( a
, b
, 16 );
1034 c0
= __PKHBT ( b
, a
, 16 );;
1036 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1038 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
1039 acc0
= __SMLADX ( x2
, c0
, acc0
);
1041 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
1042 acc1
= __SMLADX ( x3
, c0
, acc1
);
1044 /* Read x[4], x[5], x[6] */
1048 #ifndef ARM_MATH_BIG_ENDIAN
1050 x0
= __PKHBT ( a
, b
, 16 );
1052 x1
= __PKHBT ( b
, a
, 16 );
1056 x0
= __PKHBT ( b
, a
, 16 );
1058 x1
= __PKHBT ( a
, b
, 16 );
1060 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1064 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
1065 acc2
= __SMLADX ( x0
, c0
, acc2
);
1067 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
1068 acc3
= __SMLADX ( x1
, c0
, acc3
);
1072 /* For the next MAC operations, SIMD is not used
1073 * So, the 16-bit pointer of inputB, py, is updated */
1075 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1076 ** No loop unrolling is used. */
1081 /* Read y[srcBLen - 5] */
1084 #ifdef ARM_MATH_BIG_ENDIAN
1090 c0
= c0
& 0x0000FFFF ;
1092 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1099 #ifndef ARM_MATH_BIG_ENDIAN
1101 x3
= __PKHBT ( a
, b
, 16 );
1105 x3
= __PKHBT ( b
, a
, 16 );;
1107 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1110 /* Perform the multiply-accumulates */
1111 acc0
= __SMLAD ( x0
, c0
, acc0
);
1112 acc1
= __SMLAD ( x1
, c0
, acc1
);
1113 acc2
= __SMLADX ( x1
, c0
, acc2
);
1114 acc3
= __SMLADX ( x3
, c0
, acc3
);
1119 /* Read y[srcBLen - 5], y[srcBLen - 6] */
1123 #ifndef ARM_MATH_BIG_ENDIAN
1125 c0
= __PKHBT ( a
, b
, 16 );
1129 c0
= __PKHBT ( b
, a
, 16 );;
1131 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1133 /* Read x[7], x[8], x[9] */
1137 #ifndef ARM_MATH_BIG_ENDIAN
1139 x3
= __PKHBT ( a
, b
, 16 );
1141 x2
= __PKHBT ( b
, a
, 16 );
1145 x3
= __PKHBT ( b
, a
, 16 );
1147 x2
= __PKHBT ( a
, b
, 16 );
1149 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1152 /* Perform the multiply-accumulates */
1153 acc0
= __SMLADX ( x0
, c0
, acc0
);
1154 acc1
= __SMLADX ( x1
, c0
, acc1
);
1155 acc2
= __SMLADX ( x3
, c0
, acc2
);
1156 acc3
= __SMLADX ( x2
, c0
, acc3
);
1161 /* Read y[srcBLen - 5], y[srcBLen - 6] */
1165 #ifndef ARM_MATH_BIG_ENDIAN
1167 c0
= __PKHBT ( a
, b
, 16 );
1171 c0
= __PKHBT ( b
, a
, 16 );;
1173 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1175 /* Read x[7], x[8], x[9] */
1179 #ifndef ARM_MATH_BIG_ENDIAN
1181 x3
= __PKHBT ( a
, b
, 16 );
1183 x2
= __PKHBT ( b
, a
, 16 );
1187 x3
= __PKHBT ( b
, a
, 16 );
1189 x2
= __PKHBT ( a
, b
, 16 );
1191 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1193 /* Perform the multiply-accumulates */
1194 acc0
= __SMLADX ( x0
, c0
, acc0
);
1195 acc1
= __SMLADX ( x1
, c0
, acc1
);
1196 acc2
= __SMLADX ( x3
, c0
, acc2
);
1197 acc3
= __SMLADX ( x2
, c0
, acc3
);
1199 /* Read y[srcBLen - 7] */
1201 #ifdef ARM_MATH_BIG_ENDIAN
1206 c0
= c0
& 0x0000FFFF ;
1207 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1213 #ifndef ARM_MATH_BIG_ENDIAN
1215 x3
= __PKHBT ( a
, b
, 16 );
1219 x3
= __PKHBT ( b
, a
, 16 );;
1221 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1225 /* Perform the multiply-accumulates */
1226 acc0
= __SMLADX ( x1
, c0
, acc0
);
1227 acc1
= __SMLAD ( x2
, c0
, acc1
);
1228 acc2
= __SMLADX ( x2
, c0
, acc2
);
1229 acc3
= __SMLADX ( x3
, c0
, acc3
);
1232 /* Store the results in the accumulators in the destination buffer. */
1233 * pOut
++ = ( q15_t
)( acc0
>> 15 );
1234 * pOut
++ = ( q15_t
)( acc1
>> 15 );
1235 * pOut
++ = ( q15_t
)( acc2
>> 15 );
1236 * pOut
++ = ( q15_t
)( acc3
>> 15 );
1238 /* Increment the pointer pIn1 index, count by 4 */
1241 /* Update the inputA and inputB pointers for next MAC calculation */
1245 /* Decrement the loop counter */
1249 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1250 ** No loop unrolling is used. */
1251 blkCnt
= ( uint32_t ) blockSize2
% 0x4 u
;
1255 /* Accumulator is made zero for every iteration */
1258 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1261 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1262 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1265 /* Perform the multiply-accumulates */
1266 sum
+= (( q31_t
) * px
++ * * py
--);
1267 sum
+= (( q31_t
) * px
++ * * py
--);
1268 sum
+= (( q31_t
) * px
++ * * py
--);
1269 sum
+= (( q31_t
) * px
++ * * py
--);
1271 /* Decrement the loop counter */
1275 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1276 ** No loop unrolling is used. */
1281 /* Perform the multiply-accumulates */
1282 sum
+= (( q31_t
) * px
++ * * py
--);
1284 /* Decrement the loop counter */
1288 /* Store the result in the accumulator in the destination buffer. */
1289 * pOut
++ = ( q15_t
) ( sum
>> 15 );
1291 /* Increment the pointer pIn1 index, count by 1 */
1294 /* Update the inputA and inputB pointers for next MAC calculation */
1298 /* Decrement the loop counter */
1304 /* If the srcBLen is less than 4,
1305 * the blockSize2 loop cannot be unrolled by 4 */
1306 blkCnt
= ( uint32_t ) blockSize2
;
1310 /* Accumulator is made zero for every iteration */
1313 /* srcBLen number of MACS should be performed */
1318 /* Perform the multiply-accumulate */
1319 sum
+= (( q31_t
) * px
++ * * py
--);
1321 /* Decrement the loop counter */
1325 /* Store the result in the accumulator in the destination buffer. */
1326 * pOut
++ = ( q15_t
) ( sum
>> 15 );
1328 /* Increment the MAC count */
1331 /* Update the inputA and inputB pointers for next MAC calculation */
1335 /* Decrement the loop counter */
1341 /* --------------------------
1342 * Initializations of stage3
1343 * -------------------------*/
1345 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
1346 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
1348 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
1349 * sum += x[srcALen-1] * y[srcBLen-1]
1352 /* In this stage the MAC operations are decreased by 1 for every iteration.
1353 The count variable holds the number of MAC operations performed */
1354 count
= srcBLen
- 1u ;
1356 /* Working pointer of inputA */
1357 pSrc1
= ( pIn1
+ srcALen
) - ( srcBLen
- 1u );
1360 /* Working pointer of inputB */
1361 pSrc2
= pIn2
+ ( srcBLen
- 1u );
1365 /* -------------------
1367 * ------------------*/
1369 /* For loop unrolling by 4, this stage is divided into two. */
1370 /* First part of this stage computes the MAC operations greater than 4 */
1371 /* Second part of this stage computes the MAC operations less than or equal to 4 */
1373 /* The first part of the stage starts here */
1376 while (( j
> 0u ) && ( blockSize3
> 0 ))
1378 /* Accumulator is made zero for every iteration */
1381 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1384 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1385 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1390 /* Perform the multiply-accumulates */
1391 sum
+= (( q31_t
) * px
++ * * py
--);
1392 sum
+= (( q31_t
) * px
++ * * py
--);
1393 sum
+= (( q31_t
) * px
++ * * py
--);
1394 sum
+= (( q31_t
) * px
++ * * py
--);
1395 /* Decrement the loop counter */
1400 /* If the count is not a multiple of 4, compute any remaining MACs here.
1401 ** No loop unrolling is used. */
1406 /* Perform the multiply-accumulates */
1407 sum
+= (( q31_t
) * px
++ * * py
--);
1409 /* Decrement the loop counter */
1413 /* Store the result in the accumulator in the destination buffer. */
1414 * pOut
++ = ( q15_t
) ( sum
>> 15 );
1416 /* Update the inputA and inputB pointers for next MAC calculation */
1420 /* Decrement the MAC count */
1423 /* Decrement the loop counter */
1429 /* The second part of the stage starts here */
1430 /* SIMD is not used for the next MAC operations,
1431 * so pointer py is updated to read only one sample at a time */
1434 while ( blockSize3
> 0u )
1436 /* Accumulator is made zero for every iteration */
1439 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1444 /* Perform the multiply-accumulates */
1445 /* sum += x[srcALen-1] * y[srcBLen-1] */
1446 sum
+= (( q31_t
) * px
++ * * py
--);
1448 /* Decrement the loop counter */
1452 /* Store the result in the accumulator in the destination buffer. */
1453 * pOut
++ = ( q15_t
) ( sum
>> 15 );
1455 /* Update the inputA and inputB pointers for next MAC calculation */
1459 /* Decrement the MAC count */
1462 /* Decrement the loop counter */
1466 /* set status as ARM_MATH_SUCCESS */
1467 status
= ARM_MATH_SUCCESS
;
1470 /* Return to application */
1473 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1477 * @} end of PartialConv group