]>
git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_fast_q31.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_fast_q31.c
10 * Description: Fast Q31 Partial convolution.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
48 * @addtogroup PartialConv
53 * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written.
59 * @param[in] firstIndex is the first output sample to start with.
60 * @param[in] numPoints is the number of output points to be computed.
61 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
64 * See <code>arm_conv_partial_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
67 arm_status
arm_conv_partial_fast_q31 (
76 q31_t
* pIn1
; /* inputA pointer */
77 q31_t
* pIn2
; /* inputB pointer */
78 q31_t
* pOut
= pDst
; /* output pointer */
79 q31_t
* px
; /* Intermediate inputA pointer */
80 q31_t
* py
; /* Intermediate inputB pointer */
81 q31_t
* pSrc1
, * pSrc2
; /* Intermediate pointers */
82 q31_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulators */
83 q31_t x0
, x1
, x2
, x3
, c0
;
84 uint32_t j
, k
, count
, check
, blkCnt
;
85 int32_t blockSize1
, blockSize2
, blockSize3
; /* loop counters */
86 arm_status status
; /* status of Partial convolution */
89 /* Check for range of output samples to be calculated */
90 if (( firstIndex
+ numPoints
) > (( srcALen
+ ( srcBLen
- 1u ))))
92 /* Set status as ARM_MATH_ARGUMENT_ERROR */
93 status
= ARM_MATH_ARGUMENT_ERROR
;
98 /* The algorithm implementation is based on the lengths of the inputs. */
99 /* srcB is always made to slide across srcA. */
100 /* So srcBLen is always considered as shorter or equal to srcALen */
101 if ( srcALen
>= srcBLen
)
103 /* Initialization of inputA pointer */
106 /* Initialization of inputB pointer */
111 /* Initialization of inputA pointer */
114 /* Initialization of inputB pointer */
117 /* srcBLen is always considered as shorter or equal to srcALen */
123 /* Conditions to check which loopCounter holds
124 * the first and last indices of the output samples to be calculated. */
125 check
= firstIndex
+ numPoints
;
126 blockSize3
= (( int32_t ) check
- ( int32_t ) srcALen
);
127 blockSize3
= ( blockSize3
> 0 ) ? blockSize3
: 0 ;
128 blockSize1
= ((( int32_t ) srcBLen
- 1 ) - ( int32_t ) firstIndex
);
129 blockSize1
= ( blockSize1
> 0 ) ? (( check
> ( srcBLen
- 1u )) ? blockSize1
:
130 ( int32_t ) numPoints
) : 0 ;
131 blockSize2
= ( int32_t ) check
- (( blockSize3
+ blockSize1
) +
132 ( int32_t ) firstIndex
);
133 blockSize2
= ( blockSize2
> 0 ) ? blockSize2
: 0 ;
135 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
136 /* The function is internally
137 * divided into three stages according to the number of multiplications that have to
138 * take place between inputA samples and inputB samples. In the first stage of the
139 * algorithm, the multiplications increase by one for every iteration.
140 * In the second stage of the algorithm, srcBLen number of multiplications are done.
141 * In the third stage of the algorithm, the multiplications decrease by one
142 * for every iteration. */
144 /* Set the output pointer to point to the firstIndex
145 * of the output sample to be calculated. */
146 pOut
= pDst
+ firstIndex
;
148 /* --------------------------
149 * Initializations of stage1
150 * -------------------------*/
153 * sum = x[0] * y[1] + x[1] * y[0]
155 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
158 /* In this stage the MAC operations are increased by 1 for every iteration.
159 The count variable holds the number of MAC operations performed.
160 Since the partial convolution starts from firstIndex,
161 the number of MACs to be performed is firstIndex + 1 */
162 count
= 1u + firstIndex
;
164 /* Working pointer of inputA */
167 /* Working pointer of inputB */
168 pSrc2
= pIn2
+ firstIndex
;
171 /* ------------------------
173 * ----------------------*/
175 /* The first loop starts here */
176 while ( blockSize1
> 0 )
178 /* Accumulator is made zero for every iteration */
181 /* Apply loop unrolling and compute 4 MACs simultaneously. */
184 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
185 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
188 /* x[0] * y[srcBLen - 1] */
189 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
190 (( q63_t
) * px
++ * (* py
--))) >> 32 );
192 /* x[1] * y[srcBLen - 2] */
193 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
194 (( q63_t
) * px
++ * (* py
--))) >> 32 );
196 /* x[2] * y[srcBLen - 3] */
197 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
198 (( q63_t
) * px
++ * (* py
--))) >> 32 );
200 /* x[3] * y[srcBLen - 4] */
201 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
202 (( q63_t
) * px
++ * (* py
--))) >> 32 );
204 /* Decrement the loop counter */
208 /* If the count is not a multiple of 4, compute any remaining MACs here.
209 ** No loop unrolling is used. */
214 /* Perform the multiply-accumulates */
215 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
216 (( q63_t
) * px
++ * (* py
--))) >> 32 );
218 /* Decrement the loop counter */
222 /* Store the result in the accumulator in the destination buffer. */
225 /* Update the inputA and inputB pointers for next MAC calculation */
229 /* Increment the MAC count */
232 /* Decrement the loop counter */
236 /* --------------------------
237 * Initializations of stage2
238 * ------------------------*/
240 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
241 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
243 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
246 /* Working pointer of inputA */
249 /* Working pointer of inputB */
250 pSrc2
= pIn2
+ ( srcBLen
- 1u );
253 /* count is the index by which the pointer pIn1 is to be incremented */
256 /* -------------------
258 * ------------------*/
260 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
261 * So, to loop unroll over blockSize2,
262 * srcBLen should be greater than or equal to 4 */
265 /* Loop unroll over blockSize2 */
266 blkCnt
= (( uint32_t ) blockSize2
>> 2u );
270 /* Set all accumulators to zero */
276 /* read x[0], x[1], x[2] samples */
281 /* Apply loop unrolling and compute 4 MACs simultaneously. */
284 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
285 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
288 /* Read y[srcBLen - 1] sample */
291 /* Read x[3] sample */
294 /* Perform the multiply-accumulate */
295 /* acc0 += x[0] * y[srcBLen - 1] */
296 acc0
= ( q31_t
) (((( q63_t
) acc0
<< 32 ) + (( q63_t
) x0
* c0
)) >> 32 );
298 /* acc1 += x[1] * y[srcBLen - 1] */
299 acc1
= ( q31_t
) (((( q63_t
) acc1
<< 32 ) + (( q63_t
) x1
* c0
)) >> 32 );
301 /* acc2 += x[2] * y[srcBLen - 1] */
302 acc2
= ( q31_t
) (((( q63_t
) acc2
<< 32 ) + (( q63_t
) x2
* c0
)) >> 32 );
304 /* acc3 += x[3] * y[srcBLen - 1] */
305 acc3
= ( q31_t
) (((( q63_t
) acc3
<< 32 ) + (( q63_t
) x3
* c0
)) >> 32 );
307 /* Read y[srcBLen - 2] sample */
310 /* Read x[4] sample */
313 /* Perform the multiply-accumulate */
314 /* acc0 += x[1] * y[srcBLen - 2] */
315 acc0
= ( q31_t
) (((( q63_t
) acc0
<< 32 ) + (( q63_t
) x1
* c0
)) >> 32 );
316 /* acc1 += x[2] * y[srcBLen - 2] */
317 acc1
= ( q31_t
) (((( q63_t
) acc1
<< 32 ) + (( q63_t
) x2
* c0
)) >> 32 );
318 /* acc2 += x[3] * y[srcBLen - 2] */
319 acc2
= ( q31_t
) (((( q63_t
) acc2
<< 32 ) + (( q63_t
) x3
* c0
)) >> 32 );
320 /* acc3 += x[4] * y[srcBLen - 2] */
321 acc3
= ( q31_t
) (((( q63_t
) acc3
<< 32 ) + (( q63_t
) x0
* c0
)) >> 32 );
323 /* Read y[srcBLen - 3] sample */
326 /* Read x[5] sample */
329 /* Perform the multiply-accumulates */
330 /* acc0 += x[2] * y[srcBLen - 3] */
331 acc0
= ( q31_t
) (((( q63_t
) acc0
<< 32 ) + (( q63_t
) x2
* c0
)) >> 32 );
332 /* acc1 += x[3] * y[srcBLen - 3] */
333 acc1
= ( q31_t
) (((( q63_t
) acc1
<< 32 ) + (( q63_t
) x3
* c0
)) >> 32 );
334 /* acc2 += x[4] * y[srcBLen - 3] */
335 acc2
= ( q31_t
) (((( q63_t
) acc2
<< 32 ) + (( q63_t
) x0
* c0
)) >> 32 );
336 /* acc3 += x[5] * y[srcBLen - 3] */
337 acc3
= ( q31_t
) (((( q63_t
) acc3
<< 32 ) + (( q63_t
) x1
* c0
)) >> 32 );
339 /* Read y[srcBLen - 4] sample */
342 /* Read x[6] sample */
345 /* Perform the multiply-accumulates */
346 /* acc0 += x[3] * y[srcBLen - 4] */
347 acc0
= ( q31_t
) (((( q63_t
) acc0
<< 32 ) + (( q63_t
) x3
* c0
)) >> 32 );
348 /* acc1 += x[4] * y[srcBLen - 4] */
349 acc1
= ( q31_t
) (((( q63_t
) acc1
<< 32 ) + (( q63_t
) x0
* c0
)) >> 32 );
350 /* acc2 += x[5] * y[srcBLen - 4] */
351 acc2
= ( q31_t
) (((( q63_t
) acc2
<< 32 ) + (( q63_t
) x1
* c0
)) >> 32 );
352 /* acc3 += x[6] * y[srcBLen - 4] */
353 acc3
= ( q31_t
) (((( q63_t
) acc3
<< 32 ) + (( q63_t
) x2
* c0
)) >> 32 );
358 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
359 ** No loop unrolling is used. */
364 /* Read y[srcBLen - 5] sample */
367 /* Read x[7] sample */
370 /* Perform the multiply-accumulates */
371 /* acc0 += x[4] * y[srcBLen - 5] */
372 acc0
= ( q31_t
) (((( q63_t
) acc0
<< 32 ) + (( q63_t
) x0
* c0
)) >> 32 );
373 /* acc1 += x[5] * y[srcBLen - 5] */
374 acc1
= ( q31_t
) (((( q63_t
) acc1
<< 32 ) + (( q63_t
) x1
* c0
)) >> 32 );
375 /* acc2 += x[6] * y[srcBLen - 5] */
376 acc2
= ( q31_t
) (((( q63_t
) acc2
<< 32 ) + (( q63_t
) x2
* c0
)) >> 32 );
377 /* acc3 += x[7] * y[srcBLen - 5] */
378 acc3
= ( q31_t
) (((( q63_t
) acc3
<< 32 ) + (( q63_t
) x3
* c0
)) >> 32 );
380 /* Reuse the present samples for the next MAC */
385 /* Decrement the loop counter */
389 /* Store the result in the accumulator in the destination buffer. */
390 * pOut
++ = ( q31_t
) ( acc0
<< 1 );
391 * pOut
++ = ( q31_t
) ( acc1
<< 1 );
392 * pOut
++ = ( q31_t
) ( acc2
<< 1 );
393 * pOut
++ = ( q31_t
) ( acc3
<< 1 );
395 /* Increment the pointer pIn1 index, count by 4 */
398 /* Update the inputA and inputB pointers for next MAC calculation */
402 /* Decrement the loop counter */
406 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
407 ** No loop unrolling is used. */
408 blkCnt
= ( uint32_t ) blockSize2
% 0x4 u
;
412 /* Accumulator is made zero for every iteration */
415 /* Apply loop unrolling and compute 4 MACs simultaneously. */
418 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
419 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
422 /* Perform the multiply-accumulates */
423 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
424 (( q63_t
) * px
++ * (* py
--))) >> 32 );
425 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
426 (( q63_t
) * px
++ * (* py
--))) >> 32 );
427 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
428 (( q63_t
) * px
++ * (* py
--))) >> 32 );
429 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
430 (( q63_t
) * px
++ * (* py
--))) >> 32 );
432 /* Decrement the loop counter */
436 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
437 ** No loop unrolling is used. */
442 /* Perform the multiply-accumulate */
443 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
444 (( q63_t
) * px
++ * (* py
--))) >> 32 );
446 /* Decrement the loop counter */
450 /* Store the result in the accumulator in the destination buffer. */
453 /* Increment the MAC count */
456 /* Update the inputA and inputB pointers for next MAC calculation */
460 /* Decrement the loop counter */
466 /* If srcBLen is less than 4,
467 * the blockSize2 loop cannot be unrolled by 4 */
468 blkCnt
= ( uint32_t ) blockSize2
;
472 /* Accumulator is made zero for every iteration */
475 /* srcBLen number of MACS should be performed */
480 /* Perform the multiply-accumulate */
481 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
482 (( q63_t
) * px
++ * (* py
--))) >> 32 );
484 /* Decrement the loop counter */
488 /* Store the result in the accumulator in the destination buffer. */
491 /* Increment the MAC count */
494 /* Update the inputA and inputB pointers for next MAC calculation */
498 /* Decrement the loop counter */
504 /* --------------------------
505 * Initializations of stage3
506 * -------------------------*/
508 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
509 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
511 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
512 * sum += x[srcALen-1] * y[srcBLen-1]
515 /* In this stage the MAC operations are decreased by 1 for every iteration.
516 The count variable holds the number of MAC operations performed */
517 count
= srcBLen
- 1u ;
519 /* Working pointer of inputA */
520 pSrc1
= ( pIn1
+ srcALen
) - ( srcBLen
- 1u );
523 /* Working pointer of inputB */
524 pSrc2
= pIn2
+ ( srcBLen
- 1u );
527 /* -------------------
529 * ------------------*/
531 while ( blockSize3
> 0 )
533 /* Accumulator is made zero for every iteration */
536 /* Apply loop unrolling and compute 4 MACs simultaneously. */
539 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
540 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
543 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
544 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
545 (( q63_t
) * px
++ * (* py
--))) >> 32 );
547 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
548 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
549 (( q63_t
) * px
++ * (* py
--))) >> 32 );
551 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
552 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
553 (( q63_t
) * px
++ * (* py
--))) >> 32 );
555 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
556 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
557 (( q63_t
) * px
++ * (* py
--))) >> 32 );
559 /* Decrement the loop counter */
563 /* If the count is not a multiple of 4, compute any remaining MACs here.
564 ** No loop unrolling is used. */
569 /* Perform the multiply-accumulates */
570 /* sum += x[srcALen-1] * y[srcBLen-1] */
571 sum
= ( q31_t
) (((( q63_t
) sum
<< 32 ) +
572 (( q63_t
) * px
++ * (* py
--))) >> 32 );
574 /* Decrement the loop counter */
578 /* Store the result in the accumulator in the destination buffer. */
581 /* Update the inputA and inputB pointers for next MAC calculation */
585 /* Decrement the MAC count */
588 /* Decrement the loop counter */
593 /* set status as ARM_MATH_SUCCESS */
594 status
= ARM_MATH_SUCCESS
;
597 /* Return to application */
603 * @} end of PartialConv group