]>
git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_f32.c
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_f32.c
10 * Description: Convolution of floating-point sequences.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
44 * @ingroup groupFilters
48 * @defgroup Conv Convolution
50 * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
51 * Convolution is similar to correlation and is frequently used in filtering and data analysis.
52 * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
53 * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
56 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
57 * Then the convolution
65 * \image html ConvolutionEquation.gif
67 * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
68 * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
69 * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
70 * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
73 * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
74 * the signal <code>b[n]</code> slides over <code>a[n]</code>.
75 * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
78 * Note that convolution is a commutative operation:
81 * a[n] * b[n] = b[n] * a[n].
85 * This means that switching the A and B arguments to the convolution functions has no effect.
87 * <b>Fixed-Point Behavior</b>
90 * Convolution requires summing up a large number of intermediate products.
91 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
92 * Refer to the function specific documentation below for further details of the particular algorithm used.
95 * <b>Fast Versions</b>
98 * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
99 * the input signals should be scaled down to avoid intermediate overflows.
102 * <b>Opt Versions</b>
105 * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
106 * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions
115 * @brief Convolution of floating-point sequences.
116 * @param[in] *pSrcA points to the first input sequence.
117 * @param[in] srcALen length of the first input sequence.
118 * @param[in] *pSrcB points to the second input sequence.
119 * @param[in] srcBLen length of the second input sequence.
120 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
133 #ifndef ARM_MATH_CM0_FAMILY
135 /* Run the below code for Cortex-M4 and Cortex-M3 */
137 float32_t
* pIn1
; /* inputA pointer */
138 float32_t
* pIn2
; /* inputB pointer */
139 float32_t
* pOut
= pDst
; /* output pointer */
140 float32_t
* px
; /* Intermediate inputA pointer */
141 float32_t
* py
; /* Intermediate inputB pointer */
142 float32_t
* pSrc1
, * pSrc2
; /* Intermediate pointers */
143 float32_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulator */
144 float32_t x0
, x1
, x2
, x3
, c0
; /* Temporary variables to hold state and coefficient values */
145 uint32_t j
, k
, count
, blkCnt
, blockSize1
, blockSize2
, blockSize3
; /* loop counters */
147 /* The algorithm implementation is based on the lengths of the inputs. */
148 /* srcB is always made to slide across srcA. */
149 /* So srcBLen is always considered as shorter or equal to srcALen */
150 if ( srcALen
>= srcBLen
)
152 /* Initialization of inputA pointer */
155 /* Initialization of inputB pointer */
160 /* Initialization of inputA pointer */
163 /* Initialization of inputB pointer */
166 /* srcBLen is always considered as shorter or equal to srcALen */
172 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
173 /* The function is internally
174 * divided into three stages according to the number of multiplications that has to be
175 * taken place between inputA samples and inputB samples. In the first stage of the
176 * algorithm, the multiplications increase by one for every iteration.
177 * In the second stage of the algorithm, srcBLen number of multiplications are done.
178 * In the third stage of the algorithm, the multiplications decrease by one
179 * for every iteration. */
181 /* The algorithm is implemented in three stages.
182 The loop counters of each stage is initiated here. */
183 blockSize1
= srcBLen
- 1u ;
184 blockSize2
= srcALen
- ( srcBLen
- 1u );
185 blockSize3
= blockSize1
;
187 /* --------------------------
188 * initializations of stage1
189 * -------------------------*/
192 * sum = x[0] * y[1] + x[1] * y[0]
194 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
197 /* In this stage the MAC operations are increased by 1 for every iteration.
198 The count variable holds the number of MAC operations performed */
201 /* Working pointer of inputA */
204 /* Working pointer of inputB */
208 /* ------------------------
210 * ----------------------*/
212 /* The first stage starts here */
213 while ( blockSize1
> 0u )
215 /* Accumulator is made zero for every iteration */
218 /* Apply loop unrolling and compute 4 MACs simultaneously. */
221 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
222 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
225 /* x[0] * y[srcBLen - 1] */
226 sum
+= * px
++ * * py
--;
228 /* x[1] * y[srcBLen - 2] */
229 sum
+= * px
++ * * py
--;
231 /* x[2] * y[srcBLen - 3] */
232 sum
+= * px
++ * * py
--;
234 /* x[3] * y[srcBLen - 4] */
235 sum
+= * px
++ * * py
--;
237 /* Decrement the loop counter */
241 /* If the count is not a multiple of 4, compute any remaining MACs here.
242 ** No loop unrolling is used. */
247 /* Perform the multiply-accumulate */
248 sum
+= * px
++ * * py
--;
250 /* Decrement the loop counter */
254 /* Store the result in the accumulator in the destination buffer. */
257 /* Update the inputA and inputB pointers for next MAC calculation */
261 /* Increment the MAC count */
264 /* Decrement the loop counter */
268 /* --------------------------
269 * Initializations of stage2
270 * ------------------------*/
272 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
273 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
275 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
278 /* Working pointer of inputA */
281 /* Working pointer of inputB */
282 pSrc2
= pIn2
+ ( srcBLen
- 1u );
285 /* count is index by which the pointer pIn1 to be incremented */
288 /* -------------------
290 * ------------------*/
292 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
293 * So, to loop unroll over blockSize2,
294 * srcBLen should be greater than or equal to 4 */
297 /* Loop unroll over blockSize2, by 4 */
298 blkCnt
= blockSize2
>> 2u ;
302 /* Set all accumulators to zero */
308 /* read x[0], x[1], x[2] samples */
313 /* Apply loop unrolling and compute 4 MACs simultaneously. */
316 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
317 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
320 /* Read y[srcBLen - 1] sample */
323 /* Read x[3] sample */
326 /* Perform the multiply-accumulate */
327 /* acc0 += x[0] * y[srcBLen - 1] */
330 /* acc1 += x[1] * y[srcBLen - 1] */
333 /* acc2 += x[2] * y[srcBLen - 1] */
336 /* acc3 += x[3] * y[srcBLen - 1] */
339 /* Read y[srcBLen - 2] sample */
342 /* Read x[4] sample */
345 /* Perform the multiply-accumulate */
346 /* acc0 += x[1] * y[srcBLen - 2] */
348 /* acc1 += x[2] * y[srcBLen - 2] */
350 /* acc2 += x[3] * y[srcBLen - 2] */
352 /* acc3 += x[4] * y[srcBLen - 2] */
355 /* Read y[srcBLen - 3] sample */
358 /* Read x[5] sample */
361 /* Perform the multiply-accumulates */
362 /* acc0 += x[2] * y[srcBLen - 3] */
364 /* acc1 += x[3] * y[srcBLen - 2] */
366 /* acc2 += x[4] * y[srcBLen - 2] */
368 /* acc3 += x[5] * y[srcBLen - 2] */
371 /* Read y[srcBLen - 4] sample */
374 /* Read x[6] sample */
378 /* Perform the multiply-accumulates */
379 /* acc0 += x[3] * y[srcBLen - 4] */
381 /* acc1 += x[4] * y[srcBLen - 4] */
383 /* acc2 += x[5] * y[srcBLen - 4] */
385 /* acc3 += x[6] * y[srcBLen - 4] */
391 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
392 ** No loop unrolling is used. */
397 /* Read y[srcBLen - 5] sample */
400 /* Read x[7] sample */
403 /* Perform the multiply-accumulates */
404 /* acc0 += x[4] * y[srcBLen - 5] */
406 /* acc1 += x[5] * y[srcBLen - 5] */
408 /* acc2 += x[6] * y[srcBLen - 5] */
410 /* acc3 += x[7] * y[srcBLen - 5] */
413 /* Reuse the present samples for the next MAC */
418 /* Decrement the loop counter */
422 /* Store the result in the accumulator in the destination buffer. */
428 /* Increment the pointer pIn1 index, count by 4 */
431 /* Update the inputA and inputB pointers for next MAC calculation */
436 /* Decrement the loop counter */
441 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
442 ** No loop unrolling is used. */
443 blkCnt
= blockSize2
% 0x4 u
;
447 /* Accumulator is made zero for every iteration */
450 /* Apply loop unrolling and compute 4 MACs simultaneously. */
453 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
454 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
457 /* Perform the multiply-accumulates */
458 sum
+= * px
++ * * py
--;
459 sum
+= * px
++ * * py
--;
460 sum
+= * px
++ * * py
--;
461 sum
+= * px
++ * * py
--;
463 /* Decrement the loop counter */
467 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
468 ** No loop unrolling is used. */
473 /* Perform the multiply-accumulate */
474 sum
+= * px
++ * * py
--;
476 /* Decrement the loop counter */
480 /* Store the result in the accumulator in the destination buffer. */
483 /* Increment the MAC count */
486 /* Update the inputA and inputB pointers for next MAC calculation */
490 /* Decrement the loop counter */
496 /* If the srcBLen is not a multiple of 4,
497 * the blockSize2 loop cannot be unrolled by 4 */
502 /* Accumulator is made zero for every iteration */
505 /* srcBLen number of MACS should be performed */
510 /* Perform the multiply-accumulate */
511 sum
+= * px
++ * * py
--;
513 /* Decrement the loop counter */
517 /* Store the result in the accumulator in the destination buffer. */
520 /* Increment the MAC count */
523 /* Update the inputA and inputB pointers for next MAC calculation */
527 /* Decrement the loop counter */
533 /* --------------------------
534 * Initializations of stage3
535 * -------------------------*/
537 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
538 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
540 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
541 * sum += x[srcALen-1] * y[srcBLen-1]
544 /* In this stage the MAC operations are decreased by 1 for every iteration.
545 The blockSize3 variable holds the number of MAC operations performed */
547 /* Working pointer of inputA */
548 pSrc1
= ( pIn1
+ srcALen
) - ( srcBLen
- 1u );
551 /* Working pointer of inputB */
552 pSrc2
= pIn2
+ ( srcBLen
- 1u );
555 /* -------------------
557 * ------------------*/
559 while ( blockSize3
> 0u )
561 /* Accumulator is made zero for every iteration */
564 /* Apply loop unrolling and compute 4 MACs simultaneously. */
565 k
= blockSize3
>> 2u ;
567 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
568 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
571 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
572 sum
+= * px
++ * * py
--;
574 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
575 sum
+= * px
++ * * py
--;
577 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
578 sum
+= * px
++ * * py
--;
580 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
581 sum
+= * px
++ * * py
--;
583 /* Decrement the loop counter */
587 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
588 ** No loop unrolling is used. */
589 k
= blockSize3
% 0x4 u
;
593 /* Perform the multiply-accumulates */
594 /* sum += x[srcALen-1] * y[srcBLen-1] */
595 sum
+= * px
++ * * py
--;
597 /* Decrement the loop counter */
601 /* Store the result in the accumulator in the destination buffer. */
604 /* Update the inputA and inputB pointers for next MAC calculation */
608 /* Decrement the loop counter */
614 /* Run the below code for Cortex-M0 */
616 float32_t
* pIn1
= pSrcA
; /* inputA pointer */
617 float32_t
* pIn2
= pSrcB
; /* inputB pointer */
618 float32_t sum
; /* Accumulator */
619 uint32_t i
, j
; /* loop counters */
621 /* Loop to calculate convolution for output length number of times */
622 for ( i
= 0u ; i
< (( srcALen
+ srcBLen
) - 1u ); i
++)
624 /* Initialize sum with zero to carry out MAC operations */
627 /* Loop to perform MAC operations according to convolution equation */
628 for ( j
= 0u ; j
<= i
; j
++)
630 /* Check the array limitations */
631 if (((( i
- j
) < srcBLen
) && ( j
< srcALen
)))
633 /* z[i] += x[i-j] * y[j] */
634 sum
+= pIn1
[ j
] * pIn2
[ i
- j
];
637 /* Store the output in the destination buffer */
641 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
646 * @} end of Conv group