]>
git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_f32.c
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_f32.c
10 * Description: Correlation of floating-point sequences.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
44 * @ingroup groupFilters
48 * @defgroup Corr Correlation
50 * Correlation is a mathematical operation that is similar to convolution.
51 * As with convolution, correlation uses two signals to produce a third signal.
52 * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
53 * Correlation is commonly used to measure the similarity between two signals.
54 * It has applications in pattern recognition, cryptanalysis, and searching.
55 * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
56 * Fast versions of the Q15 and Q31 functions are also provided.
59 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
60 * The convolution of the two signals is denoted by
64 * In correlation, one of the signals is flipped in time
70 * and this is mathematically defined as
71 * \image html CorrelateEquation.gif
73 * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
74 * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
75 * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
79 * The <code>pDst</code> should be initialized to all zeros before being used.
81 * <b>Fixed-Point Behavior</b>
83 * Correlation requires summing up a large number of intermediate products.
84 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
85 * Refer to the function specific documentation below for further details of the particular algorithm used.
88 * <b>Fast Versions</b>
91 * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
92 * the input signals should be scaled down to avoid intermediate overflows.
98 * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
99 * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of correlate
107 * @brief Correlation of floating-point sequences.
108 * @param[in] *pSrcA points to the first input sequence.
109 * @param[in] srcALen length of the first input sequence.
110 * @param[in] *pSrcB points to the second input sequence.
111 * @param[in] srcBLen length of the second input sequence.
112 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
116 void arm_correlate_f32 (
125 #ifndef ARM_MATH_CM0_FAMILY
127 /* Run the below code for Cortex-M4 and Cortex-M3 */
129 float32_t
* pIn1
; /* inputA pointer */
130 float32_t
* pIn2
; /* inputB pointer */
131 float32_t
* pOut
= pDst
; /* output pointer */
132 float32_t
* px
; /* Intermediate inputA pointer */
133 float32_t
* py
; /* Intermediate inputB pointer */
134 float32_t
* pSrc1
; /* Intermediate pointers */
135 float32_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulators */
136 float32_t x0
, x1
, x2
, x3
, c0
; /* temporary variables for holding input and coefficient values */
137 uint32_t j
, k
= 0u , count
, blkCnt
, outBlockSize
, blockSize1
, blockSize2
, blockSize3
; /* loop counters */
138 int32_t inc
= 1 ; /* Destination address modifier */
141 /* The algorithm implementation is based on the lengths of the inputs. */
142 /* srcB is always made to slide across srcA. */
143 /* So srcBLen is always considered as shorter or equal to srcALen */
144 /* But CORR(x, y) is reverse of CORR(y, x) */
145 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
146 /* and the destination pointer modifier, inc is set to -1 */
147 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
148 /* But to improve the performance,
149 * we include zeroes in the output instead of zero padding either of the the inputs*/
150 /* If srcALen > srcBLen,
151 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
152 /* If srcALen < srcBLen,
153 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
154 if ( srcALen
>= srcBLen
)
156 /* Initialization of inputA pointer */
159 /* Initialization of inputB pointer */
162 /* Number of output samples is calculated */
163 outBlockSize
= ( 2u * srcALen
) - 1u ;
165 /* When srcALen > srcBLen, zero padding has to be done to srcB
166 * to make their lengths equal.
167 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
168 * number of output samples are made zero */
169 j
= outBlockSize
- ( srcALen
+ ( srcBLen
- 1u ));
171 /* Updating the pointer position to non zero value */
176 // /* Zero is stored in the destination buffer */
179 // /* Decrement the loop counter */
186 /* Initialization of inputA pointer */
189 /* Initialization of inputB pointer */
192 /* srcBLen is always considered as shorter or equal to srcALen */
197 /* CORR(x, y) = Reverse order(CORR(y, x)) */
198 /* Hence set the destination pointer to point to the last output sample */
199 pOut
= pDst
+ (( srcALen
+ srcBLen
) - 2u );
201 /* Destination address modifier is set to -1 */
206 /* The function is internally
207 * divided into three parts according to the number of multiplications that has to be
208 * taken place between inputA samples and inputB samples. In the first part of the
209 * algorithm, the multiplications increase by one for every iteration.
210 * In the second part of the algorithm, srcBLen number of multiplications are done.
211 * In the third part of the algorithm, the multiplications decrease by one
212 * for every iteration.*/
213 /* The algorithm is implemented in three stages.
214 * The loop counters of each stage is initiated here. */
215 blockSize1
= srcBLen
- 1u ;
216 blockSize2
= srcALen
- ( srcBLen
- 1u );
217 blockSize3
= blockSize1
;
219 /* --------------------------
220 * Initializations of stage1
221 * -------------------------*/
223 /* sum = x[0] * y[srcBlen - 1]
224 * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
226 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
229 /* In this stage the MAC operations are increased by 1 for every iteration.
230 The count variable holds the number of MAC operations performed */
233 /* Working pointer of inputA */
236 /* Working pointer of inputB */
237 pSrc1
= pIn2
+ ( srcBLen
- 1u );
240 /* ------------------------
242 * ----------------------*/
244 /* The first stage starts here */
245 while ( blockSize1
> 0u )
247 /* Accumulator is made zero for every iteration */
250 /* Apply loop unrolling and compute 4 MACs simultaneously. */
253 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
254 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
257 /* x[0] * y[srcBLen - 4] */
258 sum
+= * px
++ * * py
++;
259 /* x[1] * y[srcBLen - 3] */
260 sum
+= * px
++ * * py
++;
261 /* x[2] * y[srcBLen - 2] */
262 sum
+= * px
++ * * py
++;
263 /* x[3] * y[srcBLen - 1] */
264 sum
+= * px
++ * * py
++;
266 /* Decrement the loop counter */
270 /* If the count is not a multiple of 4, compute any remaining MACs here.
271 ** No loop unrolling is used. */
276 /* Perform the multiply-accumulate */
277 /* x[0] * y[srcBLen - 1] */
278 sum
+= * px
++ * * py
++;
280 /* Decrement the loop counter */
284 /* Store the result in the accumulator in the destination buffer. */
286 /* Destination pointer is updated according to the address modifier, inc */
289 /* Update the inputA and inputB pointers for next MAC calculation */
293 /* Increment the MAC count */
296 /* Decrement the loop counter */
300 /* --------------------------
301 * Initializations of stage2
302 * ------------------------*/
304 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
305 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
307 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
310 /* Working pointer of inputA */
313 /* Working pointer of inputB */
316 /* count is index by which the pointer pIn1 to be incremented */
319 /* -------------------
321 * ------------------*/
323 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
324 * So, to loop unroll over blockSize2,
325 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
328 /* Loop unroll over blockSize2, by 4 */
329 blkCnt
= blockSize2
>> 2u ;
333 /* Set all accumulators to zero */
339 /* read x[0], x[1], x[2] samples */
344 /* Apply loop unrolling and compute 4 MACs simultaneously. */
347 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
348 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
351 /* Read y[0] sample */
354 /* Read x[3] sample */
357 /* Perform the multiply-accumulate */
358 /* acc0 += x[0] * y[0] */
360 /* acc1 += x[1] * y[0] */
362 /* acc2 += x[2] * y[0] */
364 /* acc3 += x[3] * y[0] */
367 /* Read y[1] sample */
370 /* Read x[4] sample */
373 /* Perform the multiply-accumulate */
374 /* acc0 += x[1] * y[1] */
376 /* acc1 += x[2] * y[1] */
378 /* acc2 += x[3] * y[1] */
380 /* acc3 += x[4] * y[1] */
383 /* Read y[2] sample */
386 /* Read x[5] sample */
389 /* Perform the multiply-accumulates */
390 /* acc0 += x[2] * y[2] */
392 /* acc1 += x[3] * y[2] */
394 /* acc2 += x[4] * y[2] */
396 /* acc3 += x[5] * y[2] */
399 /* Read y[3] sample */
402 /* Read x[6] sample */
405 /* Perform the multiply-accumulates */
406 /* acc0 += x[3] * y[3] */
408 /* acc1 += x[4] * y[3] */
410 /* acc2 += x[5] * y[3] */
412 /* acc3 += x[6] * y[3] */
418 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
419 ** No loop unrolling is used. */
424 /* Read y[4] sample */
427 /* Read x[7] sample */
430 /* Perform the multiply-accumulates */
431 /* acc0 += x[4] * y[4] */
433 /* acc1 += x[5] * y[4] */
435 /* acc2 += x[6] * y[4] */
437 /* acc3 += x[7] * y[4] */
440 /* Reuse the present samples for the next MAC */
445 /* Decrement the loop counter */
449 /* Store the result in the accumulator in the destination buffer. */
451 /* Destination pointer is updated according to the address modifier, inc */
463 /* Increment the pointer pIn1 index, count by 4 */
466 /* Update the inputA and inputB pointers for next MAC calculation */
470 /* Decrement the loop counter */
474 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
475 ** No loop unrolling is used. */
476 blkCnt
= blockSize2
% 0x4 u
;
480 /* Accumulator is made zero for every iteration */
483 /* Apply loop unrolling and compute 4 MACs simultaneously. */
486 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
487 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
490 /* Perform the multiply-accumulates */
491 sum
+= * px
++ * * py
++;
492 sum
+= * px
++ * * py
++;
493 sum
+= * px
++ * * py
++;
494 sum
+= * px
++ * * py
++;
496 /* Decrement the loop counter */
500 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
501 ** No loop unrolling is used. */
506 /* Perform the multiply-accumulate */
507 sum
+= * px
++ * * py
++;
509 /* Decrement the loop counter */
513 /* Store the result in the accumulator in the destination buffer. */
515 /* Destination pointer is updated according to the address modifier, inc */
518 /* Increment the pointer pIn1 index, count by 1 */
521 /* Update the inputA and inputB pointers for next MAC calculation */
525 /* Decrement the loop counter */
531 /* If the srcBLen is not a multiple of 4,
532 * the blockSize2 loop cannot be unrolled by 4 */
537 /* Accumulator is made zero for every iteration */
540 /* Loop over srcBLen */
545 /* Perform the multiply-accumulate */
546 sum
+= * px
++ * * py
++;
548 /* Decrement the loop counter */
552 /* Store the result in the accumulator in the destination buffer. */
554 /* Destination pointer is updated according to the address modifier, inc */
557 /* Increment the pointer pIn1 index, count by 1 */
560 /* Update the inputA and inputB pointers for next MAC calculation */
564 /* Decrement the loop counter */
569 /* --------------------------
570 * Initializations of stage3
571 * -------------------------*/
573 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
574 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
576 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
577 * sum += x[srcALen-1] * y[0]
580 /* In this stage the MAC operations are decreased by 1 for every iteration.
581 The count variable holds the number of MAC operations performed */
582 count
= srcBLen
- 1u ;
584 /* Working pointer of inputA */
585 pSrc1
= pIn1
+ ( srcALen
- ( srcBLen
- 1u ));
588 /* Working pointer of inputB */
591 /* -------------------
593 * ------------------*/
595 while ( blockSize3
> 0u )
597 /* Accumulator is made zero for every iteration */
600 /* Apply loop unrolling and compute 4 MACs simultaneously. */
603 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
604 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
607 /* Perform the multiply-accumulates */
608 /* sum += x[srcALen - srcBLen + 4] * y[3] */
609 sum
+= * px
++ * * py
++;
610 /* sum += x[srcALen - srcBLen + 3] * y[2] */
611 sum
+= * px
++ * * py
++;
612 /* sum += x[srcALen - srcBLen + 2] * y[1] */
613 sum
+= * px
++ * * py
++;
614 /* sum += x[srcALen - srcBLen + 1] * y[0] */
615 sum
+= * px
++ * * py
++;
617 /* Decrement the loop counter */
621 /* If the count is not a multiple of 4, compute any remaining MACs here.
622 ** No loop unrolling is used. */
627 /* Perform the multiply-accumulates */
628 sum
+= * px
++ * * py
++;
630 /* Decrement the loop counter */
634 /* Store the result in the accumulator in the destination buffer. */
636 /* Destination pointer is updated according to the address modifier, inc */
639 /* Update the inputA and inputB pointers for next MAC calculation */
643 /* Decrement the MAC count */
646 /* Decrement the loop counter */
652 /* Run the below code for Cortex-M0 */
654 float32_t
* pIn1
= pSrcA
; /* inputA pointer */
655 float32_t
* pIn2
= pSrcB
+ ( srcBLen
- 1u ); /* inputB pointer */
656 float32_t sum
; /* Accumulator */
657 uint32_t i
= 0u , j
; /* loop counters */
658 uint32_t inv
= 0u ; /* Reverse order flag */
659 uint32_t tot
= 0u ; /* Length */
661 /* The algorithm implementation is based on the lengths of the inputs. */
662 /* srcB is always made to slide across srcA. */
663 /* So srcBLen is always considered as shorter or equal to srcALen */
664 /* But CORR(x, y) is reverse of CORR(y, x) */
665 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
666 /* and a varaible, inv is set to 1 */
667 /* If lengths are not equal then zero pad has to be done to make the two
668 * inputs of same length. But to improve the performance, we include zeroes
669 * in the output instead of zero padding either of the the inputs*/
670 /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
671 * starting of the output buffer */
672 /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
673 * ending of the output buffer */
674 /* Once the zero padding is done the remaining of the output is calcualted
675 * using convolution but with the shorter signal time shifted. */
677 /* Calculate the length of the remaining sequence */
678 tot
= (( srcALen
+ srcBLen
) - 2u );
680 if ( srcALen
> srcBLen
)
682 /* Calculating the number of zeros to be padded to the output */
683 j
= srcALen
- srcBLen
;
685 /* Initialise the pointer after zero padding */
689 else if ( srcALen
< srcBLen
)
691 /* Initialization to inputB pointer */
694 /* Initialization to the end of inputA pointer */
695 pIn2
= pSrcA
+ ( srcALen
- 1u );
697 /* Initialisation of the pointer after zero padding */
700 /* Swapping the lengths */
705 /* Setting the reverse flag */
710 /* Loop to calculate convolution for output length number of times */
711 for ( i
= 0u ; i
<= tot
; i
++)
713 /* Initialize sum with zero to carry on MAC operations */
716 /* Loop to perform MAC operations according to convolution equation */
717 for ( j
= 0u ; j
<= i
; j
++)
719 /* Check the array limitations */
720 if (((( i
- j
) < srcBLen
) && ( j
< srcALen
)))
722 /* z[i] += x[i-j] * y[j] */
723 sum
+= pIn1
[ j
] * pIn2
[-(( int32_t ) i
- j
)];
726 /* Store the output in the destination buffer */
733 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
738 * @} end of Corr group