]> git.gir.st - tmk_keyboard.git/blob - tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_f32.c
Squashed 'tmk_core/' changes from 7967731..b9e0ea0
[tmk_keyboard.git] / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_correlate_f32.c
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_f32.c
9 *
10 * Description: Correlation of floating-point sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
40
41 #include "arm_math.h"
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @defgroup Corr Correlation
49 *
50 * Correlation is a mathematical operation that is similar to convolution.
51 * As with convolution, correlation uses two signals to produce a third signal.
52 * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
53 * Correlation is commonly used to measure the similarity between two signals.
54 * It has applications in pattern recognition, cryptanalysis, and searching.
55 * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
56 * Fast versions of the Q15 and Q31 functions are also provided.
57 *
58 * \par Algorithm
59 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
60 * The convolution of the two signals is denoted by
61 * <pre>
62 * c[n] = a[n] * b[n]
63 * </pre>
64 * In correlation, one of the signals is flipped in time
65 * <pre>
66 * c[n] = a[n] * b[-n]
67 * </pre>
68 *
69 * \par
70 * and this is mathematically defined as
71 * \image html CorrelateEquation.gif
72 * \par
73 * The <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
74 * The result <code>c[n]</code> is of length <code>2 * max(srcALen, srcBLen) - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2)</code>.
75 * The output result is written to <code>pDst</code> and the calling function must allocate <code>2 * max(srcALen, srcBLen) - 1</code> words for the result.
76 *
77 * <b>Note</b>
78 * \par
79 * The <code>pDst</code> should be initialized to all zeros before being used.
80 *
81 * <b>Fixed-Point Behavior</b>
82 * \par
83 * Correlation requires summing up a large number of intermediate products.
84 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
85 * Refer to the function specific documentation below for further details of the particular algorithm used.
86 *
87 *
88 * <b>Fast Versions</b>
89 *
90 * \par
91 * Fast versions are provided for Q31 and Q15. They take fewer cycles than the standard Q31 and Q15 correlate functions, but they require
92 * the input signals to be scaled down to avoid intermediate overflows.
93 *
94 *
95 * <b>Opt Versions</b>
96 *
97 * \par
98 * Opt versions are provided for Q15 and Q7. They use an internal scratch buffer to allow better optimisation.
99 * These versions take fewer cycles but consume more memory (scratch memory) than the standard Q15 and Q7 versions of correlate.
100 */
101
102 /**
103 * @addtogroup Corr
104 * @{
105 */
/**
 * @brief Correlation of floating-point sequences.
 * @param[in]  *pSrcA points to the first input sequence.
 * @param[in]  srcALen length of the first input sequence.
 * @param[in]  *pSrcB points to the second input sequence.
 * @param[in]  srcBLen length of the second input sequence.
 * @param[out] *pDst points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
 * @return none.
 *
 * \par
 * Only the <code>srcALen + srcBLen - 1</code> non-trivial output samples are written.
 * The remaining <code>|srcALen - srcBLen|</code> samples (at the start of <code>pDst</code> when
 * <code>srcALen > srcBLen</code>, at the end when <code>srcALen < srcBLen</code>) are left untouched,
 * which is why <code>pDst</code> must be zero-initialised by the caller.
 *
 * \par
 * If either <code>srcALen</code> or <code>srcBLen</code> is zero the function returns
 * without reading the inputs or writing to <code>pDst</code>.
 */

void arm_correlate_f32(
  float32_t * pSrcA,
  uint32_t srcALen,
  float32_t * pSrcB,
  uint32_t srcBLen,
  float32_t * pDst)
{

#ifndef ARM_MATH_CM0_FAMILY

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  float32_t *pIn1;                               /* inputA pointer (the longer sequence)  */
  float32_t *pIn2;                               /* inputB pointer (the shorter sequence) */
  float32_t *pOut = pDst;                        /* output pointer                        */
  float32_t *px;                                 /* Intermediate inputA pointer           */
  float32_t *py;                                 /* Intermediate inputB pointer           */
  float32_t *pSrc1;                              /* Intermediate pointers                 */
  float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulators                          */
  float32_t x0, x1, x2, x3, c0;                  /* temporary variables for holding input and coefficient values */
  uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counters */
  int32_t inc = 1;                               /* Destination address modifier          */

  /* An empty input has no correlation samples.  Bail out before any
   * "length - 1" expression below can wrap around to 0xFFFFFFFF and be
   * used as a loop count or pointer offset. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  /* The algorithm implementation is based on the lengths of the inputs.
   * srcB is always made to slide across srcA, so srcBLen is always treated
   * as shorter than or equal to srcALen.
   * CORR(x, y) is the reverse of CORR(y, x), so when srcBLen > srcALen the
   * inputs are swapped, the output pointer is set to the last computed
   * sample and the destination address modifier, inc, is set to -1.
   * Instead of zero padding the shorter input, the corresponding output
   * samples are simply left as zeros:
   *   - if srcALen > srcBLen, (srcALen - srcBLen) zeros sit at the start of
   *     the output buffer;
   *   - if srcALen < srcBLen, (srcBLen - srcALen) zeros sit at the end of
   *     the output buffer. */
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcA;

    /* Initialization of inputB pointer */
    pIn2 = pSrcB;

    /* Number of output samples is calculated */
    outBlockSize = (2u * srcALen) - 1u;

    /* (outBlockSize - (srcALen + srcBLen - 1)) leading output samples
     * correspond to the zero padding of srcB. */
    j = outBlockSize - (srcALen + (srcBLen - 1u));

    /* Skip over the leading zeros.  They are NOT written here; the caller
     * is required to pass a zero-initialised pDst. */
    pOut += j;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = pSrcB;

    /* Initialization of inputB pointer */
    pIn2 = pSrcA;

    /* Swap the lengths so that srcBLen is the shorter one */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) = Reverse order(CORR(y, x)) */
    /* Hence set the destination pointer to point to the last computed output sample */
    pOut = pDst + ((srcALen + srcBLen) - 2u);

    /* Destination address modifier is set to -1 */
    inc = -1;
  }

  /* The function is internally divided into three stages according to the
   * number of multiplications performed between inputA and inputB samples:
   *   stage 1: the number of MACs grows by one for every output sample;
   *   stage 2: exactly srcBLen MACs are done for every output sample;
   *   stage 3: the number of MACs shrinks by one for every output sample.
   * The loop counters of each stage are initialised here. */
  blockSize1 = srcBLen - 1u;
  blockSize2 = srcALen - (srcBLen - 1u);
  blockSize3 = blockSize1;

  /* --------------------------
   * Initializations of stage1
   * -------------------------*/

  /* sum = x[0] * y[srcBLen - 1]
   * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 1]
   * ....
   * sum = x[0] * y[1] + x[1] * y[2] +...+ x[srcBLen - 2] * y[srcBLen - 1]
   */

  /* The count variable holds the number of MAC operations performed for
   * the current output sample. */
  count = 1u;

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB: starts at the last sample of the shorter input */
  pSrc1 = pIn2 + (srcBLen - 1u);
  py = pSrc1;

  /* ------------------------
   * Stage1 process
   * ----------------------*/

  while(blockSize1 > 0u)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;

    /* Apply loop unrolling and compute 4 MACs at a time. */
    k = count >> 2u;

    while(k > 0u)
    {
      sum += *px++ * *py++;
      sum += *px++ * *py++;
      sum += *px++ * *py++;
      sum += *px++ * *py++;

      /* Decrement the loop counter */
      k--;
    }

    /* If the count is not a multiple of 4, compute the remaining 1 to 3
     * MACs here without loop unrolling. */
    k = count % 0x4u;

    while(k > 0u)
    {
      /* Perform the multiply-accumulate */
      sum += *px++ * *py++;

      /* Decrement the loop counter */
      k--;
    }

    /* Store the result in the destination buffer. */
    *pOut = sum;
    /* Destination pointer is updated according to the address modifier, inc */
    pOut += inc;

    /* Next output sample starts one inputB sample earlier and again at x[0] */
    py = pSrc1 - count;
    px = pIn1;

    /* Increment the MAC count */
    count++;

    /* Decrement the loop counter */
    blockSize1--;
  }

  /* --------------------------
   * Initializations of stage2
   * ------------------------*/

  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
   * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
   * ....
   * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
   */

  /* Working pointer of inputA */
  px = pIn1;

  /* Working pointer of inputB */
  py = pIn2;

  /* count is the index by which the pointer pIn1 is advanced */
  count = 0u;

  /* -------------------
   * Stage2 process
   * ------------------*/

  /* Stage2 performs srcBLen MACs per output sample.  The four-outputs-at-a-
   * time path below runs its inner loop at least once (do/while), so it is
   * only taken when srcBLen is greater than or equal to 4. */
  if(srcBLen >= 4u)
  {
    /* Loop unroll over blockSize2, by 4 */
    blkCnt = blockSize2 >> 2u;

    while(blkCnt > 0u)
    {
      /* Set all accumulators to zero */
      acc0 = 0.0f;
      acc1 = 0.0f;
      acc2 = 0.0f;
      acc3 = 0.0f;

      /* read x[0], x[1], x[2] samples */
      x0 = *(px++);
      x1 = *(px++);
      x2 = *(px++);

      /* Apply loop unrolling and compute 4 MACs per accumulator per pass. */
      k = srcBLen >> 2u;

      /* k is at least 1 here because srcBLen >= 4.  The x0..x3 registers are
       * rotated so each input sample is loaded from memory only once. */
      do
      {
        /* Read y[0] sample */
        c0 = *(py++);

        /* Read x[3] sample */
        x3 = *(px++);

        /* acc0 += x[0] * y[0] */
        acc0 += x0 * c0;
        /* acc1 += x[1] * y[0] */
        acc1 += x1 * c0;
        /* acc2 += x[2] * y[0] */
        acc2 += x2 * c0;
        /* acc3 += x[3] * y[0] */
        acc3 += x3 * c0;

        /* Read y[1] sample */
        c0 = *(py++);

        /* Read x[4] sample */
        x0 = *(px++);

        /* acc0 += x[1] * y[1] */
        acc0 += x1 * c0;
        /* acc1 += x[2] * y[1] */
        acc1 += x2 * c0;
        /* acc2 += x[3] * y[1] */
        acc2 += x3 * c0;
        /* acc3 += x[4] * y[1] */
        acc3 += x0 * c0;

        /* Read y[2] sample */
        c0 = *(py++);

        /* Read x[5] sample */
        x1 = *(px++);

        /* acc0 += x[2] * y[2] */
        acc0 += x2 * c0;
        /* acc1 += x[3] * y[2] */
        acc1 += x3 * c0;
        /* acc2 += x[4] * y[2] */
        acc2 += x0 * c0;
        /* acc3 += x[5] * y[2] */
        acc3 += x1 * c0;

        /* Read y[3] sample */
        c0 = *(py++);

        /* Read x[6] sample */
        x2 = *(px++);

        /* acc0 += x[3] * y[3] */
        acc0 += x3 * c0;
        /* acc1 += x[4] * y[3] */
        acc1 += x0 * c0;
        /* acc2 += x[5] * y[3] */
        acc2 += x1 * c0;
        /* acc3 += x[6] * y[3] */
        acc3 += x2 * c0;

      } while(--k);

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       * No loop unrolling is used. */
      k = srcBLen % 0x4u;

      while(k > 0u)
      {
        /* Read y[4] sample */
        c0 = *(py++);

        /* Read x[7] sample */
        x3 = *(px++);

        /* acc0 += x[4] * y[4] */
        acc0 += x0 * c0;
        /* acc1 += x[5] * y[4] */
        acc1 += x1 * c0;
        /* acc2 += x[6] * y[4] */
        acc2 += x2 * c0;
        /* acc3 += x[7] * y[4] */
        acc3 += x3 * c0;

        /* Reuse the present samples for the next MAC */
        x0 = x1;
        x1 = x2;
        x2 = x3;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the four results in the destination buffer, stepping the
       * destination pointer by the address modifier, inc, after each one. */
      *pOut = acc0;
      pOut += inc;

      *pOut = acc1;
      pOut += inc;

      *pOut = acc2;
      pOut += inc;

      *pOut = acc3;
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 4 */
      count += 4u;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement the loop counter */
      blkCnt--;
    }

    /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
     * No loop unrolling over the outputs is used. */
    blkCnt = blockSize2 % 0x4u;

    while(blkCnt > 0u)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

      /* Apply loop unrolling and compute 4 MACs at a time. */
      k = srcBLen >> 2u;

      while(k > 0u)
      {
        /* Perform the multiply-accumulates */
        sum += *px++ * *py++;
        sum += *px++ * *py++;
        sum += *px++ * *py++;
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
       * No loop unrolling is used. */
      k = srcBLen % 0x4u;

      while(k > 0u)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the destination buffer. */
      *pOut = sum;
      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 1 */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }
  else
  {
    /* srcBLen is less than 4, so the blockSize2 loop is not unrolled:
     * one output sample per iteration, one MAC per inner iteration. */
    blkCnt = blockSize2;

    while(blkCnt > 0u)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0.0f;

      /* Loop over srcBLen */
      k = srcBLen;

      while(k > 0u)
      {
        /* Perform the multiply-accumulate */
        sum += *px++ * *py++;

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the destination buffer. */
      *pOut = sum;
      /* Destination pointer is updated according to the address modifier, inc */
      pOut += inc;

      /* Increment the pointer pIn1 index, count by 1 */
      count++;

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = pIn1 + count;
      py = pIn2;

      /* Decrement the loop counter */
      blkCnt--;
    }
  }

  /* --------------------------
   * Initializations of stage3
   * -------------------------*/

  /* sum = x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
   * sum = x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
   * ....
   * sum = x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
   * sum = x[srcALen-1] * y[0]
   */

  /* In this stage the MAC operations are decreased by 1 for every iteration.
   * The count variable holds the number of MAC operations performed */
  count = srcBLen - 1u;

  /* Working pointer of inputA */
  pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
  px = pSrc1;

  /* Working pointer of inputB */
  py = pIn2;

  /* -------------------
   * Stage3 process
   * ------------------*/

  while(blockSize3 > 0u)
  {
    /* Accumulator is made zero for every iteration */
    sum = 0.0f;

    /* Apply loop unrolling and compute 4 MACs at a time. */
    k = count >> 2u;

    while(k > 0u)
    {
      /* Perform the multiply-accumulates */
      sum += *px++ * *py++;
      sum += *px++ * *py++;
      sum += *px++ * *py++;
      sum += *px++ * *py++;

      /* Decrement the loop counter */
      k--;
    }

    /* If the count is not a multiple of 4, compute any remaining MACs here.
     * No loop unrolling is used. */
    k = count % 0x4u;

    while(k > 0u)
    {
      /* Perform the multiply-accumulate */
      sum += *px++ * *py++;

      /* Decrement the loop counter */
      k--;
    }

    /* Store the result in the destination buffer. */
    *pOut = sum;
    /* Destination pointer is updated according to the address modifier, inc */
    pOut += inc;

    /* Next output sample starts one inputA sample later and again at y[0] */
    px = ++pSrc1;
    py = pIn2;

    /* Decrement the MAC count */
    count--;

    /* Decrement the loop counter */
    blockSize3--;
  }

#else

  /* Run the below code for Cortex-M0 */

  float32_t *pIn1;                               /* inputA pointer                    */
  float32_t *pIn2;                               /* inputB pointer (at its last sample) */
  float32_t sum;                                 /* Accumulator                       */
  uint32_t i = 0u, j;                            /* loop counters                     */
  uint32_t inv = 0u;                             /* Reverse order flag                */
  uint32_t tot = 0u;                             /* Index of the last computed sample */

  /* An empty input has no correlation samples.  Bail out before any
   * "length - 1" expression below can wrap around. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  pIn1 = pSrcA;
  pIn2 = pSrcB + (srcBLen - 1u);

  /* The algorithm implementation is based on the lengths of the inputs.
   * srcB is always made to slide across srcA, so srcBLen is always treated
   * as shorter than or equal to srcALen.
   * CORR(x, y) is the reverse of CORR(y, x), so when srcBLen > srcALen the
   * output pointer is made to point to the last computed sample and the
   * flag inv is set to 1 so the output is written backwards.
   * Instead of zero padding the shorter input, the corresponding output
   * samples are left as zeros:
   *   - if srcALen > srcBLen, (srcALen - srcBLen) zeros sit at the start of
   *     the output buffer;
   *   - if srcALen < srcBLen, (srcBLen - srcALen) zeros sit at the end of
   *     the output buffer.
   * The remaining output is calculated as a convolution with the shorter
   * signal time-reversed. */

  /* Index of the last computed output sample */
  tot = ((srcALen + srcBLen) - 2u);

  if(srcALen > srcBLen)
  {
    /* Calculating the number of zeros to be padded to the output */
    j = srcALen - srcBLen;

    /* Skip over the leading zeros (not written; pDst must be pre-zeroed) */
    pDst += j;
  }
  else if(srcALen < srcBLen)
  {
    /* Initialization to inputB pointer */
    pIn1 = pSrcB;

    /* Initialization to the end of inputA pointer */
    pIn2 = pSrcA + (srcALen - 1u);

    /* Start at the last computed output sample and write backwards */
    pDst = pDst + tot;

    /* Swapping the lengths */
    j = srcALen;
    srcALen = srcBLen;
    srcBLen = j;

    /* Setting the reverse flag */
    inv = 1u;
  }

  /* Loop to calculate convolution for output length number of times */
  for (i = 0u; i <= tot; i++)
  {
    /* Initialize sum with zero to carry on MAC operations */
    sum = 0.0f;

    /* Loop to perform MAC operations according to convolution equation */
    for (j = 0u; j <= i; j++)
    {
      /* Check the array limitations */
      if(((i - j) < srcBLen) && (j < srcALen))
      {
        /* z[i] += x[j] * y[srcBLen - 1 - (i - j)]
         * Both operands are cast to int32_t so that the negative offset is
         * computed in signed arithmetic; mixing int32_t with uint32_t would
         * yield a huge unsigned index on targets with pointers wider than
         * 32 bits. */
        sum += pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)];
      }
    }

    /* Store the output in the destination buffer */
    if(inv == 1u)
    {
      *pDst-- = sum;
    }
    else
    {
      *pDst++ = sum;
    }
  }

#endif /*   #ifndef ARM_MATH_CM0_FAMILY */

}
736
737 /**
738 * @} end of Corr group
739 */
Imprint / Impressum