]> git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_f32.c
Merge commit '1fe4406f374291ab2e86e95a97341fd9c475fcb8'
[tmk_keyboard.git] / tmk_core / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_conv_f32.c
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_f32.c
9 *
10 * Description: Convolution of floating-point sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
40
41 #include "arm_math.h"
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @defgroup Conv Convolution
49 *
50 * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
51 * Convolution is similar to correlation and is frequently used in filtering and data analysis.
52 * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
53 * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
54 *
55 * \par Algorithm
56 * Let <code>a[n]</code> and <code>b[n]</code> be sequences of length <code>srcALen</code> and <code>srcBLen</code> samples respectively.
57 * Then the convolution
58 *
59 * <pre>
60 * c[n] = a[n] * b[n]
61 * </pre>
62 *
63 * \par
64 * is defined as
65 * \image html ConvolutionEquation.gif
66 * \par
67 * Note that <code>c[n]</code> is of length <code>srcALen + srcBLen - 1</code> and is defined over the interval <code>n=0, 1, 2, ..., srcALen + srcBLen - 2</code>.
68 * <code>pSrcA</code> points to the first input vector of length <code>srcALen</code> and
69 * <code>pSrcB</code> points to the second input vector of length <code>srcBLen</code>.
70 * The output result is written to <code>pDst</code> and the calling function must allocate <code>srcALen+srcBLen-1</code> words for the result.
71 *
72 * \par
73 * Conceptually, when two signals <code>a[n]</code> and <code>b[n]</code> are convolved,
74 * the signal <code>b[n]</code> slides over <code>a[n]</code>.
75 * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
76 *
77 * \par
78 * Note that convolution is a commutative operation:
79 *
80 * <pre>
81 * a[n] * b[n] = b[n] * a[n].
82 * </pre>
83 *
84 * \par
85 * This means that switching the A and B arguments to the convolution functions has no effect.
86 *
87 * <b>Fixed-Point Behavior</b>
88 *
89 * \par
90 * Convolution requires summing up a large number of intermediate products.
91 * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
92 * Refer to the function specific documentation below for further details of the particular algorithm used.
93 *
94 *
95 * <b>Fast Versions</b>
96 *
97 * \par
98 * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
99 * the input signals should be scaled down to avoid intermediate overflows.
100 *
101 *
102 * <b>Opt Versions</b>
103 *
104 * \par
105 * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
106 * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions
107 */
108
109 /**
110 * @addtogroup Conv
111 * @{
112 */
113
114 /**
115 * @brief Convolution of floating-point sequences.
116 * @param[in] *pSrcA points to the first input sequence.
117 * @param[in] srcALen length of the first input sequence.
118 * @param[in] *pSrcB points to the second input sequence.
119 * @param[in] srcBLen length of the second input sequence.
120 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
121 * @return none.
122 */
123
124 void arm_conv_f32(
125 float32_t * pSrcA,
126 uint32_t srcALen,
127 float32_t * pSrcB,
128 uint32_t srcBLen,
129 float32_t * pDst)
130 {
131
132
133 #ifndef ARM_MATH_CM0_FAMILY
134
135 /* Run the below code for Cortex-M4 and Cortex-M3 */
136
137 float32_t *pIn1; /* inputA pointer */
138 float32_t *pIn2; /* inputB pointer */
139 float32_t *pOut = pDst; /* output pointer */
140 float32_t *px; /* Intermediate inputA pointer */
141 float32_t *py; /* Intermediate inputB pointer */
142 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
143 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
144 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
145 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */
146
147 /* The algorithm implementation is based on the lengths of the inputs. */
148 /* srcB is always made to slide across srcA. */
149 /* So srcBLen is always considered as shorter or equal to srcALen */
150 if(srcALen >= srcBLen)
151 {
152 /* Initialization of inputA pointer */
153 pIn1 = pSrcA;
154
155 /* Initialization of inputB pointer */
156 pIn2 = pSrcB;
157 }
158 else
159 {
160 /* Initialization of inputA pointer */
161 pIn1 = pSrcB;
162
163 /* Initialization of inputB pointer */
164 pIn2 = pSrcA;
165
166 /* srcBLen is always considered as shorter or equal to srcALen */
167 j = srcBLen;
168 srcBLen = srcALen;
169 srcALen = j;
170 }
171
172 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
173 /* The function is internally
174 * divided into three stages according to the number of multiplications that has to be
175 * taken place between inputA samples and inputB samples. In the first stage of the
176 * algorithm, the multiplications increase by one for every iteration.
177 * In the second stage of the algorithm, srcBLen number of multiplications are done.
178 * In the third stage of the algorithm, the multiplications decrease by one
179 * for every iteration. */
180
181 /* The algorithm is implemented in three stages.
182 The loop counters of each stage is initiated here. */
183 blockSize1 = srcBLen - 1u;
184 blockSize2 = srcALen - (srcBLen - 1u);
185 blockSize3 = blockSize1;
186
187 /* --------------------------
188 * initializations of stage1
189 * -------------------------*/
190
191 /* sum = x[0] * y[0]
192 * sum = x[0] * y[1] + x[1] * y[0]
193 * ....
194 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
195 */
196
197 /* In this stage the MAC operations are increased by 1 for every iteration.
198 The count variable holds the number of MAC operations performed */
199 count = 1u;
200
201 /* Working pointer of inputA */
202 px = pIn1;
203
204 /* Working pointer of inputB */
205 py = pIn2;
206
207
208 /* ------------------------
209 * Stage1 process
210 * ----------------------*/
211
212 /* The first stage starts here */
213 while(blockSize1 > 0u)
214 {
215 /* Accumulator is made zero for every iteration */
216 sum = 0.0f;
217
218 /* Apply loop unrolling and compute 4 MACs simultaneously. */
219 k = count >> 2u;
220
221 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
222 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
223 while(k > 0u)
224 {
225 /* x[0] * y[srcBLen - 1] */
226 sum += *px++ * *py--;
227
228 /* x[1] * y[srcBLen - 2] */
229 sum += *px++ * *py--;
230
231 /* x[2] * y[srcBLen - 3] */
232 sum += *px++ * *py--;
233
234 /* x[3] * y[srcBLen - 4] */
235 sum += *px++ * *py--;
236
237 /* Decrement the loop counter */
238 k--;
239 }
240
241 /* If the count is not a multiple of 4, compute any remaining MACs here.
242 ** No loop unrolling is used. */
243 k = count % 0x4u;
244
245 while(k > 0u)
246 {
247 /* Perform the multiply-accumulate */
248 sum += *px++ * *py--;
249
250 /* Decrement the loop counter */
251 k--;
252 }
253
254 /* Store the result in the accumulator in the destination buffer. */
255 *pOut++ = sum;
256
257 /* Update the inputA and inputB pointers for next MAC calculation */
258 py = pIn2 + count;
259 px = pIn1;
260
261 /* Increment the MAC count */
262 count++;
263
264 /* Decrement the loop counter */
265 blockSize1--;
266 }
267
268 /* --------------------------
269 * Initializations of stage2
270 * ------------------------*/
271
272 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
273 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
274 * ....
275 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
276 */
277
278 /* Working pointer of inputA */
279 px = pIn1;
280
281 /* Working pointer of inputB */
282 pSrc2 = pIn2 + (srcBLen - 1u);
283 py = pSrc2;
284
285 /* count is index by which the pointer pIn1 to be incremented */
286 count = 0u;
287
288 /* -------------------
289 * Stage2 process
290 * ------------------*/
291
292 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
293 * So, to loop unroll over blockSize2,
294 * srcBLen should be greater than or equal to 4 */
295 if(srcBLen >= 4u)
296 {
297 /* Loop unroll over blockSize2, by 4 */
298 blkCnt = blockSize2 >> 2u;
299
300 while(blkCnt > 0u)
301 {
302 /* Set all accumulators to zero */
303 acc0 = 0.0f;
304 acc1 = 0.0f;
305 acc2 = 0.0f;
306 acc3 = 0.0f;
307
308 /* read x[0], x[1], x[2] samples */
309 x0 = *(px++);
310 x1 = *(px++);
311 x2 = *(px++);
312
313 /* Apply loop unrolling and compute 4 MACs simultaneously. */
314 k = srcBLen >> 2u;
315
316 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
317 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
318 do
319 {
320 /* Read y[srcBLen - 1] sample */
321 c0 = *(py--);
322
323 /* Read x[3] sample */
324 x3 = *(px);
325
326 /* Perform the multiply-accumulate */
327 /* acc0 += x[0] * y[srcBLen - 1] */
328 acc0 += x0 * c0;
329
330 /* acc1 += x[1] * y[srcBLen - 1] */
331 acc1 += x1 * c0;
332
333 /* acc2 += x[2] * y[srcBLen - 1] */
334 acc2 += x2 * c0;
335
336 /* acc3 += x[3] * y[srcBLen - 1] */
337 acc3 += x3 * c0;
338
339 /* Read y[srcBLen - 2] sample */
340 c0 = *(py--);
341
342 /* Read x[4] sample */
343 x0 = *(px + 1u);
344
345 /* Perform the multiply-accumulate */
346 /* acc0 += x[1] * y[srcBLen - 2] */
347 acc0 += x1 * c0;
348 /* acc1 += x[2] * y[srcBLen - 2] */
349 acc1 += x2 * c0;
350 /* acc2 += x[3] * y[srcBLen - 2] */
351 acc2 += x3 * c0;
352 /* acc3 += x[4] * y[srcBLen - 2] */
353 acc3 += x0 * c0;
354
355 /* Read y[srcBLen - 3] sample */
356 c0 = *(py--);
357
358 /* Read x[5] sample */
359 x1 = *(px + 2u);
360
361 /* Perform the multiply-accumulates */
362 /* acc0 += x[2] * y[srcBLen - 3] */
363 acc0 += x2 * c0;
364 /* acc1 += x[3] * y[srcBLen - 2] */
365 acc1 += x3 * c0;
366 /* acc2 += x[4] * y[srcBLen - 2] */
367 acc2 += x0 * c0;
368 /* acc3 += x[5] * y[srcBLen - 2] */
369 acc3 += x1 * c0;
370
371 /* Read y[srcBLen - 4] sample */
372 c0 = *(py--);
373
374 /* Read x[6] sample */
375 x2 = *(px + 3u);
376 px += 4u;
377
378 /* Perform the multiply-accumulates */
379 /* acc0 += x[3] * y[srcBLen - 4] */
380 acc0 += x3 * c0;
381 /* acc1 += x[4] * y[srcBLen - 4] */
382 acc1 += x0 * c0;
383 /* acc2 += x[5] * y[srcBLen - 4] */
384 acc2 += x1 * c0;
385 /* acc3 += x[6] * y[srcBLen - 4] */
386 acc3 += x2 * c0;
387
388
389 } while(--k);
390
391 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
392 ** No loop unrolling is used. */
393 k = srcBLen % 0x4u;
394
395 while(k > 0u)
396 {
397 /* Read y[srcBLen - 5] sample */
398 c0 = *(py--);
399
400 /* Read x[7] sample */
401 x3 = *(px++);
402
403 /* Perform the multiply-accumulates */
404 /* acc0 += x[4] * y[srcBLen - 5] */
405 acc0 += x0 * c0;
406 /* acc1 += x[5] * y[srcBLen - 5] */
407 acc1 += x1 * c0;
408 /* acc2 += x[6] * y[srcBLen - 5] */
409 acc2 += x2 * c0;
410 /* acc3 += x[7] * y[srcBLen - 5] */
411 acc3 += x3 * c0;
412
413 /* Reuse the present samples for the next MAC */
414 x0 = x1;
415 x1 = x2;
416 x2 = x3;
417
418 /* Decrement the loop counter */
419 k--;
420 }
421
422 /* Store the result in the accumulator in the destination buffer. */
423 *pOut++ = acc0;
424 *pOut++ = acc1;
425 *pOut++ = acc2;
426 *pOut++ = acc3;
427
428 /* Increment the pointer pIn1 index, count by 4 */
429 count += 4u;
430
431 /* Update the inputA and inputB pointers for next MAC calculation */
432 px = pIn1 + count;
433 py = pSrc2;
434
435
436 /* Decrement the loop counter */
437 blkCnt--;
438 }
439
440
441 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
442 ** No loop unrolling is used. */
443 blkCnt = blockSize2 % 0x4u;
444
445 while(blkCnt > 0u)
446 {
447 /* Accumulator is made zero for every iteration */
448 sum = 0.0f;
449
450 /* Apply loop unrolling and compute 4 MACs simultaneously. */
451 k = srcBLen >> 2u;
452
453 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
454 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
455 while(k > 0u)
456 {
457 /* Perform the multiply-accumulates */
458 sum += *px++ * *py--;
459 sum += *px++ * *py--;
460 sum += *px++ * *py--;
461 sum += *px++ * *py--;
462
463 /* Decrement the loop counter */
464 k--;
465 }
466
467 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
468 ** No loop unrolling is used. */
469 k = srcBLen % 0x4u;
470
471 while(k > 0u)
472 {
473 /* Perform the multiply-accumulate */
474 sum += *px++ * *py--;
475
476 /* Decrement the loop counter */
477 k--;
478 }
479
480 /* Store the result in the accumulator in the destination buffer. */
481 *pOut++ = sum;
482
483 /* Increment the MAC count */
484 count++;
485
486 /* Update the inputA and inputB pointers for next MAC calculation */
487 px = pIn1 + count;
488 py = pSrc2;
489
490 /* Decrement the loop counter */
491 blkCnt--;
492 }
493 }
494 else
495 {
496 /* If the srcBLen is not a multiple of 4,
497 * the blockSize2 loop cannot be unrolled by 4 */
498 blkCnt = blockSize2;
499
500 while(blkCnt > 0u)
501 {
502 /* Accumulator is made zero for every iteration */
503 sum = 0.0f;
504
505 /* srcBLen number of MACS should be performed */
506 k = srcBLen;
507
508 while(k > 0u)
509 {
510 /* Perform the multiply-accumulate */
511 sum += *px++ * *py--;
512
513 /* Decrement the loop counter */
514 k--;
515 }
516
517 /* Store the result in the accumulator in the destination buffer. */
518 *pOut++ = sum;
519
520 /* Increment the MAC count */
521 count++;
522
523 /* Update the inputA and inputB pointers for next MAC calculation */
524 px = pIn1 + count;
525 py = pSrc2;
526
527 /* Decrement the loop counter */
528 blkCnt--;
529 }
530 }
531
532
533 /* --------------------------
534 * Initializations of stage3
535 * -------------------------*/
536
537 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
538 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
539 * ....
540 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
541 * sum += x[srcALen-1] * y[srcBLen-1]
542 */
543
544 /* In this stage the MAC operations are decreased by 1 for every iteration.
545 The blockSize3 variable holds the number of MAC operations performed */
546
547 /* Working pointer of inputA */
548 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
549 px = pSrc1;
550
551 /* Working pointer of inputB */
552 pSrc2 = pIn2 + (srcBLen - 1u);
553 py = pSrc2;
554
555 /* -------------------
556 * Stage3 process
557 * ------------------*/
558
559 while(blockSize3 > 0u)
560 {
561 /* Accumulator is made zero for every iteration */
562 sum = 0.0f;
563
564 /* Apply loop unrolling and compute 4 MACs simultaneously. */
565 k = blockSize3 >> 2u;
566
567 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
568 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
569 while(k > 0u)
570 {
571 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
572 sum += *px++ * *py--;
573
574 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
575 sum += *px++ * *py--;
576
577 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
578 sum += *px++ * *py--;
579
580 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
581 sum += *px++ * *py--;
582
583 /* Decrement the loop counter */
584 k--;
585 }
586
587 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
588 ** No loop unrolling is used. */
589 k = blockSize3 % 0x4u;
590
591 while(k > 0u)
592 {
593 /* Perform the multiply-accumulates */
594 /* sum += x[srcALen-1] * y[srcBLen-1] */
595 sum += *px++ * *py--;
596
597 /* Decrement the loop counter */
598 k--;
599 }
600
601 /* Store the result in the accumulator in the destination buffer. */
602 *pOut++ = sum;
603
604 /* Update the inputA and inputB pointers for next MAC calculation */
605 px = ++pSrc1;
606 py = pSrc2;
607
608 /* Decrement the loop counter */
609 blockSize3--;
610 }
611
612 #else
613
614 /* Run the below code for Cortex-M0 */
615
616 float32_t *pIn1 = pSrcA; /* inputA pointer */
617 float32_t *pIn2 = pSrcB; /* inputB pointer */
618 float32_t sum; /* Accumulator */
619 uint32_t i, j; /* loop counters */
620
621 /* Loop to calculate convolution for output length number of times */
622 for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
623 {
624 /* Initialize sum with zero to carry out MAC operations */
625 sum = 0.0f;
626
627 /* Loop to perform MAC operations according to convolution equation */
628 for (j = 0u; j <= i; j++)
629 {
630 /* Check the array limitations */
631 if((((i - j) < srcBLen) && (j < srcALen)))
632 {
633 /* z[i] += x[i-j] * y[j] */
634 sum += pIn1[j] * pIn2[i - j];
635 }
636 }
637 /* Store the output in the destination buffer */
638 pDst[i] = sum;
639 }
640
641 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
642
643 }
644
645 /**
646 * @} end of Conv group
647 */
Imprint / Impressum