]> git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_f32.c
Merge commit '1fe4406f374291ab2e86e95a97341fd9c475fcb8'
[tmk_keyboard.git] / tmk_core / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_conv_partial_f32.c
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_f32.c
9 *
10 * Description: Partial convolution of floating-point sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
40
41 #include "arm_math.h"
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @defgroup PartialConv Partial Convolution
49 *
50 * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
51 * Each function has two additional arguments.
52 * <code>firstIndex</code> specifies the starting index of the subset of output samples.
53 * <code>numPoints</code> is the number of output samples to compute.
54 * The function computes the output in the range
55 * <code>[firstIndex, ..., firstIndex+numPoints-1]</code>.
56 * The output array <code>pDst</code> contains <code>numPoints</code> values.
57 *
58 * The allowable range of output indices is [0 srcALen+srcBLen-2].
59 * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
60 * Otherwise the functions return ARM_MATH_SUCCESS.
61 * \note Refer to arm_conv_f32() for details on fixed-point behavior.
62 *
63 *
64 * <b>Fast Versions</b>
65 *
66 * \par
67 * Fast versions of partial convolution are provided for Q31 and Q15. They take fewer cycles than the standard Q31 and Q15 versions,
68 * but they require the input signals to be scaled down to avoid intermediate overflows.
69 *
70 *
71 * <b>Opt Versions</b>
72 *
73 * \par
74 * Opt versions are provided for Q15 and Q7. They use a scratch buffer to reduce the cycle count,
75 * and therefore consume more memory (scratch memory) than the standard Q15 and Q7 versions of partial convolution.
76 */
77
78 /**
79 * @addtogroup PartialConv
80 * @{
81 */
82
83 /**
84 * @brief Partial convolution of floating-point sequences.
85 * @param[in] *pSrcA points to the first input sequence.
86 * @param[in] srcALen length of the first input sequence.
87 * @param[in] *pSrcB points to the second input sequence.
88 * @param[in] srcBLen length of the second input sequence.
89 * @param[out] *pDst points to the location where the output result is written.
90 * @param[in] firstIndex is the first output sample to start with.
91 * @param[in] numPoints is the number of output points to be computed.
92 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
93 */
94
95 arm_status arm_conv_partial_f32(
96 float32_t * pSrcA,
97 uint32_t srcALen,
98 float32_t * pSrcB,
99 uint32_t srcBLen,
100 float32_t * pDst,
101 uint32_t firstIndex,
102 uint32_t numPoints)
103 {
104
105
106 #ifndef ARM_MATH_CM0_FAMILY
107
108 /* Run the below code for Cortex-M4 and Cortex-M3 */
109
110 float32_t *pIn1 = pSrcA; /* inputA pointer */
111 float32_t *pIn2 = pSrcB; /* inputB pointer */
112 float32_t *pOut = pDst; /* output pointer */
113 float32_t *px; /* Intermediate inputA pointer */
114 float32_t *py; /* Intermediate inputB pointer */
115 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
116 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
117 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
118 uint32_t j, k, count = 0u, blkCnt, check;
119 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
120 arm_status status; /* status of Partial convolution */
121
122
123 /* Check for range of output samples to be calculated */
124 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
125 {
126 /* Set status as ARM_MATH_ARGUMENT_ERROR */
127 status = ARM_MATH_ARGUMENT_ERROR;
128 }
129 else
130 {
131
132 /* The algorithm implementation is based on the lengths of the inputs. */
133 /* srcB is always made to slide across srcA. */
134 /* So srcBLen is always considered as shorter or equal to srcALen */
135 if(srcALen >= srcBLen)
136 {
137 /* Initialization of inputA pointer */
138 pIn1 = pSrcA;
139
140 /* Initialization of inputB pointer */
141 pIn2 = pSrcB;
142 }
143 else
144 {
145 /* Initialization of inputA pointer */
146 pIn1 = pSrcB;
147
148 /* Initialization of inputB pointer */
149 pIn2 = pSrcA;
150
151 /* srcBLen is always considered as shorter or equal to srcALen */
152 j = srcBLen;
153 srcBLen = srcALen;
154 srcALen = j;
155 }
156
157 /* Conditions to check which loopCounter holds
158 * the first and last indices of the output samples to be calculated. */
159 check = firstIndex + numPoints;
160 blockSize3 = (int32_t) check - (int32_t) srcALen;
161 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
162 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
163 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
164 (int32_t) numPoints) : 0;
165 blockSize2 = ((int32_t) check - blockSize3) -
166 (blockSize1 + (int32_t) firstIndex);
167 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
168
169 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
170 /* The function is internally
171 * divided into three stages according to the number of multiplications that has to be
172 * taken place between inputA samples and inputB samples. In the first stage of the
173 * algorithm, the multiplications increase by one for every iteration.
174 * In the second stage of the algorithm, srcBLen number of multiplications are done.
175 * In the third stage of the algorithm, the multiplications decrease by one
176 * for every iteration. */
177
178 /* Set the output pointer to point to the firstIndex
179 * of the output sample to be calculated. */
180 pOut = pDst + firstIndex;
181
182 /* --------------------------
183 * Initializations of stage1
184 * -------------------------*/
185
186 /* sum = x[0] * y[0]
187 * sum = x[0] * y[1] + x[1] * y[0]
188 * ....
189 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
190 */
191
192 /* In this stage the MAC operations are increased by 1 for every iteration.
193 The count variable holds the number of MAC operations performed.
194 Since the partial convolution starts from from firstIndex
195 Number of Macs to be performed is firstIndex + 1 */
196 count = 1u + firstIndex;
197
198 /* Working pointer of inputA */
199 px = pIn1;
200
201 /* Working pointer of inputB */
202 pSrc1 = pIn2 + firstIndex;
203 py = pSrc1;
204
205 /* ------------------------
206 * Stage1 process
207 * ----------------------*/
208
209 /* The first stage starts here */
210 while(blockSize1 > 0)
211 {
212 /* Accumulator is made zero for every iteration */
213 sum = 0.0f;
214
215 /* Apply loop unrolling and compute 4 MACs simultaneously. */
216 k = count >> 2u;
217
218 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
219 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
220 while(k > 0u)
221 {
222 /* x[0] * y[srcBLen - 1] */
223 sum += *px++ * *py--;
224
225 /* x[1] * y[srcBLen - 2] */
226 sum += *px++ * *py--;
227
228 /* x[2] * y[srcBLen - 3] */
229 sum += *px++ * *py--;
230
231 /* x[3] * y[srcBLen - 4] */
232 sum += *px++ * *py--;
233
234 /* Decrement the loop counter */
235 k--;
236 }
237
238 /* If the count is not a multiple of 4, compute any remaining MACs here.
239 ** No loop unrolling is used. */
240 k = count % 0x4u;
241
242 while(k > 0u)
243 {
244 /* Perform the multiply-accumulates */
245 sum += *px++ * *py--;
246
247 /* Decrement the loop counter */
248 k--;
249 }
250
251 /* Store the result in the accumulator in the destination buffer. */
252 *pOut++ = sum;
253
254 /* Update the inputA and inputB pointers for next MAC calculation */
255 py = ++pSrc1;
256 px = pIn1;
257
258 /* Increment the MAC count */
259 count++;
260
261 /* Decrement the loop counter */
262 blockSize1--;
263 }
264
265 /* --------------------------
266 * Initializations of stage2
267 * ------------------------*/
268
269 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
270 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
271 * ....
272 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
273 */
274
275 /* Working pointer of inputA */
276 px = pIn1;
277
278 /* Working pointer of inputB */
279 pSrc2 = pIn2 + (srcBLen - 1u);
280 py = pSrc2;
281
282 /* count is index by which the pointer pIn1 to be incremented */
283 count = 0u;
284
285 /* -------------------
286 * Stage2 process
287 * ------------------*/
288
289 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
290 * So, to loop unroll over blockSize2,
291 * srcBLen should be greater than or equal to 4 */
292 if(srcBLen >= 4u)
293 {
294 /* Loop unroll over blockSize2, by 4 */
295 blkCnt = ((uint32_t) blockSize2 >> 2u);
296
297 while(blkCnt > 0u)
298 {
299 /* Set all accumulators to zero */
300 acc0 = 0.0f;
301 acc1 = 0.0f;
302 acc2 = 0.0f;
303 acc3 = 0.0f;
304
305 /* read x[0], x[1], x[2] samples */
306 x0 = *(px++);
307 x1 = *(px++);
308 x2 = *(px++);
309
310 /* Apply loop unrolling and compute 4 MACs simultaneously. */
311 k = srcBLen >> 2u;
312
313 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
314 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
315 do
316 {
317 /* Read y[srcBLen - 1] sample */
318 c0 = *(py--);
319
320 /* Read x[3] sample */
321 x3 = *(px++);
322
323 /* Perform the multiply-accumulate */
324 /* acc0 += x[0] * y[srcBLen - 1] */
325 acc0 += x0 * c0;
326
327 /* acc1 += x[1] * y[srcBLen - 1] */
328 acc1 += x1 * c0;
329
330 /* acc2 += x[2] * y[srcBLen - 1] */
331 acc2 += x2 * c0;
332
333 /* acc3 += x[3] * y[srcBLen - 1] */
334 acc3 += x3 * c0;
335
336 /* Read y[srcBLen - 2] sample */
337 c0 = *(py--);
338
339 /* Read x[4] sample */
340 x0 = *(px++);
341
342 /* Perform the multiply-accumulate */
343 /* acc0 += x[1] * y[srcBLen - 2] */
344 acc0 += x1 * c0;
345 /* acc1 += x[2] * y[srcBLen - 2] */
346 acc1 += x2 * c0;
347 /* acc2 += x[3] * y[srcBLen - 2] */
348 acc2 += x3 * c0;
349 /* acc3 += x[4] * y[srcBLen - 2] */
350 acc3 += x0 * c0;
351
352 /* Read y[srcBLen - 3] sample */
353 c0 = *(py--);
354
355 /* Read x[5] sample */
356 x1 = *(px++);
357
358 /* Perform the multiply-accumulates */
359 /* acc0 += x[2] * y[srcBLen - 3] */
360 acc0 += x2 * c0;
361 /* acc1 += x[3] * y[srcBLen - 2] */
362 acc1 += x3 * c0;
363 /* acc2 += x[4] * y[srcBLen - 2] */
364 acc2 += x0 * c0;
365 /* acc3 += x[5] * y[srcBLen - 2] */
366 acc3 += x1 * c0;
367
368 /* Read y[srcBLen - 4] sample */
369 c0 = *(py--);
370
371 /* Read x[6] sample */
372 x2 = *(px++);
373
374 /* Perform the multiply-accumulates */
375 /* acc0 += x[3] * y[srcBLen - 4] */
376 acc0 += x3 * c0;
377 /* acc1 += x[4] * y[srcBLen - 4] */
378 acc1 += x0 * c0;
379 /* acc2 += x[5] * y[srcBLen - 4] */
380 acc2 += x1 * c0;
381 /* acc3 += x[6] * y[srcBLen - 4] */
382 acc3 += x2 * c0;
383
384
385 } while(--k);
386
387 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
388 ** No loop unrolling is used. */
389 k = srcBLen % 0x4u;
390
391 while(k > 0u)
392 {
393 /* Read y[srcBLen - 5] sample */
394 c0 = *(py--);
395
396 /* Read x[7] sample */
397 x3 = *(px++);
398
399 /* Perform the multiply-accumulates */
400 /* acc0 += x[4] * y[srcBLen - 5] */
401 acc0 += x0 * c0;
402 /* acc1 += x[5] * y[srcBLen - 5] */
403 acc1 += x1 * c0;
404 /* acc2 += x[6] * y[srcBLen - 5] */
405 acc2 += x2 * c0;
406 /* acc3 += x[7] * y[srcBLen - 5] */
407 acc3 += x3 * c0;
408
409 /* Reuse the present samples for the next MAC */
410 x0 = x1;
411 x1 = x2;
412 x2 = x3;
413
414 /* Decrement the loop counter */
415 k--;
416 }
417
418 /* Store the result in the accumulator in the destination buffer. */
419 *pOut++ = acc0;
420 *pOut++ = acc1;
421 *pOut++ = acc2;
422 *pOut++ = acc3;
423
424 /* Increment the pointer pIn1 index, count by 1 */
425 count += 4u;
426
427 /* Update the inputA and inputB pointers for next MAC calculation */
428 px = pIn1 + count;
429 py = pSrc2;
430
431 /* Decrement the loop counter */
432 blkCnt--;
433 }
434
435 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
436 ** No loop unrolling is used. */
437 blkCnt = (uint32_t) blockSize2 % 0x4u;
438
439 while(blkCnt > 0u)
440 {
441 /* Accumulator is made zero for every iteration */
442 sum = 0.0f;
443
444 /* Apply loop unrolling and compute 4 MACs simultaneously. */
445 k = srcBLen >> 2u;
446
447 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
448 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
449 while(k > 0u)
450 {
451 /* Perform the multiply-accumulates */
452 sum += *px++ * *py--;
453 sum += *px++ * *py--;
454 sum += *px++ * *py--;
455 sum += *px++ * *py--;
456
457 /* Decrement the loop counter */
458 k--;
459 }
460
461 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
462 ** No loop unrolling is used. */
463 k = srcBLen % 0x4u;
464
465 while(k > 0u)
466 {
467 /* Perform the multiply-accumulate */
468 sum += *px++ * *py--;
469
470 /* Decrement the loop counter */
471 k--;
472 }
473
474 /* Store the result in the accumulator in the destination buffer. */
475 *pOut++ = sum;
476
477 /* Increment the MAC count */
478 count++;
479
480 /* Update the inputA and inputB pointers for next MAC calculation */
481 px = pIn1 + count;
482 py = pSrc2;
483
484 /* Decrement the loop counter */
485 blkCnt--;
486 }
487 }
488 else
489 {
490 /* If the srcBLen is not a multiple of 4,
491 * the blockSize2 loop cannot be unrolled by 4 */
492 blkCnt = (uint32_t) blockSize2;
493
494 while(blkCnt > 0u)
495 {
496 /* Accumulator is made zero for every iteration */
497 sum = 0.0f;
498
499 /* srcBLen number of MACS should be performed */
500 k = srcBLen;
501
502 while(k > 0u)
503 {
504 /* Perform the multiply-accumulate */
505 sum += *px++ * *py--;
506
507 /* Decrement the loop counter */
508 k--;
509 }
510
511 /* Store the result in the accumulator in the destination buffer. */
512 *pOut++ = sum;
513
514 /* Increment the MAC count */
515 count++;
516
517 /* Update the inputA and inputB pointers for next MAC calculation */
518 px = pIn1 + count;
519 py = pSrc2;
520
521 /* Decrement the loop counter */
522 blkCnt--;
523 }
524 }
525
526
527 /* --------------------------
528 * Initializations of stage3
529 * -------------------------*/
530
531 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
532 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
533 * ....
534 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
535 * sum += x[srcALen-1] * y[srcBLen-1]
536 */
537
538 /* In this stage the MAC operations are decreased by 1 for every iteration.
539 The count variable holds the number of MAC operations performed */
540 count = srcBLen - 1u;
541
542 /* Working pointer of inputA */
543 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
544 px = pSrc1;
545
546 /* Working pointer of inputB */
547 pSrc2 = pIn2 + (srcBLen - 1u);
548 py = pSrc2;
549
550 while(blockSize3 > 0)
551 {
552 /* Accumulator is made zero for every iteration */
553 sum = 0.0f;
554
555 /* Apply loop unrolling and compute 4 MACs simultaneously. */
556 k = count >> 2u;
557
558 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
559 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
560 while(k > 0u)
561 {
562 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
563 sum += *px++ * *py--;
564
565 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
566 sum += *px++ * *py--;
567
568 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
569 sum += *px++ * *py--;
570
571 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
572 sum += *px++ * *py--;
573
574 /* Decrement the loop counter */
575 k--;
576 }
577
578 /* If the count is not a multiple of 4, compute any remaining MACs here.
579 ** No loop unrolling is used. */
580 k = count % 0x4u;
581
582 while(k > 0u)
583 {
584 /* Perform the multiply-accumulates */
585 /* sum += x[srcALen-1] * y[srcBLen-1] */
586 sum += *px++ * *py--;
587
588 /* Decrement the loop counter */
589 k--;
590 }
591
592 /* Store the result in the accumulator in the destination buffer. */
593 *pOut++ = sum;
594
595 /* Update the inputA and inputB pointers for next MAC calculation */
596 px = ++pSrc1;
597 py = pSrc2;
598
599 /* Decrement the MAC count */
600 count--;
601
602 /* Decrement the loop counter */
603 blockSize3--;
604
605 }
606
607 /* set status as ARM_MATH_SUCCESS */
608 status = ARM_MATH_SUCCESS;
609 }
610
611 /* Return to application */
612 return (status);
613
614 #else
615
616 /* Run the below code for Cortex-M0 */
617
618 float32_t *pIn1 = pSrcA; /* inputA pointer */
619 float32_t *pIn2 = pSrcB; /* inputB pointer */
620 float32_t sum; /* Accumulator */
621 uint32_t i, j; /* loop counters */
622 arm_status status; /* status of Partial convolution */
623
624 /* Check for range of output samples to be calculated */
625 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
626 {
627 /* Set status as ARM_ARGUMENT_ERROR */
628 status = ARM_MATH_ARGUMENT_ERROR;
629 }
630 else
631 {
632 /* Loop to calculate convolution for output length number of values */
633 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
634 {
635 /* Initialize sum with zero to carry on MAC operations */
636 sum = 0.0f;
637
638 /* Loop to perform MAC operations according to convolution equation */
639 for (j = 0u; j <= i; j++)
640 {
641 /* Check the array limitations for inputs */
642 if((((i - j) < srcBLen) && (j < srcALen)))
643 {
644 /* z[i] += x[i-j] * y[j] */
645 sum += pIn1[j] * pIn2[i - j];
646 }
647 }
648 /* Store the output in the destination buffer */
649 pDst[i] = sum;
650 }
651 /* set status as ARM_SUCCESS as there are no argument errors */
652 status = ARM_MATH_SUCCESS;
653 }
654 return (status);
655
656 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
657
658 }
659
660 /**
661 * @} end of PartialConv group
662 */
Imprint / Impressum