]>
git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_q7.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
10 * Description: Convolution of Q7 sequences.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
53 * @brief Convolution of Q7 sequences.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
62 * <b>Scaling and Overflow Behavior:</b>
65 * The function is implemented using a 32-bit internal accumulator.
66 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
67 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
68 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
69 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
72 * Refer to the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
85 #ifndef ARM_MATH_CM0_FAMILY
87 /* Run the below code for Cortex-M4 and Cortex-M3 */
89 q7_t
*pIn1
; /* inputA pointer */
90 q7_t
*pIn2
; /* inputB pointer */
91 q7_t
*pOut
= pDst
; /* output pointer */
92 q7_t
*px
; /* Intermediate inputA pointer */
93 q7_t
*py
; /* Intermediate inputB pointer */
94 q7_t
*pSrc1
, *pSrc2
; /* Intermediate pointers */
95 q7_t x0
, x1
, x2
, x3
, c0
, c1
; /* Temporary variables to hold state and coefficient values */
96 q31_t sum
, acc0
, acc1
, acc2
, acc3
; /* Accumulator */
97 q31_t input1
, input2
; /* Temporary input variables */
98 q15_t in1
, in2
; /* Temporary input variables */
99 uint32_t j
, k
, count
, blkCnt
, blockSize1
, blockSize2
, blockSize3
; /* loop counter */
101 /* The algorithm implementation is based on the lengths of the inputs. */
102 /* srcB is always made to slide across srcA. */
103 /* So srcBLen is always considered to be shorter than or equal to srcALen */
104 if(srcALen
>= srcBLen
)
106 /* Initialization of inputA pointer */
109 /* Initialization of inputB pointer */
114 /* Initialization of inputA pointer */
117 /* Initialization of inputB pointer */
120 /* srcBLen is always considered as shorter or equal to srcALen */
126 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
127 /* The function is internally
128 * divided into three stages according to the number of multiplications that have to
129 * take place between inputA samples and inputB samples. In the first stage of the
130 * algorithm, the multiplications increase by one for every iteration.
131 * In the second stage of the algorithm, srcBLen number of multiplications are done.
132 * In the third stage of the algorithm, the multiplications decrease by one
133 * for every iteration. */
135 /* The algorithm is implemented in three stages.
136 The loop counters of each stage are initialized here. */
137 blockSize1
= srcBLen
- 1u;
138 blockSize2
= (srcALen
- srcBLen
) + 1u;
139 blockSize3
= blockSize1
;
141 /* --------------------------
142 * Initializations of stage1
143 * -------------------------*/
146 * sum = x[0] * y[1] + x[1] * y[0]
148 * sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
151 /* In this stage the MAC operations are increased by 1 for every iteration.
152 The count variable holds the number of MAC operations performed */
155 /* Working pointer of inputA */
158 /* Working pointer of inputB */
162 /* ------------------------
164 * ----------------------*/
166 /* The first stage starts here */
167 while(blockSize1
> 0u)
169 /* Accumulator is made zero for every iteration */
172 /* Apply loop unrolling and compute 4 MACs simultaneously. */
175 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
176 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
180 in1
= (q15_t
) * px
++;
181 in2
= (q15_t
) * px
++;
182 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
184 /* y[srcBLen - 1] , y[srcBLen - 2] */
185 in1
= (q15_t
) * py
--;
186 in2
= (q15_t
) * py
--;
187 input2
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
189 /* x[0] * y[srcBLen - 1] */
190 /* x[1] * y[srcBLen - 2] */
191 sum
= __SMLAD(input1
, input2
, sum
);
194 in1
= (q15_t
) * px
++;
195 in2
= (q15_t
) * px
++;
196 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
198 /* y[srcBLen - 3] , y[srcBLen - 4] */
199 in1
= (q15_t
) * py
--;
200 in2
= (q15_t
) * py
--;
201 input2
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
203 /* x[2] * y[srcBLen - 3] */
204 /* x[3] * y[srcBLen - 4] */
205 sum
= __SMLAD(input1
, input2
, sum
);
207 /* Decrement the loop counter */
211 /* If the count is not a multiple of 4, compute any remaining MACs here.
212 ** No loop unrolling is used. */
217 /* Perform the multiply-accumulates */
218 sum
+= ((q15_t
) * px
++ * *py
--);
220 /* Decrement the loop counter */
224 /* Store the result in the accumulator in the destination buffer. */
225 *pOut
++ = (q7_t
) (__SSAT(sum
>> 7u, 8));
227 /* Update the inputA and inputB pointers for next MAC calculation */
231 /* Increment the MAC count */
234 /* Decrement the loop counter */
238 /* --------------------------
239 * Initializations of stage2
240 * ------------------------*/
242 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
243 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
245 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
248 /* Working pointer of inputA */
251 /* Working pointer of inputB */
252 pSrc2
= pIn2
+ (srcBLen
- 1u);
255 /* count is the index by which the pointer pIn1 is to be incremented */
258 /* -------------------
260 * ------------------*/
262 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
263 * So, to loop unroll over blockSize2,
264 * srcBLen should be greater than or equal to 4 */
267 /* Loop unroll over blockSize2, by 4 */
268 blkCnt
= blockSize2
>> 2u;
272 /* Set all accumulators to zero */
278 /* read x[0], x[1], x[2] samples */
283 /* Apply loop unrolling and compute 4 MACs simultaneously. */
286 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
287 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
290 /* Read y[srcBLen - 1] sample */
292 /* Read y[srcBLen - 2] sample */
295 /* Read x[3] sample */
298 /* x[0] and x[1] are packed */
302 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
304 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
308 input2
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
310 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
311 acc0
= __SMLAD(input1
, input2
, acc0
);
313 /* x[1] and x[2] are packed */
317 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
319 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
320 acc1
= __SMLAD(input1
, input2
, acc1
);
322 /* x[2] and x[3] are packed */
326 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
328 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
329 acc2
= __SMLAD(input1
, input2
, acc2
);
331 /* Read x[4] sample */
334 /* x[3] and x[4] are packed */
338 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
340 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
341 acc3
= __SMLAD(input1
, input2
, acc3
);
343 /* Read y[srcBLen - 3] sample */
345 /* Read y[srcBLen - 4] sample */
348 /* Read x[5] sample */
351 /* x[2] and x[3] are packed */
355 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
357 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
361 input2
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
363 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
364 acc0
= __SMLAD(input1
, input2
, acc0
);
366 /* x[3] and x[4] are packed */
370 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
372 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
373 acc1
= __SMLAD(input1
, input2
, acc1
);
375 /* x[4] and x[5] are packed */
379 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
381 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
382 acc2
= __SMLAD(input1
, input2
, acc2
);
384 /* Read x[6] sample */
387 /* x[5] and x[6] are packed */
391 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
393 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
394 acc3
= __SMLAD(input1
, input2
, acc3
);
398 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
399 ** No loop unrolling is used. */
404 /* Read y[srcBLen - 5] sample */
407 /* Read x[7] sample */
410 /* Perform the multiply-accumulates */
411 /* acc0 += x[4] * y[srcBLen - 5] */
412 acc0
+= ((q15_t
) x0
* c0
);
413 /* acc1 += x[5] * y[srcBLen - 5] */
414 acc1
+= ((q15_t
) x1
* c0
);
415 /* acc2 += x[6] * y[srcBLen - 5] */
416 acc2
+= ((q15_t
) x2
* c0
);
417 /* acc3 += x[7] * y[srcBLen - 5] */
418 acc3
+= ((q15_t
) x3
* c0
);
420 /* Reuse the present samples for the next MAC */
425 /* Decrement the loop counter */
430 /* Store the result in the accumulator in the destination buffer. */
431 *pOut
++ = (q7_t
) (__SSAT(acc0
>> 7u, 8));
432 *pOut
++ = (q7_t
) (__SSAT(acc1
>> 7u, 8));
433 *pOut
++ = (q7_t
) (__SSAT(acc2
>> 7u, 8));
434 *pOut
++ = (q7_t
) (__SSAT(acc3
>> 7u, 8));
436 /* Increment the pointer pIn1 index, count by 4 */
439 /* Update the inputA and inputB pointers for next MAC calculation */
443 /* Decrement the loop counter */
447 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
448 ** No loop unrolling is used. */
449 blkCnt
= blockSize2
% 0x4u
;
453 /* Accumulator is made zero for every iteration */
456 /* Apply loop unrolling and compute 4 MACs simultaneously. */
459 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
460 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
464 /* Reading two inputs of SrcA buffer and packing */
465 in1
= (q15_t
) * px
++;
466 in2
= (q15_t
) * px
++;
467 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
469 /* Reading two inputs of SrcB buffer and packing */
470 in1
= (q15_t
) * py
--;
471 in2
= (q15_t
) * py
--;
472 input2
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
474 /* Perform the multiply-accumulates */
475 sum
= __SMLAD(input1
, input2
, sum
);
477 /* Reading two inputs of SrcA buffer and packing */
478 in1
= (q15_t
) * px
++;
479 in2
= (q15_t
) * px
++;
480 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
482 /* Reading two inputs of SrcB buffer and packing */
483 in1
= (q15_t
) * py
--;
484 in2
= (q15_t
) * py
--;
485 input2
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
487 /* Perform the multiply-accumulates */
488 sum
= __SMLAD(input1
, input2
, sum
);
490 /* Decrement the loop counter */
494 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
495 ** No loop unrolling is used. */
500 /* Perform the multiply-accumulates */
501 sum
+= ((q15_t
) * px
++ * *py
--);
503 /* Decrement the loop counter */
507 /* Store the result in the accumulator in the destination buffer. */
508 *pOut
++ = (q7_t
) (__SSAT(sum
>> 7u, 8));
510 /* Increment the pointer pIn1 index, count by 1 */
513 /* Update the inputA and inputB pointers for next MAC calculation */
517 /* Decrement the loop counter */
523 /* If srcBLen is less than 4,
524 * the blockSize2 loop cannot be unrolled by 4 */
529 /* Accumulator is made zero for every iteration */
532 /* srcBLen number of MACS should be performed */
537 /* Perform the multiply-accumulate */
538 sum
+= ((q15_t
) * px
++ * *py
--);
540 /* Decrement the loop counter */
544 /* Store the result in the accumulator in the destination buffer. */
545 *pOut
++ = (q7_t
) (__SSAT(sum
>> 7u, 8));
547 /* Increment the MAC count */
550 /* Update the inputA and inputB pointers for next MAC calculation */
554 /* Decrement the loop counter */
560 /* --------------------------
561 * Initializations of stage3
562 * -------------------------*/
564 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
565 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
567 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
568 * sum += x[srcALen-1] * y[srcBLen-1]
571 /* In this stage the MAC operations are decreased by 1 for every iteration.
572 The blockSize3 variable holds the number of MAC operations performed */
574 /* Working pointer of inputA */
575 pSrc1
= pIn1
+ (srcALen
- (srcBLen
- 1u));
578 /* Working pointer of inputB */
579 pSrc2
= pIn2
+ (srcBLen
- 1u);
582 /* -------------------
584 * ------------------*/
586 while(blockSize3
> 0u)
588 /* Accumulator is made zero for every iteration */
591 /* Apply loop unrolling and compute 4 MACs simultaneously. */
592 k
= blockSize3
>> 2u;
594 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
595 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
598 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
599 in1
= (q15_t
) * px
++;
600 in2
= (q15_t
) * px
++;
601 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
603 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
604 in1
= (q15_t
) * py
--;
605 in2
= (q15_t
) * py
--;
606 input2
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
608 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
609 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
610 sum
= __SMLAD(input1
, input2
, sum
);
612 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
613 in1
= (q15_t
) * px
++;
614 in2
= (q15_t
) * px
++;
615 input1
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
617 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
618 in1
= (q15_t
) * py
--;
619 in2
= (q15_t
) * py
--;
620 input2
= ((q31_t
) in1
& 0x0000FFFF) | ((q31_t
) in2
<< 16u);
622 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
623 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
624 sum
= __SMLAD(input1
, input2
, sum
);
626 /* Decrement the loop counter */
630 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
631 ** No loop unrolling is used. */
632 k
= blockSize3
% 0x4u
;
636 /* Perform the multiply-accumulates */
637 sum
+= ((q15_t
) * px
++ * *py
--);
639 /* Decrement the loop counter */
643 /* Store the result in the accumulator in the destination buffer. */
644 *pOut
++ = (q7_t
) (__SSAT(sum
>> 7u, 8));
646 /* Update the inputA and inputB pointers for next MAC calculation */
650 /* Decrement the loop counter */
656 /* Run the below code for Cortex-M0 */
658 q7_t
*pIn1
= pSrcA
; /* input pointer */
659 q7_t
*pIn2
= pSrcB
; /* coefficient pointer */
660 q31_t sum
; /* Accumulator */
661 uint32_t i
, j
; /* loop counter */
663 /* Compute one output sample per iteration, srcALen + srcBLen - 1 samples in total */
664 for (i
= 0; i
< (srcALen
+ srcBLen
- 1); i
++)
666 /* Initialize sum with zero to carry on MAC operations */
669 /* Loop to perform MAC operations according to convolution equation */
670 for (j
= 0; j
<= i
; j
++)
672 /* Check the array limitations */
673 if(((i
- j
) < srcBLen
) && (j
< srcALen
))
675 /* z[i] += x[j] * y[i-j] */
676 sum
+= (q15_t
) pIn1
[j
] * (pIn2
[i
- j
]);
680 /* Store the output in the destination buffer */
681 pDst
[i
] = (q7_t
) __SSAT((sum
>> 7u), 8u);
684 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
689 * @} end of Conv group