]> git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_q7.c
remove experimental return, cleanup slash_question key
[tmk_keyboard.git] / tmk_core / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_conv_q7.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_q7.c
9 *
10 * Description: Convolution of Q7 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40
41 #include "arm_math.h"
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @addtogroup Conv
49 * @{
50 */
51
52 /**
53 * @brief Convolution of Q7 sequences.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
59 * @return none.
60 *
61 * @details
62 * <b>Scaling and Overflow Behavior:</b>
63 *
64 * \par
65 * The function is implemented using a 32-bit internal accumulator.
66 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
67 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
68 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
69 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
70 *
71 * \par
72 * Refer the function <code>arm_conv_opt_q7()</code> for a faster implementation of this function.
73 *
74 */
75
76 void arm_conv_q7(
77 q7_t * pSrcA,
78 uint32_t srcALen,
79 q7_t * pSrcB,
80 uint32_t srcBLen,
81 q7_t * pDst)
82 {
83
84
85 #ifndef ARM_MATH_CM0_FAMILY
86
87 /* Run the below code for Cortex-M4 and Cortex-M3 */
88
89 q7_t *pIn1; /* inputA pointer */
90 q7_t *pIn2; /* inputB pointer */
91 q7_t *pOut = pDst; /* output pointer */
92 q7_t *px; /* Intermediate inputA pointer */
93 q7_t *py; /* Intermediate inputB pointer */
94 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
95 q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
96 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
97 q31_t input1, input2; /* Temporary input variables */
98 q15_t in1, in2; /* Temporary input variables */
99 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
100
101 /* The algorithm implementation is based on the lengths of the inputs. */
102 /* srcB is always made to slide across srcA. */
103 /* So srcBLen is always considered as shorter or equal to srcALen */
104 if(srcALen >= srcBLen)
105 {
106 /* Initialization of inputA pointer */
107 pIn1 = pSrcA;
108
109 /* Initialization of inputB pointer */
110 pIn2 = pSrcB;
111 }
112 else
113 {
114 /* Initialization of inputA pointer */
115 pIn1 = pSrcB;
116
117 /* Initialization of inputB pointer */
118 pIn2 = pSrcA;
119
120 /* srcBLen is always considered as shorter or equal to srcALen */
121 j = srcBLen;
122 srcBLen = srcALen;
123 srcALen = j;
124 }
125
126 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
127 /* The function is internally
128 * divided into three stages according to the number of multiplications that has to be
129 * taken place between inputA samples and inputB samples. In the first stage of the
130 * algorithm, the multiplications increase by one for every iteration.
131 * In the second stage of the algorithm, srcBLen number of multiplications are done.
132 * In the third stage of the algorithm, the multiplications decrease by one
133 * for every iteration. */
134
135 /* The algorithm is implemented in three stages.
136 The loop counters of each stage is initiated here. */
137 blockSize1 = srcBLen - 1u;
138 blockSize2 = (srcALen - srcBLen) + 1u;
139 blockSize3 = blockSize1;
140
141 /* --------------------------
142 * Initializations of stage1
143 * -------------------------*/
144
145 /* sum = x[0] * y[0]
146 * sum = x[0] * y[1] + x[1] * y[0]
147 * ....
148 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
149 */
150
151 /* In this stage the MAC operations are increased by 1 for every iteration.
152 The count variable holds the number of MAC operations performed */
153 count = 1u;
154
155 /* Working pointer of inputA */
156 px = pIn1;
157
158 /* Working pointer of inputB */
159 py = pIn2;
160
161
162 /* ------------------------
163 * Stage1 process
164 * ----------------------*/
165
166 /* The first stage starts here */
167 while(blockSize1 > 0u)
168 {
169 /* Accumulator is made zero for every iteration */
170 sum = 0;
171
172 /* Apply loop unrolling and compute 4 MACs simultaneously. */
173 k = count >> 2u;
174
175 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
176 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
177 while(k > 0u)
178 {
179 /* x[0] , x[1] */
180 in1 = (q15_t) * px++;
181 in2 = (q15_t) * px++;
182 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
183
184 /* y[srcBLen - 1] , y[srcBLen - 2] */
185 in1 = (q15_t) * py--;
186 in2 = (q15_t) * py--;
187 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
188
189 /* x[0] * y[srcBLen - 1] */
190 /* x[1] * y[srcBLen - 2] */
191 sum = __SMLAD(input1, input2, sum);
192
193 /* x[2] , x[3] */
194 in1 = (q15_t) * px++;
195 in2 = (q15_t) * px++;
196 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
197
198 /* y[srcBLen - 3] , y[srcBLen - 4] */
199 in1 = (q15_t) * py--;
200 in2 = (q15_t) * py--;
201 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
202
203 /* x[2] * y[srcBLen - 3] */
204 /* x[3] * y[srcBLen - 4] */
205 sum = __SMLAD(input1, input2, sum);
206
207 /* Decrement the loop counter */
208 k--;
209 }
210
211 /* If the count is not a multiple of 4, compute any remaining MACs here.
212 ** No loop unrolling is used. */
213 k = count % 0x4u;
214
215 while(k > 0u)
216 {
217 /* Perform the multiply-accumulates */
218 sum += ((q15_t) * px++ * *py--);
219
220 /* Decrement the loop counter */
221 k--;
222 }
223
224 /* Store the result in the accumulator in the destination buffer. */
225 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
226
227 /* Update the inputA and inputB pointers for next MAC calculation */
228 py = pIn2 + count;
229 px = pIn1;
230
231 /* Increment the MAC count */
232 count++;
233
234 /* Decrement the loop counter */
235 blockSize1--;
236 }
237
238 /* --------------------------
239 * Initializations of stage2
240 * ------------------------*/
241
242 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
243 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
244 * ....
245 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
246 */
247
248 /* Working pointer of inputA */
249 px = pIn1;
250
251 /* Working pointer of inputB */
252 pSrc2 = pIn2 + (srcBLen - 1u);
253 py = pSrc2;
254
255 /* count is index by which the pointer pIn1 to be incremented */
256 count = 0u;
257
258 /* -------------------
259 * Stage2 process
260 * ------------------*/
261
262 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
263 * So, to loop unroll over blockSize2,
264 * srcBLen should be greater than or equal to 4 */
265 if(srcBLen >= 4u)
266 {
267 /* Loop unroll over blockSize2, by 4 */
268 blkCnt = blockSize2 >> 2u;
269
270 while(blkCnt > 0u)
271 {
272 /* Set all accumulators to zero */
273 acc0 = 0;
274 acc1 = 0;
275 acc2 = 0;
276 acc3 = 0;
277
278 /* read x[0], x[1], x[2] samples */
279 x0 = *(px++);
280 x1 = *(px++);
281 x2 = *(px++);
282
283 /* Apply loop unrolling and compute 4 MACs simultaneously. */
284 k = srcBLen >> 2u;
285
286 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
287 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
288 do
289 {
290 /* Read y[srcBLen - 1] sample */
291 c0 = *(py--);
292 /* Read y[srcBLen - 2] sample */
293 c1 = *(py--);
294
295 /* Read x[3] sample */
296 x3 = *(px++);
297
298 /* x[0] and x[1] are packed */
299 in1 = (q15_t) x0;
300 in2 = (q15_t) x1;
301
302 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
303
304 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
305 in1 = (q15_t) c0;
306 in2 = (q15_t) c1;
307
308 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
309
310 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
311 acc0 = __SMLAD(input1, input2, acc0);
312
313 /* x[1] and x[2] are packed */
314 in1 = (q15_t) x1;
315 in2 = (q15_t) x2;
316
317 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
318
319 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
320 acc1 = __SMLAD(input1, input2, acc1);
321
322 /* x[2] and x[3] are packed */
323 in1 = (q15_t) x2;
324 in2 = (q15_t) x3;
325
326 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
327
328 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
329 acc2 = __SMLAD(input1, input2, acc2);
330
331 /* Read x[4] sample */
332 x0 = *(px++);
333
334 /* x[3] and x[4] are packed */
335 in1 = (q15_t) x3;
336 in2 = (q15_t) x0;
337
338 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
339
340 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
341 acc3 = __SMLAD(input1, input2, acc3);
342
343 /* Read y[srcBLen - 3] sample */
344 c0 = *(py--);
345 /* Read y[srcBLen - 4] sample */
346 c1 = *(py--);
347
348 /* Read x[5] sample */
349 x1 = *(px++);
350
351 /* x[2] and x[3] are packed */
352 in1 = (q15_t) x2;
353 in2 = (q15_t) x3;
354
355 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
356
357 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
358 in1 = (q15_t) c0;
359 in2 = (q15_t) c1;
360
361 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
362
363 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
364 acc0 = __SMLAD(input1, input2, acc0);
365
366 /* x[3] and x[4] are packed */
367 in1 = (q15_t) x3;
368 in2 = (q15_t) x0;
369
370 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
371
372 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
373 acc1 = __SMLAD(input1, input2, acc1);
374
375 /* x[4] and x[5] are packed */
376 in1 = (q15_t) x0;
377 in2 = (q15_t) x1;
378
379 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
380
381 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
382 acc2 = __SMLAD(input1, input2, acc2);
383
384 /* Read x[6] sample */
385 x2 = *(px++);
386
387 /* x[5] and x[6] are packed */
388 in1 = (q15_t) x1;
389 in2 = (q15_t) x2;
390
391 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
392
393 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
394 acc3 = __SMLAD(input1, input2, acc3);
395
396 } while(--k);
397
398 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
399 ** No loop unrolling is used. */
400 k = srcBLen % 0x4u;
401
402 while(k > 0u)
403 {
404 /* Read y[srcBLen - 5] sample */
405 c0 = *(py--);
406
407 /* Read x[7] sample */
408 x3 = *(px++);
409
410 /* Perform the multiply-accumulates */
411 /* acc0 += x[4] * y[srcBLen - 5] */
412 acc0 += ((q15_t) x0 * c0);
413 /* acc1 += x[5] * y[srcBLen - 5] */
414 acc1 += ((q15_t) x1 * c0);
415 /* acc2 += x[6] * y[srcBLen - 5] */
416 acc2 += ((q15_t) x2 * c0);
417 /* acc3 += x[7] * y[srcBLen - 5] */
418 acc3 += ((q15_t) x3 * c0);
419
420 /* Reuse the present samples for the next MAC */
421 x0 = x1;
422 x1 = x2;
423 x2 = x3;
424
425 /* Decrement the loop counter */
426 k--;
427 }
428
429
430 /* Store the result in the accumulator in the destination buffer. */
431 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
432 *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
433 *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
434 *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
435
436 /* Increment the pointer pIn1 index, count by 4 */
437 count += 4u;
438
439 /* Update the inputA and inputB pointers for next MAC calculation */
440 px = pIn1 + count;
441 py = pSrc2;
442
443 /* Decrement the loop counter */
444 blkCnt--;
445 }
446
447 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
448 ** No loop unrolling is used. */
449 blkCnt = blockSize2 % 0x4u;
450
451 while(blkCnt > 0u)
452 {
453 /* Accumulator is made zero for every iteration */
454 sum = 0;
455
456 /* Apply loop unrolling and compute 4 MACs simultaneously. */
457 k = srcBLen >> 2u;
458
459 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
460 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
461 while(k > 0u)
462 {
463
464 /* Reading two inputs of SrcA buffer and packing */
465 in1 = (q15_t) * px++;
466 in2 = (q15_t) * px++;
467 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
468
469 /* Reading two inputs of SrcB buffer and packing */
470 in1 = (q15_t) * py--;
471 in2 = (q15_t) * py--;
472 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
473
474 /* Perform the multiply-accumulates */
475 sum = __SMLAD(input1, input2, sum);
476
477 /* Reading two inputs of SrcA buffer and packing */
478 in1 = (q15_t) * px++;
479 in2 = (q15_t) * px++;
480 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
481
482 /* Reading two inputs of SrcB buffer and packing */
483 in1 = (q15_t) * py--;
484 in2 = (q15_t) * py--;
485 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
486
487 /* Perform the multiply-accumulates */
488 sum = __SMLAD(input1, input2, sum);
489
490 /* Decrement the loop counter */
491 k--;
492 }
493
494 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
495 ** No loop unrolling is used. */
496 k = srcBLen % 0x4u;
497
498 while(k > 0u)
499 {
500 /* Perform the multiply-accumulates */
501 sum += ((q15_t) * px++ * *py--);
502
503 /* Decrement the loop counter */
504 k--;
505 }
506
507 /* Store the result in the accumulator in the destination buffer. */
508 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
509
510 /* Increment the pointer pIn1 index, count by 1 */
511 count++;
512
513 /* Update the inputA and inputB pointers for next MAC calculation */
514 px = pIn1 + count;
515 py = pSrc2;
516
517 /* Decrement the loop counter */
518 blkCnt--;
519 }
520 }
521 else
522 {
523 /* If the srcBLen is not a multiple of 4,
524 * the blockSize2 loop cannot be unrolled by 4 */
525 blkCnt = blockSize2;
526
527 while(blkCnt > 0u)
528 {
529 /* Accumulator is made zero for every iteration */
530 sum = 0;
531
532 /* srcBLen number of MACS should be performed */
533 k = srcBLen;
534
535 while(k > 0u)
536 {
537 /* Perform the multiply-accumulate */
538 sum += ((q15_t) * px++ * *py--);
539
540 /* Decrement the loop counter */
541 k--;
542 }
543
544 /* Store the result in the accumulator in the destination buffer. */
545 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
546
547 /* Increment the MAC count */
548 count++;
549
550 /* Update the inputA and inputB pointers for next MAC calculation */
551 px = pIn1 + count;
552 py = pSrc2;
553
554 /* Decrement the loop counter */
555 blkCnt--;
556 }
557 }
558
559
560 /* --------------------------
561 * Initializations of stage3
562 * -------------------------*/
563
564 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
565 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
566 * ....
567 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
568 * sum += x[srcALen-1] * y[srcBLen-1]
569 */
570
571 /* In this stage the MAC operations are decreased by 1 for every iteration.
572 The blockSize3 variable holds the number of MAC operations performed */
573
574 /* Working pointer of inputA */
575 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
576 px = pSrc1;
577
578 /* Working pointer of inputB */
579 pSrc2 = pIn2 + (srcBLen - 1u);
580 py = pSrc2;
581
582 /* -------------------
583 * Stage3 process
584 * ------------------*/
585
586 while(blockSize3 > 0u)
587 {
588 /* Accumulator is made zero for every iteration */
589 sum = 0;
590
591 /* Apply loop unrolling and compute 4 MACs simultaneously. */
592 k = blockSize3 >> 2u;
593
594 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
595 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
596 while(k > 0u)
597 {
598 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
599 in1 = (q15_t) * px++;
600 in2 = (q15_t) * px++;
601 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
602
603 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
604 in1 = (q15_t) * py--;
605 in2 = (q15_t) * py--;
606 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
607
608 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
609 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
610 sum = __SMLAD(input1, input2, sum);
611
612 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
613 in1 = (q15_t) * px++;
614 in2 = (q15_t) * px++;
615 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
616
617 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
618 in1 = (q15_t) * py--;
619 in2 = (q15_t) * py--;
620 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
621
622 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
623 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
624 sum = __SMLAD(input1, input2, sum);
625
626 /* Decrement the loop counter */
627 k--;
628 }
629
630 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
631 ** No loop unrolling is used. */
632 k = blockSize3 % 0x4u;
633
634 while(k > 0u)
635 {
636 /* Perform the multiply-accumulates */
637 sum += ((q15_t) * px++ * *py--);
638
639 /* Decrement the loop counter */
640 k--;
641 }
642
643 /* Store the result in the accumulator in the destination buffer. */
644 *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
645
646 /* Update the inputA and inputB pointers for next MAC calculation */
647 px = ++pSrc1;
648 py = pSrc2;
649
650 /* Decrement the loop counter */
651 blockSize3--;
652 }
653
654 #else
655
656 /* Run the below code for Cortex-M0 */
657
658 q7_t *pIn1 = pSrcA; /* input pointer */
659 q7_t *pIn2 = pSrcB; /* coefficient pointer */
660 q31_t sum; /* Accumulator */
661 uint32_t i, j; /* loop counter */
662
663 /* Loop to calculate output of convolution for output length number of times */
664 for (i = 0; i < (srcALen + srcBLen - 1); i++)
665 {
666 /* Initialize sum with zero to carry on MAC operations */
667 sum = 0;
668
669 /* Loop to perform MAC operations according to convolution equation */
670 for (j = 0; j <= i; j++)
671 {
672 /* Check the array limitations */
673 if(((i - j) < srcBLen) && (j < srcALen))
674 {
675 /* z[i] += x[i-j] * y[j] */
676 sum += (q15_t) pIn1[j] * (pIn2[i - j]);
677 }
678 }
679
680 /* Store the output in the destination buffer */
681 pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
682 }
683
684 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
685
686 }
687
688 /**
689 * @} end of Conv group
690 */
Imprint / Impressum