/* ----------------------------------------------------------------------
 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
 *
 * $Date:        17. January 2013
 * $Revision:    V1.4.1
 *
 * Project:      CMSIS DSP Library
 * Title:        arm_conv_partial_q15.c
 *
 * Description:  Partial convolution of Q15 sequences.
 *
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */

#include "arm_math.h"

/**
 * @ingroup groupFilters
 */

/**
 * @addtogroup PartialConv
 * @{
 */

/**
 * @brief Partial convolution of Q15 sequences.
 * @param[in]   *pSrcA      points to the first input sequence.
 * @param[in]   srcALen     length of the first input sequence.
 * @param[in]   *pSrcB      points to the second input sequence.
 * @param[in]   srcBLen     length of the second input sequence.
 * @param[out]  *pDst       points to the location where the output result is written.
 * @param[in]   firstIndex  is the first output sample to start with.
 * @param[in]   numPoints   is the number of output points to be computed.
 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
 *
 * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
 *
 * \par
 * Refer to the function <code>arm_conv_partial_opt_q15()</code> for a faster implementation of this function using scratch buffers.
 *
 */
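
/*
 * A minimal usage sketch (illustrative only; the buffer sizes and the calling
 * function below are assumptions, not part of this file):
 *
 *   #include "arm_math.h"
 *
 *   #define SRCA_LEN 24
 *   #define SRCB_LEN 8
 *
 *   static q15_t srcA[SRCA_LEN];
 *   static q15_t srcB[SRCB_LEN];
 *   static q15_t dst[SRCA_LEN + SRCB_LEN - 1];
 *
 *   void conv_partial_example(void)
 *   {
 *     // Request output samples 4..11 of the full convolution.
 *     arm_status st = arm_conv_partial_q15(srcA, SRCA_LEN, srcB, SRCB_LEN,
 *                                          dst, 4, 8);
 *
 *     if(st == ARM_MATH_ARGUMENT_ERROR)
 *     {
 *       // firstIndex + numPoints exceeded srcALen + srcBLen - 1.
 *     }
 *   }
 *
 * Only the requested samples dst[4] .. dst[11] are written; the destination
 * buffer is sized for the full convolution so that any requested subset fits.
 */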

arm_status arm_conv_partial_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  uint32_t firstIndex,
  uint32_t numPoints)
{

#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)

  /* Run the below code for Cortex-M4 and Cortex-M3 */

  q15_t *pIn1;                                   /* inputA pointer                */
  q15_t *pIn2;                                   /* inputB pointer                */
  q15_t *pOut = pDst;                            /* output pointer                */
  q63_t sum, acc0, acc1, acc2, acc3;             /* Accumulator                   */
  q15_t *px;                                     /* Intermediate inputA pointer   */
  q15_t *py;                                     /* Intermediate inputB pointer   */
  q15_t *pSrc1, *pSrc2;                          /* Intermediate pointers         */
  q31_t x0, x1, x2, x3, c0;                      /* Temporary input variables     */
  uint32_t j, k, count, check, blkCnt;
  int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */
  arm_status status;                             /* status of Partial convolution */

  /* Check for range of output samples to be calculated */
  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  {
    /* Set status as ARM_MATH_ARGUMENT_ERROR */
    status = ARM_MATH_ARGUMENT_ERROR;
  }
  else
  {

    /* The algorithm implementation is based on the lengths of the inputs. */
    /* srcB is always made to slide across srcA. */
    /* So srcBLen is always considered shorter than or equal to srcALen */
    if(srcALen >= srcBLen)
    {
      /* Initialization of inputA pointer */
      pIn1 = pSrcA;

      /* Initialization of inputB pointer */
      pIn2 = pSrcB;
    }
    else
    {
      /* Initialization of inputA pointer */
      pIn1 = pSrcB;

      /* Initialization of inputB pointer */
      pIn2 = pSrcA;

      /* srcBLen is always considered shorter than or equal to srcALen */
      j = srcBLen;
      srcBLen = srcALen;
      srcALen = j;
    }

    /* Conditions to check which loopCounter holds
     * the first and last indices of the output samples to be calculated. */
    check = firstIndex + numPoints;
    blockSize3 = ((int32_t) check - (int32_t) srcALen);
    blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
    blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
                                     (int32_t) numPoints) : 0;
    blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
                                    (int32_t) firstIndex);
    blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;

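    /* Worked example with assumed values (illustrative only): for srcALen = 24,
     * srcBLen = 8, firstIndex = 4 and numPoints = 8, check = 12,
     * blockSize3 = max(12 - 24, 0) = 0, blockSize1 = (8 - 1) - 4 = 3 (kept,
     * since check > srcBLen - 1) and blockSize2 = 12 - (0 + 3 + 4) = 5,
     * giving 3 + 5 + 0 = 8 output samples, as requested. */
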
    /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ... + x[n-N+1] * y[N-1] */
    /* The function is internally
     * divided into three stages according to the number of multiplications that have to
     * take place between inputA samples and inputB samples. In the first stage of the
     * algorithm, the multiplications increase by one for every iteration.
     * In the second stage of the algorithm, srcBLen number of multiplications are done.
     * In the third stage of the algorithm, the multiplications decrease by one
     * for every iteration. */
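
    /* Illustration with assumed small lengths: for srcALen = 6 and srcBLen = 3
     * the full convolution has 6 + 3 - 1 = 8 outputs; stage 1 produces outputs
     * 0..1 (1 and 2 MACs), stage 2 produces outputs 2..5 (3 MACs each) and
     * stage 3 produces outputs 6..7 (2 and 1 MACs). */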

    /* Set the output pointer to point to the firstIndex
     * of the output sample to be calculated. */
    pOut = pDst + firstIndex;

    /* --------------------------
     * Initializations of stage1
     * -------------------------*/

    /* sum = x[0] * y[0]
     * sum = x[0] * y[1] + x[1] * y[0]
     * ....
     * sum = x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] +...+ x[srcBLen - 1] * y[0]
     */

    /* In this stage the MAC operations are increased by 1 for every iteration.
       The count variable holds the number of MAC operations performed.
       Since the partial convolution starts from firstIndex,
       the number of MACs to be performed is firstIndex + 1. */
    count = 1u + firstIndex;
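
    /* For the assumed example above (firstIndex = 4), the first stage-1 output
     * therefore needs 5 MACs, and each subsequent stage-1 output needs one more. */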

    /* Working pointer of inputA */
    px = pIn1;

    /* Working pointer of inputB */
    pSrc2 = pIn2 + firstIndex;
    py = pSrc2;

    /* ------------------------
     * Stage1 process
     * ----------------------*/

    /* For loop unrolling by 4, this stage is divided into two. */
    /* First part of this stage computes the MAC operations less than 4 */
    /* Second part of this stage computes the MAC operations greater than or equal to 4 */
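
    /* Throughout, each product of two Q15 operands is a Q30 value accumulated
     * in a 64-bit accumulator; results are converted back to Q15 by shifting
     * right by 15 and saturating to 16 bits before being stored. */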

    /* The first part of the stage starts here */
    while((count < 4u) && (blockSize1 > 0))
    {
      /* Accumulator is made zero for every iteration */
      sum = 0;

      /* Loop over number of MAC operations between
       * inputA samples and inputB samples */
      k = count;

      while(k > 0u)
      {
        /* Perform the multiply-accumulates */
        sum = __SMLALD(*px++, *py--, sum);

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));

      /* Update the inputA and inputB pointers for next MAC calculation */
      py = ++pSrc2;
      px = pIn1;

      /* Increment the MAC count */
      count++;

      /* Decrement the loop counter */
      blockSize1--;
    }

    /* The second part of the stage starts here */
    /* The internal loop, over count, is unrolled by 4 */
    /* To read the last two inputB samples using SIMD:
     * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
    py = py - 1;

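    /* Note on the SIMD MACs below: __SMLALDX performs a dual 16-bit multiply
     * with exchange and a 64-bit accumulate,
     *   acc += (low half of op1 * high half of op2) + (high half of op1 * low half of op2).
     * Because py steps backwards through inputB, each 32-bit read at py holds
     * its two coefficients in reversed order relative to the read at px (on a
     * little-endian build), so the exchanged variant pairs each x sample with
     * the intended y coefficient. */
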
    while(blockSize1 > 0)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0;

      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = count >> 2u;

      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while(k > 0u)
      {
        /* Perform the multiply-accumulates */
        /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
        sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
        /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
        sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);

        /* Decrement the loop counter */
        k--;
      }

      /* For the next MAC operations, the pointer py is used without SIMD
       * So, py is incremented by 1 */
      py = py + 1u;

      /* If the count is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = count % 0x4u;

      while(k > 0u)
      {
        /* Perform the multiply-accumulates */
        sum = __SMLALD(*px++, *py--, sum);

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));

      /* Update the inputA and inputB pointers for next MAC calculation */
      py = ++pSrc2 - 1u;
      px = pIn1;

      /* Increment the MAC count */
      count++;

      /* Decrement the loop counter */
      blockSize1--;
    }

    /* --------------------------
     * Initializations of stage2
     * ------------------------*/

    /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
     * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
     * ....
     * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
     */

    /* Working pointer of inputA */
    px = pIn1;

    /* Working pointer of inputB */
    pSrc2 = pIn2 + (srcBLen - 1u);
    py = pSrc2;

    /* count is the index by which the pointer pIn1 is to be incremented */
    count = 0u;

    /* --------------------
     * Stage2 process
     * -------------------*/

    /* Stage2 depends on srcBLen as in this stage srcBLen number of MACs are performed.
     * So, to loop unroll over blockSize2,
     * srcBLen should be greater than or equal to 4 */
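    /* In the unrolled branch below, four consecutive output samples
     * (acc0 .. acc3) are computed per outer-loop iteration, so each pair of
     * inputB coefficients read into c0 is reused across all four accumulators. */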
    if(srcBLen >= 4u)
    {
      /* Loop unroll over blockSize2, by 4 */
      blkCnt = blockSize2 >> 2u;

      while(blkCnt > 0u)
      {
        py = py - 1u;

        /* Set all accumulators to zero */
        acc0 = 0;
        acc1 = 0;
        acc2 = 0;
        acc3 = 0;

        /* read x[0], x[1] samples */
        x0 = *__SIMD32(px);
        /* read x[1], x[2] samples */
        x1 = _SIMD32_OFFSET(px + 1);
        px += 2u;

        /* Apply loop unrolling and compute 4 MACs simultaneously. */
        k = srcBLen >> 2u;

        /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
         ** a second loop below computes MACs for the remaining 1 to 3 samples. */
        do
        {
          /* Read the last two inputB samples using SIMD:
           * y[srcBLen - 1] and y[srcBLen - 2] */
          c0 = *__SIMD32(py)--;

          /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
          acc0 = __SMLALDX(x0, c0, acc0);

          /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
          acc1 = __SMLALDX(x1, c0, acc1);

          /* Read x[2], x[3] */
          x2 = *__SIMD32(px);

          /* Read x[3], x[4] */
          x3 = _SIMD32_OFFSET(px + 1);

          /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
          acc2 = __SMLALDX(x2, c0, acc2);

          /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
          acc3 = __SMLALDX(x3, c0, acc3);

          /* Read y[srcBLen - 3] and y[srcBLen - 4] */
          c0 = *__SIMD32(py)--;

          /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
          acc0 = __SMLALDX(x2, c0, acc0);

          /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
          acc1 = __SMLALDX(x3, c0, acc1);

          /* Read x[4], x[5] */
          x0 = _SIMD32_OFFSET(px + 2);

          /* Read x[5], x[6] */
          x1 = _SIMD32_OFFSET(px + 3);
          px += 4u;

          /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
          acc2 = __SMLALDX(x0, c0, acc2);

          /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
          acc3 = __SMLALDX(x1, c0, acc3);

        } while(--k);

        /* For the next MAC operations, SIMD is not used
         * So, the 16-bit pointer of inputB, py, is updated */

        /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
         ** No loop unrolling is used. */
        k = srcBLen % 0x4u;

        if(k == 1u)
        {
          /* Read y[srcBLen - 5] */
          c0 = *(py + 1);

#ifdef ARM_MATH_BIG_ENDIAN

          c0 = c0 << 16u;

#else

          c0 = c0 & 0x0000FFFF;

#endif /* #ifdef ARM_MATH_BIG_ENDIAN */

          /* Read x[7] */
          x3 = *__SIMD32(px);
          px++;

          /* Perform the multiply-accumulates */
          acc0 = __SMLALD(x0, c0, acc0);
          acc1 = __SMLALD(x1, c0, acc1);
          acc2 = __SMLALDX(x1, c0, acc2);
          acc3 = __SMLALDX(x3, c0, acc3);
        }

        if(k == 2u)
        {
          /* Read y[srcBLen - 5], y[srcBLen - 6] */
          c0 = _SIMD32_OFFSET(py);

          /* Read x[7], x[8] */
          x3 = *__SIMD32(px);

          /* Read x[9] */
          x2 = _SIMD32_OFFSET(px + 1);
          px += 2u;

          /* Perform the multiply-accumulates */
          acc0 = __SMLALDX(x0, c0, acc0);
          acc1 = __SMLALDX(x1, c0, acc1);
          acc2 = __SMLALDX(x3, c0, acc2);
          acc3 = __SMLALDX(x2, c0, acc3);
        }

        if(k == 3u)
        {
          /* Read y[srcBLen - 5], y[srcBLen - 6] */
          c0 = _SIMD32_OFFSET(py);

          /* Read x[7], x[8] */
          x3 = *__SIMD32(px);

          /* Read x[9] */
          x2 = _SIMD32_OFFSET(px + 1);

          /* Perform the multiply-accumulates */
          acc0 = __SMLALDX(x0, c0, acc0);
          acc1 = __SMLALDX(x1, c0, acc1);
          acc2 = __SMLALDX(x3, c0, acc2);
          acc3 = __SMLALDX(x2, c0, acc3);

          c0 = *(py - 1);

#ifdef ARM_MATH_BIG_ENDIAN

          c0 = c0 << 16u;
#else

          c0 = c0 & 0x0000FFFF;
#endif /* #ifdef ARM_MATH_BIG_ENDIAN */

          /* Read x[10] */
          x3 = _SIMD32_OFFSET(px + 2);
          px += 3u;

          /* Perform the multiply-accumulates */
          acc0 = __SMLALDX(x1, c0, acc0);
          acc1 = __SMLALD(x2, c0, acc1);
          acc2 = __SMLALDX(x2, c0, acc2);
          acc3 = __SMLALDX(x3, c0, acc3);
        }

        /* Store the results in the accumulators in the destination buffer. */

#ifndef ARM_MATH_BIG_ENDIAN

        *__SIMD32(pOut)++ =
          __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
        *__SIMD32(pOut)++ =
          __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);

#else

        *__SIMD32(pOut)++ =
          __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
        *__SIMD32(pOut)++ =
          __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);

#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* Increment the pointer pIn1 index, count by 4 */
        count += 4u;

        /* Update the inputA and inputB pointers for next MAC calculation */
        px = pIn1 + count;
        py = pSrc2;

        /* Decrement the loop counter */
        blkCnt--;
      }

      /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
       ** No loop unrolling is used. */
      blkCnt = (uint32_t) blockSize2 % 0x4u;

      while(blkCnt > 0u)
      {
        /* Accumulator is made zero for every iteration */
        sum = 0;

        /* Apply loop unrolling and compute 4 MACs simultaneously. */
        k = srcBLen >> 2u;

        /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
         ** a second loop below computes MACs for the remaining 1 to 3 samples. */
        while(k > 0u)
        {
          /* Perform the multiply-accumulates */
          sum += (q63_t) ((q31_t) *px++ * *py--);
          sum += (q63_t) ((q31_t) *px++ * *py--);
          sum += (q63_t) ((q31_t) *px++ * *py--);
          sum += (q63_t) ((q31_t) *px++ * *py--);

          /* Decrement the loop counter */
          k--;
        }

        /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
         ** No loop unrolling is used. */
        k = srcBLen % 0x4u;

        while(k > 0u)
        {
          /* Perform the multiply-accumulates */
          sum += (q63_t) ((q31_t) *px++ * *py--);

          /* Decrement the loop counter */
          k--;
        }

        /* Store the result in the accumulator in the destination buffer. */
        *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));

        /* Increment the pointer pIn1 index, count by 1 */
        count++;

        /* Update the inputA and inputB pointers for next MAC calculation */
        px = pIn1 + count;
        py = pSrc2;

        /* Decrement the loop counter */
        blkCnt--;
      }
    }
    else
    {
      /* If srcBLen is smaller than 4,
       * the blockSize2 loop cannot be unrolled by 4 */
      blkCnt = (uint32_t) blockSize2;

      while(blkCnt > 0u)
      {
        /* Accumulator is made zero for every iteration */
        sum = 0;

        /* srcBLen number of MACs should be performed */
        k = srcBLen;

        while(k > 0u)
        {
          /* Perform the multiply-accumulate */
          sum += (q63_t) ((q31_t) *px++ * *py--);

          /* Decrement the loop counter */
          k--;
        }

        /* Store the result in the accumulator in the destination buffer. */
        *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));

        /* Increment the MAC count */
        count++;

        /* Update the inputA and inputB pointers for next MAC calculation */
        px = pIn1 + count;
        py = pSrc2;

        /* Decrement the loop counter */
        blkCnt--;
      }
    }

    /* --------------------------
     * Initializations of stage3
     * -------------------------*/

    /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
     * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
     * ....
     * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
     * sum += x[srcALen-1] * y[srcBLen-1]
     */

    /* In this stage the MAC operations are decreased by 1 for every iteration.
       The count variable holds the number of MAC operations performed */
    count = srcBLen - 1u;

    /* Working pointer of inputA */
    pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
    px = pSrc1;

    /* Working pointer of inputB */
    pSrc2 = pIn2 + (srcBLen - 1u);
    pIn2 = pSrc2 - 1u;
    py = pIn2;

    /* -------------------
     * Stage3 process
     * ------------------*/

    /* For loop unrolling by 4, this stage is divided into two. */
    /* First part of this stage computes the MAC operations greater than or equal to 4 */
    /* Second part of this stage computes the MAC operations less than 4 */

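    /* Stage 3 mirrors stage 1: the number of MACs per output decreases from
     * srcBLen - 1 down to 1 as the remaining overlap with inputA shrinks. */
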
    /* The first part of the stage starts here */
    j = count >> 2u;

    while((j > 0u) && (blockSize3 > 0))
    {
      /* Accumulator is made zero for every iteration */
      sum = 0;

      /* Apply loop unrolling and compute 4 MACs simultaneously. */
      k = count >> 2u;

      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
      while(k > 0u)
      {
        /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
         * with y[srcBLen - 1], y[srcBLen - 2] respectively */
        sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
        /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
         * with y[srcBLen - 3], y[srcBLen - 4] respectively */
        sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);

        /* Decrement the loop counter */
        k--;
      }

      /* For the next MAC operations, the pointer py is used without SIMD
       * So, py is incremented by 1 */
      py = py + 1u;

      /* If the count is not a multiple of 4, compute any remaining MACs here.
       ** No loop unrolling is used. */
      k = count % 0x4u;

      while(k > 0u)
      {
        /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
        sum = __SMLALD(*px++, *py--, sum);

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = ++pSrc1;
      py = pIn2;

      /* Decrement the MAC count */
      count--;

      /* Decrement the loop counter */
      blockSize3--;

      j--;
    }

    /* The second part of the stage starts here */
    /* SIMD is not used for the next MAC operations,
     * so pointer py is updated to read only one sample at a time */
    py = py + 1u;

    while(blockSize3 > 0)
    {
      /* Accumulator is made zero for every iteration */
      sum = 0;

      /* Loop over the remaining number of MAC operations */
      k = count;

      while(k > 0u)
      {
        /* Perform the multiply-accumulates */
        /* sum += x[srcALen-1] * y[srcBLen-1] */
        sum = __SMLALD(*px++, *py--, sum);

        /* Decrement the loop counter */
        k--;
      }

      /* Store the result in the accumulator in the destination buffer. */
      *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));

      /* Update the inputA and inputB pointers for next MAC calculation */
      px = ++pSrc1;
      py = pSrc2;

      /* Decrement the MAC count */
      count--;

      /* Decrement the loop counter */
      blockSize3--;
    }

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);

#else

  /* Run the below code for Cortex-M0 */

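  /* This fallback evaluates each requested output sample directly from the
   * convolution definition, z[n] = sum over j of x[j] * y[n - j], skipping
   * any terms where j >= srcALen or (n - j) >= srcBLen. */
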
  q15_t *pIn1 = pSrcA;                           /* inputA pointer                */
  q15_t *pIn2 = pSrcB;                           /* inputB pointer                */
  q63_t sum;                                     /* Accumulator                   */
  uint32_t i, j;                                 /* loop counters                 */
  arm_status status;                             /* status of Partial convolution */

  /* Check for range of output samples to be calculated */
  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
  {
    /* Set status as ARM_MATH_ARGUMENT_ERROR */
    status = ARM_MATH_ARGUMENT_ERROR;
  }
  else
  {
    /* Loop to calculate convolution for output length number of values */
    for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
    {
      /* Initialize sum with zero to carry on MAC operations */
      sum = 0;

      /* Loop to perform MAC operations according to convolution equation */
      for (j = 0; j <= i; j++)
      {
        /* Check the array limitations */
        if(((i - j) < srcBLen) && (j < srcALen))
        {
          /* z[i] += x[j] * y[i-j] */
          sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
        }
      }

      /* Store the output in the destination buffer */
      pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
    }
    /* Set status as ARM_MATH_SUCCESS as there are no argument errors */
    status = ARM_MATH_SUCCESS;
  }
  return (status);

#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */

}

/**
 * @} end of PartialConv group
 */