]> git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_conv_partial_fast_q15.c
Merge commit '1fe4406f374291ab2e86e95a97341fd9c475fcb8'
[tmk_keyboard.git] / tmk_core / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_conv_partial_fast_q15.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_fast_q15.c
9 *
10 * Description: Fast Q15 Partial convolution.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40
41 #include "arm_math.h"
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @addtogroup PartialConv
49 * @{
50 */
51
52 /**
53 * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written.
59 * @param[in] firstIndex is the index of the first output sample to be computed.
60 * @param[in] numPoints is the number of output points to be computed.
61 * @return Returns ARM_MATH_SUCCESS if the function completed correctly, or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0, srcALen+srcBLen-2].
62 *
63 * See <code>arm_conv_partial_q15()</code> for a slower implementation of this function, which uses a 64-bit accumulator to avoid wrap-around distortion.
64 */
65
66
67 arm_status arm_conv_partial_fast_q15(
68 q15_t * pSrcA,
69 uint32_t srcALen,
70 q15_t * pSrcB,
71 uint32_t srcBLen,
72 q15_t * pDst,
73 uint32_t firstIndex,
74 uint32_t numPoints)
75 {
76 #ifndef UNALIGNED_SUPPORT_DISABLE
77
78 q15_t *pIn1; /* inputA pointer */
79 q15_t *pIn2; /* inputB pointer */
80 q15_t *pOut = pDst; /* output pointer */
81 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
82 q15_t *px; /* Intermediate inputA pointer */
83 q15_t *py; /* Intermediate inputB pointer */
84 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
85 q31_t x0, x1, x2, x3, c0;
86 uint32_t j, k, count, check, blkCnt;
87 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
88 arm_status status; /* status of Partial convolution */
89
90 /* Check for range of output samples to be calculated */
91 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
92 {
93 /* Set status as ARM_MATH_ARGUMENT_ERROR */
94 status = ARM_MATH_ARGUMENT_ERROR;
95 }
96 else
97 {
98
99 /* The algorithm implementation is based on the lengths of the inputs. */
100 /* srcB is always made to slide across srcA. */
101 /* So srcBLen is always considered as shorter or equal to srcALen */
102 if(srcALen >=srcBLen)
103 {
104 /* Initialization of inputA pointer */
105 pIn1 = pSrcA;
106
107 /* Initialization of inputB pointer */
108 pIn2 = pSrcB;
109 }
110 else
111 {
112 /* Initialization of inputA pointer */
113 pIn1 = pSrcB;
114
115 /* Initialization of inputB pointer */
116 pIn2 = pSrcA;
117
118 /* srcBLen is always considered as shorter or equal to srcALen */
119 j = srcBLen;
120 srcBLen = srcALen;
121 srcALen = j;
122 }
123
124 /* Conditions to check which loopCounter holds
125 * the first and last indices of the output samples to be calculated. */
126 check = firstIndex + numPoints;
127 blockSize3 = ((int32_t) check - (int32_t) srcALen);
128 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
129 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
130 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
131 (int32_t) numPoints) : 0;
132 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
133 (int32_t) firstIndex);
134 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
135
136 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
137 /* The function is internally
138 * divided into three stages according to the number of multiplications that has to be
139 * taken place between inputA samples and inputB samples. In the first stage of the
140 * algorithm, the multiplications increase by one for every iteration.
141 * In the second stage of the algorithm, srcBLen number of multiplications are done.
142 * In the third stage of the algorithm, the multiplications decrease by one
143 * for every iteration. */
144
145 /* Set the output pointer to point to the firstIndex
146 * of the output sample to be calculated. */
147 pOut = pDst + firstIndex;
148
149 /* --------------------------
150 * Initializations of stage1
151 * -------------------------*/
152
153 /* sum = x[0] * y[0]
154 * sum = x[0] * y[1] + x[1] * y[0]
155 * ....
156 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
157 */
158
159 /* In this stage the MAC operations are increased by 1 for every iteration.
160 The count variable holds the number of MAC operations performed.
161 Since the partial convolution starts from firstIndex
162 Number of Macs to be performed is firstIndex + 1 */
163 count = 1u + firstIndex;
164
165 /* Working pointer of inputA */
166 px = pIn1;
167
168 /* Working pointer of inputB */
169 pSrc2 = pIn2 + firstIndex;
170 py = pSrc2;
171
172 /* ------------------------
173 * Stage1 process
174 * ----------------------*/
175
176 /* For loop unrolling by 4, this stage is divided into two. */
177 /* First part of this stage computes the MAC operations less than 4 */
178 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
179
180 /* The first part of the stage starts here */
181 while((count < 4u) && (blockSize1 > 0))
182 {
183 /* Accumulator is made zero for every iteration */
184 sum = 0;
185
186 /* Loop over number of MAC operations between
187 * inputA samples and inputB samples */
188 k = count;
189
190 while(k > 0u)
191 {
192 /* Perform the multiply-accumulates */
193 sum = __SMLAD(*px++, *py--, sum);
194
195 /* Decrement the loop counter */
196 k--;
197 }
198
199 /* Store the result in the accumulator in the destination buffer. */
200 *pOut++ = (q15_t) (sum >> 15);
201
202 /* Update the inputA and inputB pointers for next MAC calculation */
203 py = ++pSrc2;
204 px = pIn1;
205
206 /* Increment the MAC count */
207 count++;
208
209 /* Decrement the loop counter */
210 blockSize1--;
211 }
212
213 /* The second part of the stage starts here */
214 /* The internal loop, over count, is unrolled by 4 */
215 /* To, read the last two inputB samples using SIMD:
216 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
217 py = py - 1;
218
219 while(blockSize1 > 0)
220 {
221 /* Accumulator is made zero for every iteration */
222 sum = 0;
223
224 /* Apply loop unrolling and compute 4 MACs simultaneously. */
225 k = count >> 2u;
226
227 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
228 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
229 while(k > 0u)
230 {
231 /* Perform the multiply-accumulates */
232 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
233 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
234 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
235 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
236
237 /* Decrement the loop counter */
238 k--;
239 }
240
241 /* For the next MAC operations, the pointer py is used without SIMD
242 * So, py is incremented by 1 */
243 py = py + 1u;
244
245 /* If the count is not a multiple of 4, compute any remaining MACs here.
246 ** No loop unrolling is used. */
247 k = count % 0x4u;
248
249 while(k > 0u)
250 {
251 /* Perform the multiply-accumulates */
252 sum = __SMLAD(*px++, *py--, sum);
253
254 /* Decrement the loop counter */
255 k--;
256 }
257
258 /* Store the result in the accumulator in the destination buffer. */
259 *pOut++ = (q15_t) (sum >> 15);
260
261 /* Update the inputA and inputB pointers for next MAC calculation */
262 py = ++pSrc2 - 1u;
263 px = pIn1;
264
265 /* Increment the MAC count */
266 count++;
267
268 /* Decrement the loop counter */
269 blockSize1--;
270 }
271
272 /* --------------------------
273 * Initializations of stage2
274 * ------------------------*/
275
276 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
277 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
278 * ....
279 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
280 */
281
282 /* Working pointer of inputA */
283 px = pIn1;
284
285 /* Working pointer of inputB */
286 pSrc2 = pIn2 + (srcBLen - 1u);
287 py = pSrc2;
288
289 /* count is the index by which the pointer pIn1 to be incremented */
290 count = 0u;
291
292
293 /* --------------------
294 * Stage2 process
295 * -------------------*/
296
297 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
298 * So, to loop unroll over blockSize2,
299 * srcBLen should be greater than or equal to 4 */
300 if(srcBLen >= 4u)
301 {
302 /* Loop unroll over blockSize2, by 4 */
303 blkCnt = ((uint32_t) blockSize2 >> 2u);
304
305 while(blkCnt > 0u)
306 {
307 py = py - 1u;
308
309 /* Set all accumulators to zero */
310 acc0 = 0;
311 acc1 = 0;
312 acc2 = 0;
313 acc3 = 0;
314
315
316 /* read x[0], x[1] samples */
317 x0 = *__SIMD32(px);
318 /* read x[1], x[2] samples */
319 x1 = _SIMD32_OFFSET(px+1);
320 px+= 2u;
321
322
323 /* Apply loop unrolling and compute 4 MACs simultaneously. */
324 k = srcBLen >> 2u;
325
326 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
327 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
328 do
329 {
330 /* Read the last two inputB samples using SIMD:
331 * y[srcBLen - 1] and y[srcBLen - 2] */
332 c0 = *__SIMD32(py)--;
333
334 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
335 acc0 = __SMLADX(x0, c0, acc0);
336
337 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
338 acc1 = __SMLADX(x1, c0, acc1);
339
340 /* Read x[2], x[3] */
341 x2 = *__SIMD32(px);
342
343 /* Read x[3], x[4] */
344 x3 = _SIMD32_OFFSET(px+1);
345
346 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
347 acc2 = __SMLADX(x2, c0, acc2);
348
349 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
350 acc3 = __SMLADX(x3, c0, acc3);
351
352 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
353 c0 = *__SIMD32(py)--;
354
355 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
356 acc0 = __SMLADX(x2, c0, acc0);
357
358 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
359 acc1 = __SMLADX(x3, c0, acc1);
360
361 /* Read x[4], x[5] */
362 x0 = _SIMD32_OFFSET(px+2);
363
364 /* Read x[5], x[6] */
365 x1 = _SIMD32_OFFSET(px+3);
366 px += 4u;
367
368 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
369 acc2 = __SMLADX(x0, c0, acc2);
370
371 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
372 acc3 = __SMLADX(x1, c0, acc3);
373
374 } while(--k);
375
376 /* For the next MAC operations, SIMD is not used
377 * So, the 16 bit pointer if inputB, py is updated */
378
379 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
380 ** No loop unrolling is used. */
381 k = srcBLen % 0x4u;
382
383 if(k == 1u)
384 {
385 /* Read y[srcBLen - 5] */
386 c0 = *(py+1);
387 #ifdef ARM_MATH_BIG_ENDIAN
388
389 c0 = c0 << 16u;
390
391 #else
392
393 c0 = c0 & 0x0000FFFF;
394
395 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
396
397 /* Read x[7] */
398 x3 = *__SIMD32(px);
399 px++;
400
401 /* Perform the multiply-accumulates */
402 acc0 = __SMLAD(x0, c0, acc0);
403 acc1 = __SMLAD(x1, c0, acc1);
404 acc2 = __SMLADX(x1, c0, acc2);
405 acc3 = __SMLADX(x3, c0, acc3);
406 }
407
408 if(k == 2u)
409 {
410 /* Read y[srcBLen - 5], y[srcBLen - 6] */
411 c0 = _SIMD32_OFFSET(py);
412
413 /* Read x[7], x[8] */
414 x3 = *__SIMD32(px);
415
416 /* Read x[9] */
417 x2 = _SIMD32_OFFSET(px+1);
418 px += 2u;
419
420 /* Perform the multiply-accumulates */
421 acc0 = __SMLADX(x0, c0, acc0);
422 acc1 = __SMLADX(x1, c0, acc1);
423 acc2 = __SMLADX(x3, c0, acc2);
424 acc3 = __SMLADX(x2, c0, acc3);
425 }
426
427 if(k == 3u)
428 {
429 /* Read y[srcBLen - 5], y[srcBLen - 6] */
430 c0 = _SIMD32_OFFSET(py);
431
432 /* Read x[7], x[8] */
433 x3 = *__SIMD32(px);
434
435 /* Read x[9] */
436 x2 = _SIMD32_OFFSET(px+1);
437
438 /* Perform the multiply-accumulates */
439 acc0 = __SMLADX(x0, c0, acc0);
440 acc1 = __SMLADX(x1, c0, acc1);
441 acc2 = __SMLADX(x3, c0, acc2);
442 acc3 = __SMLADX(x2, c0, acc3);
443
444 c0 = *(py-1);
445 #ifdef ARM_MATH_BIG_ENDIAN
446
447 c0 = c0 << 16u;
448 #else
449
450 c0 = c0 & 0x0000FFFF;
451 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
452
453 /* Read x[10] */
454 x3 = _SIMD32_OFFSET(px+2);
455 px += 3u;
456
457 /* Perform the multiply-accumulates */
458 acc0 = __SMLADX(x1, c0, acc0);
459 acc1 = __SMLAD(x2, c0, acc1);
460 acc2 = __SMLADX(x2, c0, acc2);
461 acc3 = __SMLADX(x3, c0, acc3);
462 }
463
464 /* Store the results in the accumulators in the destination buffer. */
465 #ifndef ARM_MATH_BIG_ENDIAN
466
467 *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
468 *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
469
470 #else
471
472 *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
473 *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
474
475 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
476
477 /* Increment the pointer pIn1 index, count by 4 */
478 count += 4u;
479
480 /* Update the inputA and inputB pointers for next MAC calculation */
481 px = pIn1 + count;
482 py = pSrc2;
483
484 /* Decrement the loop counter */
485 blkCnt--;
486 }
487
488 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
489 ** No loop unrolling is used. */
490 blkCnt = (uint32_t) blockSize2 % 0x4u;
491
492 while(blkCnt > 0u)
493 {
494 /* Accumulator is made zero for every iteration */
495 sum = 0;
496
497 /* Apply loop unrolling and compute 4 MACs simultaneously. */
498 k = srcBLen >> 2u;
499
500 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
501 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
502 while(k > 0u)
503 {
504 /* Perform the multiply-accumulates */
505 sum += ((q31_t) * px++ * *py--);
506 sum += ((q31_t) * px++ * *py--);
507 sum += ((q31_t) * px++ * *py--);
508 sum += ((q31_t) * px++ * *py--);
509
510 /* Decrement the loop counter */
511 k--;
512 }
513
514 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
515 ** No loop unrolling is used. */
516 k = srcBLen % 0x4u;
517
518 while(k > 0u)
519 {
520 /* Perform the multiply-accumulates */
521 sum += ((q31_t) * px++ * *py--);
522
523 /* Decrement the loop counter */
524 k--;
525 }
526
527 /* Store the result in the accumulator in the destination buffer. */
528 *pOut++ = (q15_t) (sum >> 15);
529
530 /* Increment the pointer pIn1 index, count by 1 */
531 count++;
532
533 /* Update the inputA and inputB pointers for next MAC calculation */
534 px = pIn1 + count;
535 py = pSrc2;
536
537 /* Decrement the loop counter */
538 blkCnt--;
539 }
540 }
541 else
542 {
543 /* If the srcBLen is not a multiple of 4,
544 * the blockSize2 loop cannot be unrolled by 4 */
545 blkCnt = (uint32_t) blockSize2;
546
547 while(blkCnt > 0u)
548 {
549 /* Accumulator is made zero for every iteration */
550 sum = 0;
551
552 /* srcBLen number of MACS should be performed */
553 k = srcBLen;
554
555 while(k > 0u)
556 {
557 /* Perform the multiply-accumulate */
558 sum += ((q31_t) * px++ * *py--);
559
560 /* Decrement the loop counter */
561 k--;
562 }
563
564 /* Store the result in the accumulator in the destination buffer. */
565 *pOut++ = (q15_t) (sum >> 15);
566
567 /* Increment the MAC count */
568 count++;
569
570 /* Update the inputA and inputB pointers for next MAC calculation */
571 px = pIn1 + count;
572 py = pSrc2;
573
574 /* Decrement the loop counter */
575 blkCnt--;
576 }
577 }
578
579
580 /* --------------------------
581 * Initializations of stage3
582 * -------------------------*/
583
584 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
585 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
586 * ....
587 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
588 * sum += x[srcALen-1] * y[srcBLen-1]
589 */
590
591 /* In this stage the MAC operations are decreased by 1 for every iteration.
592 The count variable holds the number of MAC operations performed */
593 count = srcBLen - 1u;
594
595 /* Working pointer of inputA */
596 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
597 px = pSrc1;
598
599 /* Working pointer of inputB */
600 pSrc2 = pIn2 + (srcBLen - 1u);
601 pIn2 = pSrc2 - 1u;
602 py = pIn2;
603
604 /* -------------------
605 * Stage3 process
606 * ------------------*/
607
608 /* For loop unrolling by 4, this stage is divided into two. */
609 /* First part of this stage computes the MAC operations greater than 4 */
610 /* Second part of this stage computes the MAC operations less than or equal to 4 */
611
612 /* The first part of the stage starts here */
613 j = count >> 2u;
614
615 while((j > 0u) && (blockSize3 > 0))
616 {
617 /* Accumulator is made zero for every iteration */
618 sum = 0;
619
620 /* Apply loop unrolling and compute 4 MACs simultaneously. */
621 k = count >> 2u;
622
623 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
624 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
625 while(k > 0u)
626 {
627 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
628 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
629 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
630 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
631 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
632 sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
633
634 /* Decrement the loop counter */
635 k--;
636 }
637
638 /* For the next MAC operations, the pointer py is used without SIMD
639 * So, py is incremented by 1 */
640 py = py + 1u;
641
642 /* If the count is not a multiple of 4, compute any remaining MACs here.
643 ** No loop unrolling is used. */
644 k = count % 0x4u;
645
646 while(k > 0u)
647 {
648 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
649 sum = __SMLAD(*px++, *py--, sum);
650
651 /* Decrement the loop counter */
652 k--;
653 }
654
655 /* Store the result in the accumulator in the destination buffer. */
656 *pOut++ = (q15_t) (sum >> 15);
657
658 /* Update the inputA and inputB pointers for next MAC calculation */
659 px = ++pSrc1;
660 py = pIn2;
661
662 /* Decrement the MAC count */
663 count--;
664
665 /* Decrement the loop counter */
666 blockSize3--;
667
668 j--;
669 }
670
671 /* The second part of the stage starts here */
672 /* SIMD is not used for the next MAC operations,
673 * so pointer py is updated to read only one sample at a time */
674 py = py + 1u;
675
676 while(blockSize3 > 0)
677 {
678 /* Accumulator is made zero for every iteration */
679 sum = 0;
680
681 /* Apply loop unrolling and compute 4 MACs simultaneously. */
682 k = count;
683
684 while(k > 0u)
685 {
686 /* Perform the multiply-accumulates */
687 /* sum += x[srcALen-1] * y[srcBLen-1] */
688 sum = __SMLAD(*px++, *py--, sum);
689
690 /* Decrement the loop counter */
691 k--;
692 }
693
694 /* Store the result in the accumulator in the destination buffer. */
695 *pOut++ = (q15_t) (sum >> 15);
696
697 /* Update the inputA and inputB pointers for next MAC calculation */
698 px = ++pSrc1;
699 py = pSrc2;
700
701 /* Decrement the MAC count */
702 count--;
703
704 /* Decrement the loop counter */
705 blockSize3--;
706 }
707
708 /* set status as ARM_MATH_SUCCESS */
709 status = ARM_MATH_SUCCESS;
710 }
711
712 /* Return to application */
713 return (status);
714
715 #else
716
717 q15_t *pIn1; /* inputA pointer */
718 q15_t *pIn2; /* inputB pointer */
719 q15_t *pOut = pDst; /* output pointer */
720 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
721 q15_t *px; /* Intermediate inputA pointer */
722 q15_t *py; /* Intermediate inputB pointer */
723 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
724 q31_t x0, x1, x2, x3, c0;
725 uint32_t j, k, count, check, blkCnt;
726 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
727 arm_status status; /* status of Partial convolution */
728 q15_t a, b;
729
730 /* Check for range of output samples to be calculated */
731 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
732 {
733 /* Set status as ARM_MATH_ARGUMENT_ERROR */
734 status = ARM_MATH_ARGUMENT_ERROR;
735 }
736 else
737 {
738
739 /* The algorithm implementation is based on the lengths of the inputs. */
740 /* srcB is always made to slide across srcA. */
741 /* So srcBLen is always considered as shorter or equal to srcALen */
742 if(srcALen >=srcBLen)
743 {
744 /* Initialization of inputA pointer */
745 pIn1 = pSrcA;
746
747 /* Initialization of inputB pointer */
748 pIn2 = pSrcB;
749 }
750 else
751 {
752 /* Initialization of inputA pointer */
753 pIn1 = pSrcB;
754
755 /* Initialization of inputB pointer */
756 pIn2 = pSrcA;
757
758 /* srcBLen is always considered as shorter or equal to srcALen */
759 j = srcBLen;
760 srcBLen = srcALen;
761 srcALen = j;
762 }
763
764 /* Conditions to check which loopCounter holds
765 * the first and last indices of the output samples to be calculated. */
766 check = firstIndex + numPoints;
767 blockSize3 = ((int32_t) check - (int32_t) srcALen);
768 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
769 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
770 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
771 (int32_t) numPoints) : 0;
772 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
773 (int32_t) firstIndex);
774 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
775
776 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
777 /* The function is internally
778 * divided into three stages according to the number of multiplications that has to be
779 * taken place between inputA samples and inputB samples. In the first stage of the
780 * algorithm, the multiplications increase by one for every iteration.
781 * In the second stage of the algorithm, srcBLen number of multiplications are done.
782 * In the third stage of the algorithm, the multiplications decrease by one
783 * for every iteration. */
784
785 /* Set the output pointer to point to the firstIndex
786 * of the output sample to be calculated. */
787 pOut = pDst + firstIndex;
788
789 /* --------------------------
790 * Initializations of stage1
791 * -------------------------*/
792
793 /* sum = x[0] * y[0]
794 * sum = x[0] * y[1] + x[1] * y[0]
795 * ....
796 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
797 */
798
799 /* In this stage the MAC operations are increased by 1 for every iteration.
800 The count variable holds the number of MAC operations performed.
801 Since the partial convolution starts from firstIndex
802 Number of Macs to be performed is firstIndex + 1 */
803 count = 1u + firstIndex;
804
805 /* Working pointer of inputA */
806 px = pIn1;
807
808 /* Working pointer of inputB */
809 pSrc2 = pIn2 + firstIndex;
810 py = pSrc2;
811
812 /* ------------------------
813 * Stage1 process
814 * ----------------------*/
815
816 /* For loop unrolling by 4, this stage is divided into two. */
817 /* First part of this stage computes the MAC operations less than 4 */
818 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
819
820 /* The first part of the stage starts here */
821 while((count < 4u) && (blockSize1 > 0u))
822 {
823 /* Accumulator is made zero for every iteration */
824 sum = 0;
825
826 /* Loop over number of MAC operations between
827 * inputA samples and inputB samples */
828 k = count;
829
830 while(k > 0u)
831 {
832 /* Perform the multiply-accumulates */
833 sum += ((q31_t) * px++ * *py--);
834
835 /* Decrement the loop counter */
836 k--;
837 }
838
839 /* Store the result in the accumulator in the destination buffer. */
840 *pOut++ = (q15_t) (sum >> 15);
841
842 /* Update the inputA and inputB pointers for next MAC calculation */
843 py = ++pSrc2;
844 px = pIn1;
845
846 /* Increment the MAC count */
847 count++;
848
849 /* Decrement the loop counter */
850 blockSize1--;
851 }
852
853 /* The second part of the stage starts here */
854 /* The internal loop, over count, is unrolled by 4 */
855 /* To, read the last two inputB samples using SIMD:
856 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
857 py = py - 1;
858
859 while(blockSize1 > 0u)
860 {
861 /* Accumulator is made zero for every iteration */
862 sum = 0;
863
864 /* Apply loop unrolling and compute 4 MACs simultaneously. */
865 k = count >> 2u;
866
867 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
868 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
869 py++;
870
871 while(k > 0u)
872 {
873 /* Perform the multiply-accumulates */
874 sum += ((q31_t) * px++ * *py--);
875 sum += ((q31_t) * px++ * *py--);
876 sum += ((q31_t) * px++ * *py--);
877 sum += ((q31_t) * px++ * *py--);
878
879 /* Decrement the loop counter */
880 k--;
881 }
882
883 /* If the count is not a multiple of 4, compute any remaining MACs here.
884 ** No loop unrolling is used. */
885 k = count % 0x4u;
886
887 while(k > 0u)
888 {
889 /* Perform the multiply-accumulates */
890 sum += ((q31_t) * px++ * *py--);
891
892 /* Decrement the loop counter */
893 k--;
894 }
895
896 /* Store the result in the accumulator in the destination buffer. */
897 *pOut++ = (q15_t) (sum >> 15);
898
899 /* Update the inputA and inputB pointers for next MAC calculation */
900 py = ++pSrc2 - 1u;
901 px = pIn1;
902
903 /* Increment the MAC count */
904 count++;
905
906 /* Decrement the loop counter */
907 blockSize1--;
908 }
909
910 /* --------------------------
911 * Initializations of stage2
912 * ------------------------*/
913
914 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
915 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
916 * ....
917 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
918 */
919
920 /* Working pointer of inputA */
921 px = pIn1;
922
923 /* Working pointer of inputB */
924 pSrc2 = pIn2 + (srcBLen - 1u);
925 py = pSrc2;
926
927 /* count is the index by which the pointer pIn1 to be incremented */
928 count = 0u;
929
930
931 /* --------------------
932 * Stage2 process
933 * -------------------*/
934
935 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
936 * So, to loop unroll over blockSize2,
937 * srcBLen should be greater than or equal to 4 */
938 if(srcBLen >= 4u)
939 {
940 /* Loop unroll over blockSize2, by 4 */
941 blkCnt = ((uint32_t) blockSize2 >> 2u);
942
943 while(blkCnt > 0u)
944 {
945 py = py - 1u;
946
947 /* Set all accumulators to zero */
948 acc0 = 0;
949 acc1 = 0;
950 acc2 = 0;
951 acc3 = 0;
952
953 /* read x[0], x[1] samples */
954 a = *px++;
955 b = *px++;
956
957 #ifndef ARM_MATH_BIG_ENDIAN
958
959 x0 = __PKHBT(a, b, 16);
960 a = *px;
961 x1 = __PKHBT(b, a, 16);
962
963 #else
964
965 x0 = __PKHBT(b, a, 16);
966 a = *px;
967 x1 = __PKHBT(a, b, 16);
968
969 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
970
971 /* Apply loop unrolling and compute 4 MACs simultaneously. */
972 k = srcBLen >> 2u;
973
974 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
975 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
976 do
977 {
978 /* Read the last two inputB samples using SIMD:
979 * y[srcBLen - 1] and y[srcBLen - 2] */
980 a = *py;
981 b = *(py+1);
982 py -= 2;
983
984 #ifndef ARM_MATH_BIG_ENDIAN
985
986 c0 = __PKHBT(a, b, 16);
987
988 #else
989
990 c0 = __PKHBT(b, a, 16);;
991
992 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
993
994 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
995 acc0 = __SMLADX(x0, c0, acc0);
996
997 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
998 acc1 = __SMLADX(x1, c0, acc1);
999
1000 a = *px;
1001 b = *(px + 1);
1002
1003 #ifndef ARM_MATH_BIG_ENDIAN
1004
1005 x2 = __PKHBT(a, b, 16);
1006 a = *(px + 2);
1007 x3 = __PKHBT(b, a, 16);
1008
1009 #else
1010
1011 x2 = __PKHBT(b, a, 16);
1012 a = *(px + 2);
1013 x3 = __PKHBT(a, b, 16);
1014
1015 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1016
1017 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
1018 acc2 = __SMLADX(x2, c0, acc2);
1019
1020 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
1021 acc3 = __SMLADX(x3, c0, acc3);
1022
1023 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
1024 a = *py;
1025 b = *(py+1);
1026 py -= 2;
1027
1028 #ifndef ARM_MATH_BIG_ENDIAN
1029
1030 c0 = __PKHBT(a, b, 16);
1031
1032 #else
1033
1034 c0 = __PKHBT(b, a, 16);;
1035
1036 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1037
1038 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
1039 acc0 = __SMLADX(x2, c0, acc0);
1040
1041 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
1042 acc1 = __SMLADX(x3, c0, acc1);
1043
1044 /* Read x[4], x[5], x[6] */
1045 a = *(px + 2);
1046 b = *(px + 3);
1047
1048 #ifndef ARM_MATH_BIG_ENDIAN
1049
1050 x0 = __PKHBT(a, b, 16);
1051 a = *(px + 4);
1052 x1 = __PKHBT(b, a, 16);
1053
1054 #else
1055
1056 x0 = __PKHBT(b, a, 16);
1057 a = *(px + 4);
1058 x1 = __PKHBT(a, b, 16);
1059
1060 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1061
1062 px += 4u;
1063
1064 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
1065 acc2 = __SMLADX(x0, c0, acc2);
1066
1067 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
1068 acc3 = __SMLADX(x1, c0, acc3);
1069
1070 } while(--k);
1071
1072 /* For the next MAC operations, SIMD is not used
1073 * So, the 16 bit pointer if inputB, py is updated */
1074
1075 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1076 ** No loop unrolling is used. */
1077 k = srcBLen % 0x4u;
1078
1079 if(k == 1u)
1080 {
1081 /* Read y[srcBLen - 5] */
1082 c0 = *(py+1);
1083
1084 #ifdef ARM_MATH_BIG_ENDIAN
1085
1086 c0 = c0 << 16u;
1087
1088 #else
1089
1090 c0 = c0 & 0x0000FFFF;
1091
1092 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1093
1094 /* Read x[7] */
1095 a = *px;
1096 b = *(px+1);
1097 px++;
1098
1099 #ifndef ARM_MATH_BIG_ENDIAN
1100
1101 x3 = __PKHBT(a, b, 16);
1102
1103 #else
1104
1105 x3 = __PKHBT(b, a, 16);;
1106
1107 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1108
1109
1110 /* Perform the multiply-accumulates */
1111 acc0 = __SMLAD(x0, c0, acc0);
1112 acc1 = __SMLAD(x1, c0, acc1);
1113 acc2 = __SMLADX(x1, c0, acc2);
1114 acc3 = __SMLADX(x3, c0, acc3);
1115 }
1116
1117 if(k == 2u)
1118 {
1119 /* Read y[srcBLen - 5], y[srcBLen - 6] */
1120 a = *py;
1121 b = *(py+1);
1122
1123 #ifndef ARM_MATH_BIG_ENDIAN
1124
1125 c0 = __PKHBT(a, b, 16);
1126
1127 #else
1128
1129 c0 = __PKHBT(b, a, 16);;
1130
1131 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1132
1133 /* Read x[7], x[8], x[9] */
1134 a = *px;
1135 b = *(px + 1);
1136
1137 #ifndef ARM_MATH_BIG_ENDIAN
1138
1139 x3 = __PKHBT(a, b, 16);
1140 a = *(px + 2);
1141 x2 = __PKHBT(b, a, 16);
1142
1143 #else
1144
1145 x3 = __PKHBT(b, a, 16);
1146 a = *(px + 2);
1147 x2 = __PKHBT(a, b, 16);
1148
1149 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1150 px += 2u;
1151
1152 /* Perform the multiply-accumulates */
1153 acc0 = __SMLADX(x0, c0, acc0);
1154 acc1 = __SMLADX(x1, c0, acc1);
1155 acc2 = __SMLADX(x3, c0, acc2);
1156 acc3 = __SMLADX(x2, c0, acc3);
1157 }
1158
1159 if(k == 3u)
1160 {
1161 /* Read y[srcBLen - 5], y[srcBLen - 6] */
1162 a = *py;
1163 b = *(py+1);
1164
1165 #ifndef ARM_MATH_BIG_ENDIAN
1166
1167 c0 = __PKHBT(a, b, 16);
1168
1169 #else
1170
1171 c0 = __PKHBT(b, a, 16);;
1172
1173 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1174
1175 /* Read x[7], x[8], x[9] */
1176 a = *px;
1177 b = *(px + 1);
1178
1179 #ifndef ARM_MATH_BIG_ENDIAN
1180
1181 x3 = __PKHBT(a, b, 16);
1182 a = *(px + 2);
1183 x2 = __PKHBT(b, a, 16);
1184
1185 #else
1186
1187 x3 = __PKHBT(b, a, 16);
1188 a = *(px + 2);
1189 x2 = __PKHBT(a, b, 16);
1190
1191 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1192
1193 /* Perform the multiply-accumulates */
1194 acc0 = __SMLADX(x0, c0, acc0);
1195 acc1 = __SMLADX(x1, c0, acc1);
1196 acc2 = __SMLADX(x3, c0, acc2);
1197 acc3 = __SMLADX(x2, c0, acc3);
1198
1199 /* Read y[srcBLen - 7] */
1200 c0 = *(py-1);
1201 #ifdef ARM_MATH_BIG_ENDIAN
1202
1203 c0 = c0 << 16u;
1204 #else
1205
1206 c0 = c0 & 0x0000FFFF;
1207 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1208
1209 /* Read x[10] */
1210 a = *(px+2);
1211 b = *(px+3);
1212
1213 #ifndef ARM_MATH_BIG_ENDIAN
1214
1215 x3 = __PKHBT(a, b, 16);
1216
1217 #else
1218
1219 x3 = __PKHBT(b, a, 16);;
1220
1221 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1222
1223 px += 3u;
1224
1225 /* Perform the multiply-accumulates */
1226 acc0 = __SMLADX(x1, c0, acc0);
1227 acc1 = __SMLAD(x2, c0, acc1);
1228 acc2 = __SMLADX(x2, c0, acc2);
1229 acc3 = __SMLADX(x3, c0, acc3);
1230 }
1231
1232 /* Store the results in the accumulators in the destination buffer. */
1233 *pOut++ = (q15_t)(acc0 >> 15);
1234 *pOut++ = (q15_t)(acc1 >> 15);
1235 *pOut++ = (q15_t)(acc2 >> 15);
1236 *pOut++ = (q15_t)(acc3 >> 15);
1237
1238 /* Advance count, the sample offset into pIn1, by 4 */
1239 count += 4u;
1240
1241 /* Update the inputA and inputB pointers for next MAC calculation */
1242 px = pIn1 + count;
1243 py = pSrc2;
1244
1245 /* Decrement the loop counter */
1246 blkCnt--;
1247 }
1248
1249 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1250 ** No loop unrolling is used. */
1251 blkCnt = (uint32_t) blockSize2 % 0x4u;
1252
1253 while(blkCnt > 0u)
1254 {
1255 /* Accumulator is made zero for every iteration */
1256 sum = 0;
1257
1258 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1259 k = srcBLen >> 2u;
1260
1261 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1262 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1263 while(k > 0u)
1264 {
1265 /* Perform the multiply-accumulates */
1266 sum += ((q31_t) * px++ * *py--);
1267 sum += ((q31_t) * px++ * *py--);
1268 sum += ((q31_t) * px++ * *py--);
1269 sum += ((q31_t) * px++ * *py--);
1270
1271 /* Decrement the loop counter */
1272 k--;
1273 }
1274
1275 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1276 ** No loop unrolling is used. */
1277 k = srcBLen % 0x4u;
1278
1279 while(k > 0u)
1280 {
1281 /* Perform the multiply-accumulates */
1282 sum += ((q31_t) * px++ * *py--);
1283
1284 /* Decrement the loop counter */
1285 k--;
1286 }
1287
1288 /* Store the result in the accumulator in the destination buffer. */
1289 *pOut++ = (q15_t) (sum >> 15);
1290
1291 /* Advance count, the sample offset into pIn1, by 1 */
1292 count++;
1293
1294 /* Update the inputA and inputB pointers for next MAC calculation */
1295 px = pIn1 + count;
1296 py = pSrc2;
1297
1298 /* Decrement the loop counter */
1299 blkCnt--;
1300 }
1301 }
1302 else
1303 {
1304 /* If the srcBLen is not a multiple of 4,
1305 * the blockSize2 loop cannot be unrolled by 4 */
1306 blkCnt = (uint32_t) blockSize2;
1307
1308 while(blkCnt > 0u)
1309 {
1310 /* Accumulator is made zero for every iteration */
1311 sum = 0;
1312
1313 /* srcBLen number of MACS should be performed */
1314 k = srcBLen;
1315
1316 while(k > 0u)
1317 {
1318 /* Perform the multiply-accumulate */
1319 sum += ((q31_t) * px++ * *py--);
1320
1321 /* Decrement the loop counter */
1322 k--;
1323 }
1324
1325 /* Store the result in the accumulator in the destination buffer. */
1326 *pOut++ = (q15_t) (sum >> 15);
1327
1328 /* Increment the MAC count */
1329 count++;
1330
1331 /* Update the inputA and inputB pointers for next MAC calculation */
1332 px = pIn1 + count;
1333 py = pSrc2;
1334
1335 /* Decrement the loop counter */
1336 blkCnt--;
1337 }
1338 }
1339
1340
1341 /* --------------------------
1342 * Initializations of stage3
1343 * -------------------------*/
1344
1345 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
1346 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
1347 * ....
1348 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
1349 * sum += x[srcALen-1] * y[srcBLen-1]
1350 */
1351
1352 /* In this stage the MAC operations are decreased by 1 for every iteration.
1353 The count variable holds the number of MAC operations performed */
1354 count = srcBLen - 1u;
1355
1356 /* Working pointer of inputA */
1357 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1358 px = pSrc1;
1359
1360 /* Working pointer of inputB */
1361 pSrc2 = pIn2 + (srcBLen - 1u);
1362 pIn2 = pSrc2 - 1u;
1363 py = pIn2;
1364
1365 /* -------------------
1366 * Stage3 process
1367 * ------------------*/
1368
1369 /* For loop unrolling by 4, this stage is divided into two. */
1370 /* First part of this stage computes the MAC operations greater than 4 */
1371 /* Second part of this stage computes the MAC operations less than or equal to 4 */
1372
1373 /* The first part of the stage starts here */
1374 j = count >> 2u;
1375
1376 while((j > 0u) && (blockSize3 > 0))
1377 {
1378 /* Accumulator is made zero for every iteration */
1379 sum = 0;
1380
1381 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1382 k = count >> 2u;
1383
1384 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1385 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1386 py++;
1387
1388 while(k > 0u)
1389 {
1390 /* Perform the multiply-accumulates */
1391 sum += ((q31_t) * px++ * *py--);
1392 sum += ((q31_t) * px++ * *py--);
1393 sum += ((q31_t) * px++ * *py--);
1394 sum += ((q31_t) * px++ * *py--);
1395 /* Decrement the loop counter */
1396 k--;
1397 }
1398
1399
1400 /* If the count is not a multiple of 4, compute any remaining MACs here.
1401 ** No loop unrolling is used. */
1402 k = count % 0x4u;
1403
1404 while(k > 0u)
1405 {
1406 /* Perform the multiply-accumulates */
1407 sum += ((q31_t) * px++ * *py--);
1408
1409 /* Decrement the loop counter */
1410 k--;
1411 }
1412
1413 /* Store the result in the accumulator in the destination buffer. */
1414 *pOut++ = (q15_t) (sum >> 15);
1415
1416 /* Update the inputA and inputB pointers for next MAC calculation */
1417 px = ++pSrc1;
1418 py = pIn2;
1419
1420 /* Decrement the MAC count */
1421 count--;
1422
1423 /* Decrement the loop counter */
1424 blockSize3--;
1425
1426 j--;
1427 }
1428
1429 /* The second part of the stage starts here */
1430 /* SIMD is not used for the next MAC operations,
1431 * so pointer py is updated to read only one sample at a time */
1432 py = py + 1u;
1433
1434 while(blockSize3 > 0u)
1435 {
1436 /* Accumulator is made zero for every iteration */
1437 sum = 0;
1438
1439 /* count MACs are performed one at a time; no loop unrolling is used here. */
1440 k = count;
1441
1442 while(k > 0u)
1443 {
1444 /* Perform the multiply-accumulates */
1445 /* sum += x[srcALen-1] * y[srcBLen-1] */
1446 sum += ((q31_t) * px++ * *py--);
1447
1448 /* Decrement the loop counter */
1449 k--;
1450 }
1451
1452 /* Store the result in the accumulator in the destination buffer. */
1453 *pOut++ = (q15_t) (sum >> 15);
1454
1455 /* Update the inputA and inputB pointers for next MAC calculation */
1456 px = ++pSrc1;
1457 py = pSrc2;
1458
1459 /* Decrement the MAC count */
1460 count--;
1461
1462 /* Decrement the loop counter */
1463 blockSize3--;
1464 }
1465
1466 /* set status as ARM_MATH_SUCCESS */
1467 status = ARM_MATH_SUCCESS;
1468 }
1469
1470 /* Return to application */
1471 return (status);
1472
1473 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1474 }
1475
1476 /**
1477 * @} end of PartialConv group
1478 */
Imprint / Impressum