]> git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_fast_q15.c
Merge commit '1fe4406f374291ab2e86e95a97341fd9c475fcb8'
[tmk_keyboard.git] / tmk_core / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_correlate_fast_q15.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_fast_q15.c
9 *
10 * Description: Fast Q15 Correlation.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40
41 #include "arm_math.h"
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @addtogroup Corr
49 * @{
50 */
51
52 /**
53 * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
59 * @return none.
60 *
61 * <b>Scaling and Overflow Behavior:</b>
62 *
63 * \par
64 * This fast version uses a 32-bit accumulator with 2.30 format.
65 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
66 * There is no saturation on intermediate additions.
67 * Thus, if the accumulator overflows it wraps around and distorts the result.
68 * The input signals should be scaled down to avoid intermediate overflows.
69 * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow, since at most
70 * min(srcALen, srcBLen) additions are carried out internally for each output sample.
71 * The 2.30 accumulator is right shifted by 15 bits and then cast to q15_t (1.15 format) to yield the final result; this cast does not saturate, so out-of-range values are not clipped.
72 *
73 * \par
74 * See <code>arm_correlate_q15()</code> for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
75 */
76
77 void arm_correlate_fast_q15(
78 q15_t * pSrcA,
79 uint32_t srcALen,
80 q15_t * pSrcB,
81 uint32_t srcBLen,
82 q15_t * pDst)
83 {
84 #ifndef UNALIGNED_SUPPORT_DISABLE
85
86 q15_t *pIn1; /* inputA pointer */
87 q15_t *pIn2; /* inputB pointer */
88 q15_t *pOut = pDst; /* output pointer */
89 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
90 q15_t *px; /* Intermediate inputA pointer */
91 q15_t *py; /* Intermediate inputB pointer */
92 q15_t *pSrc1; /* Intermediate pointers */
93 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
94 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
95 int32_t inc = 1; /* Destination address modifier */
96
97
98 /* The algorithm implementation is based on the lengths of the inputs. */
99 /* srcB is always made to slide across srcA. */
100 /* So srcBLen is always considered as shorter or equal to srcALen */
101 /* But CORR(x, y) is reverse of CORR(y, x) */
102 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
103 /* and the destination pointer modifier, inc is set to -1 */
104 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
105 /* But to improve the performance,
106 * we include zeroes in the output instead of zero padding either of the the inputs*/
107 /* If srcALen > srcBLen,
108 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
109 /* If srcALen < srcBLen,
110 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
111 if(srcALen >= srcBLen)
112 {
113 /* Initialization of inputA pointer */
114 pIn1 = (pSrcA);
115
116 /* Initialization of inputB pointer */
117 pIn2 = (pSrcB);
118
119 /* Number of output samples is calculated */
120 outBlockSize = (2u * srcALen) - 1u;
121
122 /* When srcALen > srcBLen, zero padding is done to srcB
123 * to make their lengths equal.
124 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
125 * number of output samples are made zero */
126 j = outBlockSize - (srcALen + (srcBLen - 1u));
127
128 /* Updating the pointer position to non zero value */
129 pOut += j;
130
131 }
132 else
133 {
134 /* Initialization of inputA pointer */
135 pIn1 = (pSrcB);
136
137 /* Initialization of inputB pointer */
138 pIn2 = (pSrcA);
139
140 /* srcBLen is always considered as shorter or equal to srcALen */
141 j = srcBLen;
142 srcBLen = srcALen;
143 srcALen = j;
144
145 /* CORR(x, y) = Reverse order(CORR(y, x)) */
146 /* Hence set the destination pointer to point to the last output sample */
147 pOut = pDst + ((srcALen + srcBLen) - 2u);
148
149 /* Destination address modifier is set to -1 */
150 inc = -1;
151
152 }
153
154 /* The function is internally
155 * divided into three parts according to the number of multiplications that has to be
156 * taken place between inputA samples and inputB samples. In the first part of the
157 * algorithm, the multiplications increase by one for every iteration.
158 * In the second part of the algorithm, srcBLen number of multiplications are done.
159 * In the third part of the algorithm, the multiplications decrease by one
160 * for every iteration.*/
161 /* The algorithm is implemented in three stages.
162 * The loop counters of each stage is initiated here. */
163 blockSize1 = srcBLen - 1u;
164 blockSize2 = srcALen - (srcBLen - 1u);
165 blockSize3 = blockSize1;
166
167 /* --------------------------
168 * Initializations of stage1
169 * -------------------------*/
170
171 /* sum = x[0] * y[srcBlen - 1]
172 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
173 * ....
174 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
175 */
176
177 /* In this stage the MAC operations are increased by 1 for every iteration.
178 The count variable holds the number of MAC operations performed */
179 count = 1u;
180
181 /* Working pointer of inputA */
182 px = pIn1;
183
184 /* Working pointer of inputB */
185 pSrc1 = pIn2 + (srcBLen - 1u);
186 py = pSrc1;
187
188 /* ------------------------
189 * Stage1 process
190 * ----------------------*/
191
192 /* The first loop starts here */
193 while(blockSize1 > 0u)
194 {
195 /* Accumulator is made zero for every iteration */
196 sum = 0;
197
198 /* Apply loop unrolling and compute 4 MACs simultaneously. */
199 k = count >> 2;
200
201 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
202 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
203 while(k > 0u)
204 {
205 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
206 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
207 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
208 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
209
210 /* Decrement the loop counter */
211 k--;
212 }
213
214 /* If the count is not a multiple of 4, compute any remaining MACs here.
215 ** No loop unrolling is used. */
216 k = count % 0x4u;
217
218 while(k > 0u)
219 {
220 /* Perform the multiply-accumulates */
221 /* x[0] * y[srcBLen - 1] */
222 sum = __SMLAD(*px++, *py++, sum);
223
224 /* Decrement the loop counter */
225 k--;
226 }
227
228 /* Store the result in the accumulator in the destination buffer. */
229 *pOut = (q15_t) (sum >> 15);
230 /* Destination pointer is updated according to the address modifier, inc */
231 pOut += inc;
232
233 /* Update the inputA and inputB pointers for next MAC calculation */
234 py = pSrc1 - count;
235 px = pIn1;
236
237 /* Increment the MAC count */
238 count++;
239
240 /* Decrement the loop counter */
241 blockSize1--;
242 }
243
244 /* --------------------------
245 * Initializations of stage2
246 * ------------------------*/
247
248 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
249 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
250 * ....
251 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
252 */
253
254 /* Working pointer of inputA */
255 px = pIn1;
256
257 /* Working pointer of inputB */
258 py = pIn2;
259
260 /* count is index by which the pointer pIn1 to be incremented */
261 count = 0u;
262
263 /* -------------------
264 * Stage2 process
265 * ------------------*/
266
267 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
268 * So, to loop unroll over blockSize2,
269 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
270 if(srcBLen >= 4u)
271 {
272 /* Loop unroll over blockSize2, by 4 */
273 blkCnt = blockSize2 >> 2u;
274
275 while(blkCnt > 0u)
276 {
277 /* Set all accumulators to zero */
278 acc0 = 0;
279 acc1 = 0;
280 acc2 = 0;
281 acc3 = 0;
282
283 /* read x[0], x[1] samples */
284 x0 = *__SIMD32(px);
285 /* read x[1], x[2] samples */
286 x1 = _SIMD32_OFFSET(px + 1);
287 px += 2u;
288
289 /* Apply loop unrolling and compute 4 MACs simultaneously. */
290 k = srcBLen >> 2u;
291
292 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
293 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
294 do
295 {
296 /* Read the first two inputB samples using SIMD:
297 * y[0] and y[1] */
298 c0 = *__SIMD32(py)++;
299
300 /* acc0 += x[0] * y[0] + x[1] * y[1] */
301 acc0 = __SMLAD(x0, c0, acc0);
302
303 /* acc1 += x[1] * y[0] + x[2] * y[1] */
304 acc1 = __SMLAD(x1, c0, acc1);
305
306 /* Read x[2], x[3] */
307 x2 = *__SIMD32(px);
308
309 /* Read x[3], x[4] */
310 x3 = _SIMD32_OFFSET(px + 1);
311
312 /* acc2 += x[2] * y[0] + x[3] * y[1] */
313 acc2 = __SMLAD(x2, c0, acc2);
314
315 /* acc3 += x[3] * y[0] + x[4] * y[1] */
316 acc3 = __SMLAD(x3, c0, acc3);
317
318 /* Read y[2] and y[3] */
319 c0 = *__SIMD32(py)++;
320
321 /* acc0 += x[2] * y[2] + x[3] * y[3] */
322 acc0 = __SMLAD(x2, c0, acc0);
323
324 /* acc1 += x[3] * y[2] + x[4] * y[3] */
325 acc1 = __SMLAD(x3, c0, acc1);
326
327 /* Read x[4], x[5] */
328 x0 = _SIMD32_OFFSET(px + 2);
329
330 /* Read x[5], x[6] */
331 x1 = _SIMD32_OFFSET(px + 3);
332 px += 4u;
333
334 /* acc2 += x[4] * y[2] + x[5] * y[3] */
335 acc2 = __SMLAD(x0, c0, acc2);
336
337 /* acc3 += x[5] * y[2] + x[6] * y[3] */
338 acc3 = __SMLAD(x1, c0, acc3);
339
340 } while(--k);
341
342 /* For the next MAC operations, SIMD is not used
343 * So, the 16 bit pointer if inputB, py is updated */
344
345 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
346 ** No loop unrolling is used. */
347 k = srcBLen % 0x4u;
348
349 if(k == 1u)
350 {
351 /* Read y[4] */
352 c0 = *py;
353 #ifdef ARM_MATH_BIG_ENDIAN
354
355 c0 = c0 << 16u;
356
357 #else
358
359 c0 = c0 & 0x0000FFFF;
360
361 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
362
363 /* Read x[7] */
364 x3 = *__SIMD32(px);
365 px++;
366
367 /* Perform the multiply-accumulates */
368 acc0 = __SMLAD(x0, c0, acc0);
369 acc1 = __SMLAD(x1, c0, acc1);
370 acc2 = __SMLADX(x1, c0, acc2);
371 acc3 = __SMLADX(x3, c0, acc3);
372 }
373
374 if(k == 2u)
375 {
376 /* Read y[4], y[5] */
377 c0 = *__SIMD32(py);
378
379 /* Read x[7], x[8] */
380 x3 = *__SIMD32(px);
381
382 /* Read x[9] */
383 x2 = _SIMD32_OFFSET(px + 1);
384 px += 2u;
385
386 /* Perform the multiply-accumulates */
387 acc0 = __SMLAD(x0, c0, acc0);
388 acc1 = __SMLAD(x1, c0, acc1);
389 acc2 = __SMLAD(x3, c0, acc2);
390 acc3 = __SMLAD(x2, c0, acc3);
391 }
392
393 if(k == 3u)
394 {
395 /* Read y[4], y[5] */
396 c0 = *__SIMD32(py)++;
397
398 /* Read x[7], x[8] */
399 x3 = *__SIMD32(px);
400
401 /* Read x[9] */
402 x2 = _SIMD32_OFFSET(px + 1);
403
404 /* Perform the multiply-accumulates */
405 acc0 = __SMLAD(x0, c0, acc0);
406 acc1 = __SMLAD(x1, c0, acc1);
407 acc2 = __SMLAD(x3, c0, acc2);
408 acc3 = __SMLAD(x2, c0, acc3);
409
410 c0 = (*py);
411 /* Read y[6] */
412 #ifdef ARM_MATH_BIG_ENDIAN
413
414 c0 = c0 << 16u;
415 #else
416
417 c0 = c0 & 0x0000FFFF;
418 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
419
420 /* Read x[10] */
421 x3 = _SIMD32_OFFSET(px + 2);
422 px += 3u;
423
424 /* Perform the multiply-accumulates */
425 acc0 = __SMLADX(x1, c0, acc0);
426 acc1 = __SMLAD(x2, c0, acc1);
427 acc2 = __SMLADX(x2, c0, acc2);
428 acc3 = __SMLADX(x3, c0, acc3);
429 }
430
431 /* Store the result in the accumulator in the destination buffer. */
432 *pOut = (q15_t) (acc0 >> 15);
433 /* Destination pointer is updated according to the address modifier, inc */
434 pOut += inc;
435
436 *pOut = (q15_t) (acc1 >> 15);
437 pOut += inc;
438
439 *pOut = (q15_t) (acc2 >> 15);
440 pOut += inc;
441
442 *pOut = (q15_t) (acc3 >> 15);
443 pOut += inc;
444
445 /* Increment the pointer pIn1 index, count by 1 */
446 count += 4u;
447
448 /* Update the inputA and inputB pointers for next MAC calculation */
449 px = pIn1 + count;
450 py = pIn2;
451
452
453 /* Decrement the loop counter */
454 blkCnt--;
455 }
456
457 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
458 ** No loop unrolling is used. */
459 blkCnt = blockSize2 % 0x4u;
460
461 while(blkCnt > 0u)
462 {
463 /* Accumulator is made zero for every iteration */
464 sum = 0;
465
466 /* Apply loop unrolling and compute 4 MACs simultaneously. */
467 k = srcBLen >> 2u;
468
469 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
470 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
471 while(k > 0u)
472 {
473 /* Perform the multiply-accumulates */
474 sum += ((q31_t) * px++ * *py++);
475 sum += ((q31_t) * px++ * *py++);
476 sum += ((q31_t) * px++ * *py++);
477 sum += ((q31_t) * px++ * *py++);
478
479 /* Decrement the loop counter */
480 k--;
481 }
482
483 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
484 ** No loop unrolling is used. */
485 k = srcBLen % 0x4u;
486
487 while(k > 0u)
488 {
489 /* Perform the multiply-accumulates */
490 sum += ((q31_t) * px++ * *py++);
491
492 /* Decrement the loop counter */
493 k--;
494 }
495
496 /* Store the result in the accumulator in the destination buffer. */
497 *pOut = (q15_t) (sum >> 15);
498 /* Destination pointer is updated according to the address modifier, inc */
499 pOut += inc;
500
501 /* Increment the pointer pIn1 index, count by 1 */
502 count++;
503
504 /* Update the inputA and inputB pointers for next MAC calculation */
505 px = pIn1 + count;
506 py = pIn2;
507
508 /* Decrement the loop counter */
509 blkCnt--;
510 }
511 }
512 else
513 {
514 /* If the srcBLen is not a multiple of 4,
515 * the blockSize2 loop cannot be unrolled by 4 */
516 blkCnt = blockSize2;
517
518 while(blkCnt > 0u)
519 {
520 /* Accumulator is made zero for every iteration */
521 sum = 0;
522
523 /* Loop over srcBLen */
524 k = srcBLen;
525
526 while(k > 0u)
527 {
528 /* Perform the multiply-accumulate */
529 sum += ((q31_t) * px++ * *py++);
530
531 /* Decrement the loop counter */
532 k--;
533 }
534
535 /* Store the result in the accumulator in the destination buffer. */
536 *pOut = (q15_t) (sum >> 15);
537 /* Destination pointer is updated according to the address modifier, inc */
538 pOut += inc;
539
540 /* Increment the MAC count */
541 count++;
542
543 /* Update the inputA and inputB pointers for next MAC calculation */
544 px = pIn1 + count;
545 py = pIn2;
546
547 /* Decrement the loop counter */
548 blkCnt--;
549 }
550 }
551
552 /* --------------------------
553 * Initializations of stage3
554 * -------------------------*/
555
556 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
557 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
558 * ....
559 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
560 * sum += x[srcALen-1] * y[0]
561 */
562
563 /* In this stage the MAC operations are decreased by 1 for every iteration.
564 The count variable holds the number of MAC operations performed */
565 count = srcBLen - 1u;
566
567 /* Working pointer of inputA */
568 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
569 px = pSrc1;
570
571 /* Working pointer of inputB */
572 py = pIn2;
573
574 /* -------------------
575 * Stage3 process
576 * ------------------*/
577
578 while(blockSize3 > 0u)
579 {
580 /* Accumulator is made zero for every iteration */
581 sum = 0;
582
583 /* Apply loop unrolling and compute 4 MACs simultaneously. */
584 k = count >> 2u;
585
586 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
587 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
588 while(k > 0u)
589 {
590 /* Perform the multiply-accumulates */
591 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
592 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
593 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
594 sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
595
596 /* Decrement the loop counter */
597 k--;
598 }
599
600 /* If the count is not a multiple of 4, compute any remaining MACs here.
601 ** No loop unrolling is used. */
602 k = count % 0x4u;
603
604 while(k > 0u)
605 {
606 /* Perform the multiply-accumulates */
607 sum = __SMLAD(*px++, *py++, sum);
608
609 /* Decrement the loop counter */
610 k--;
611 }
612
613 /* Store the result in the accumulator in the destination buffer. */
614 *pOut = (q15_t) (sum >> 15);
615 /* Destination pointer is updated according to the address modifier, inc */
616 pOut += inc;
617
618 /* Update the inputA and inputB pointers for next MAC calculation */
619 px = ++pSrc1;
620 py = pIn2;
621
622 /* Decrement the MAC count */
623 count--;
624
625 /* Decrement the loop counter */
626 blockSize3--;
627 }
628
629 #else
630
631 q15_t *pIn1; /* inputA pointer */
632 q15_t *pIn2; /* inputB pointer */
633 q15_t *pOut = pDst; /* output pointer */
634 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
635 q15_t *px; /* Intermediate inputA pointer */
636 q15_t *py; /* Intermediate inputB pointer */
637 q15_t *pSrc1; /* Intermediate pointers */
638 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
639 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
640 int32_t inc = 1; /* Destination address modifier */
641 q15_t a, b;
642
643
644 /* The algorithm implementation is based on the lengths of the inputs. */
645 /* srcB is always made to slide across srcA. */
646 /* So srcBLen is always considered as shorter or equal to srcALen */
647 /* But CORR(x, y) is reverse of CORR(y, x) */
648 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
649 /* and the destination pointer modifier, inc is set to -1 */
650 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
651 /* But to improve the performance,
652 * we include zeroes in the output instead of zero padding either of the the inputs*/
653 /* If srcALen > srcBLen,
654 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
655 /* If srcALen < srcBLen,
656 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
657 if(srcALen >= srcBLen)
658 {
659 /* Initialization of inputA pointer */
660 pIn1 = (pSrcA);
661
662 /* Initialization of inputB pointer */
663 pIn2 = (pSrcB);
664
665 /* Number of output samples is calculated */
666 outBlockSize = (2u * srcALen) - 1u;
667
668 /* When srcALen > srcBLen, zero padding is done to srcB
669 * to make their lengths equal.
670 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
671 * number of output samples are made zero */
672 j = outBlockSize - (srcALen + (srcBLen - 1u));
673
674 /* Updating the pointer position to non zero value */
675 pOut += j;
676
677 }
678 else
679 {
680 /* Initialization of inputA pointer */
681 pIn1 = (pSrcB);
682
683 /* Initialization of inputB pointer */
684 pIn2 = (pSrcA);
685
686 /* srcBLen is always considered as shorter or equal to srcALen */
687 j = srcBLen;
688 srcBLen = srcALen;
689 srcALen = j;
690
691 /* CORR(x, y) = Reverse order(CORR(y, x)) */
692 /* Hence set the destination pointer to point to the last output sample */
693 pOut = pDst + ((srcALen + srcBLen) - 2u);
694
695 /* Destination address modifier is set to -1 */
696 inc = -1;
697
698 }
699
700 /* The function is internally
701 * divided into three parts according to the number of multiplications that has to be
702 * taken place between inputA samples and inputB samples. In the first part of the
703 * algorithm, the multiplications increase by one for every iteration.
704 * In the second part of the algorithm, srcBLen number of multiplications are done.
705 * In the third part of the algorithm, the multiplications decrease by one
706 * for every iteration.*/
707 /* The algorithm is implemented in three stages.
708 * The loop counters of each stage is initiated here. */
709 blockSize1 = srcBLen - 1u;
710 blockSize2 = srcALen - (srcBLen - 1u);
711 blockSize3 = blockSize1;
712
713 /* --------------------------
714 * Initializations of stage1
715 * -------------------------*/
716
717 /* sum = x[0] * y[srcBlen - 1]
718 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
719 * ....
720 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
721 */
722
723 /* In this stage the MAC operations are increased by 1 for every iteration.
724 The count variable holds the number of MAC operations performed */
725 count = 1u;
726
727 /* Working pointer of inputA */
728 px = pIn1;
729
730 /* Working pointer of inputB */
731 pSrc1 = pIn2 + (srcBLen - 1u);
732 py = pSrc1;
733
734 /* ------------------------
735 * Stage1 process
736 * ----------------------*/
737
738 /* The first loop starts here */
739 while(blockSize1 > 0u)
740 {
741 /* Accumulator is made zero for every iteration */
742 sum = 0;
743
744 /* Apply loop unrolling and compute 4 MACs simultaneously. */
745 k = count >> 2;
746
747 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
748 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
749 while(k > 0u)
750 {
751 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
752 sum += ((q31_t) * px++ * *py++);
753 sum += ((q31_t) * px++ * *py++);
754 sum += ((q31_t) * px++ * *py++);
755 sum += ((q31_t) * px++ * *py++);
756
757 /* Decrement the loop counter */
758 k--;
759 }
760
761 /* If the count is not a multiple of 4, compute any remaining MACs here.
762 ** No loop unrolling is used. */
763 k = count % 0x4u;
764
765 while(k > 0u)
766 {
767 /* Perform the multiply-accumulates */
768 /* x[0] * y[srcBLen - 1] */
769 sum += ((q31_t) * px++ * *py++);
770
771 /* Decrement the loop counter */
772 k--;
773 }
774
775 /* Store the result in the accumulator in the destination buffer. */
776 *pOut = (q15_t) (sum >> 15);
777 /* Destination pointer is updated according to the address modifier, inc */
778 pOut += inc;
779
780 /* Update the inputA and inputB pointers for next MAC calculation */
781 py = pSrc1 - count;
782 px = pIn1;
783
784 /* Increment the MAC count */
785 count++;
786
787 /* Decrement the loop counter */
788 blockSize1--;
789 }
790
791 /* --------------------------
792 * Initializations of stage2
793 * ------------------------*/
794
795 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
796 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
797 * ....
798 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
799 */
800
801 /* Working pointer of inputA */
802 px = pIn1;
803
804 /* Working pointer of inputB */
805 py = pIn2;
806
807 /* count is index by which the pointer pIn1 to be incremented */
808 count = 0u;
809
810 /* -------------------
811 * Stage2 process
812 * ------------------*/
813
814 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
815 * So, to loop unroll over blockSize2,
816 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
817 if(srcBLen >= 4u)
818 {
819 /* Loop unroll over blockSize2, by 4 */
820 blkCnt = blockSize2 >> 2u;
821
822 while(blkCnt > 0u)
823 {
824 /* Set all accumulators to zero */
825 acc0 = 0;
826 acc1 = 0;
827 acc2 = 0;
828 acc3 = 0;
829
830 /* read x[0], x[1], x[2] samples */
831 a = *px;
832 b = *(px + 1);
833
834 #ifndef ARM_MATH_BIG_ENDIAN
835
836 x0 = __PKHBT(a, b, 16);
837 a = *(px + 2);
838 x1 = __PKHBT(b, a, 16);
839
840 #else
841
842 x0 = __PKHBT(b, a, 16);
843 a = *(px + 2);
844 x1 = __PKHBT(a, b, 16);
845
846 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
847
848 px += 2u;
849
850 /* Apply loop unrolling and compute 4 MACs simultaneously. */
851 k = srcBLen >> 2u;
852
853 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
854 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
855 do
856 {
857 /* Read the first two inputB samples using SIMD:
858 * y[0] and y[1] */
859 a = *py;
860 b = *(py + 1);
861
862 #ifndef ARM_MATH_BIG_ENDIAN
863
864 c0 = __PKHBT(a, b, 16);
865
866 #else
867
868 c0 = __PKHBT(b, a, 16);
869
870 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
871
872 /* acc0 += x[0] * y[0] + x[1] * y[1] */
873 acc0 = __SMLAD(x0, c0, acc0);
874
875 /* acc1 += x[1] * y[0] + x[2] * y[1] */
876 acc1 = __SMLAD(x1, c0, acc1);
877
878 /* Read x[2], x[3], x[4] */
879 a = *px;
880 b = *(px + 1);
881
882 #ifndef ARM_MATH_BIG_ENDIAN
883
884 x2 = __PKHBT(a, b, 16);
885 a = *(px + 2);
886 x3 = __PKHBT(b, a, 16);
887
888 #else
889
890 x2 = __PKHBT(b, a, 16);
891 a = *(px + 2);
892 x3 = __PKHBT(a, b, 16);
893
894 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
895
896 /* acc2 += x[2] * y[0] + x[3] * y[1] */
897 acc2 = __SMLAD(x2, c0, acc2);
898
899 /* acc3 += x[3] * y[0] + x[4] * y[1] */
900 acc3 = __SMLAD(x3, c0, acc3);
901
902 /* Read y[2] and y[3] */
903 a = *(py + 2);
904 b = *(py + 3);
905
906 py += 4u;
907
908 #ifndef ARM_MATH_BIG_ENDIAN
909
910 c0 = __PKHBT(a, b, 16);
911
912 #else
913
914 c0 = __PKHBT(b, a, 16);
915
916 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
917
918 /* acc0 += x[2] * y[2] + x[3] * y[3] */
919 acc0 = __SMLAD(x2, c0, acc0);
920
921 /* acc1 += x[3] * y[2] + x[4] * y[3] */
922 acc1 = __SMLAD(x3, c0, acc1);
923
924 /* Read x[4], x[5], x[6] */
925 a = *(px + 2);
926 b = *(px + 3);
927
928 #ifndef ARM_MATH_BIG_ENDIAN
929
930 x0 = __PKHBT(a, b, 16);
931 a = *(px + 4);
932 x1 = __PKHBT(b, a, 16);
933
934 #else
935
936 x0 = __PKHBT(b, a, 16);
937 a = *(px + 4);
938 x1 = __PKHBT(a, b, 16);
939
940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
941
942 px += 4u;
943
944 /* acc2 += x[4] * y[2] + x[5] * y[3] */
945 acc2 = __SMLAD(x0, c0, acc2);
946
947 /* acc3 += x[5] * y[2] + x[6] * y[3] */
948 acc3 = __SMLAD(x1, c0, acc3);
949
950 } while(--k);
951
952 /* For the next MAC operations, SIMD is not used
953 * So, the 16 bit pointer if inputB, py is updated */
954
955 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
956 ** No loop unrolling is used. */
957 k = srcBLen % 0x4u;
958
959 if(k == 1u)
960 {
961 /* Read y[4] */
962 c0 = *py;
963 #ifdef ARM_MATH_BIG_ENDIAN
964
965 c0 = c0 << 16u;
966
967 #else
968
969 c0 = c0 & 0x0000FFFF;
970
971 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
972
973 /* Read x[7] */
974 a = *px;
975 b = *(px + 1);
976
977 px++;;
978
979 #ifndef ARM_MATH_BIG_ENDIAN
980
981 x3 = __PKHBT(a, b, 16);
982
983 #else
984
985 x3 = __PKHBT(b, a, 16);
986
987 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
988
989 px++;
990
991 /* Perform the multiply-accumulates */
992 acc0 = __SMLAD(x0, c0, acc0);
993 acc1 = __SMLAD(x1, c0, acc1);
994 acc2 = __SMLADX(x1, c0, acc2);
995 acc3 = __SMLADX(x3, c0, acc3);
996 }
997
998 if(k == 2u)
999 {
1000 /* Read y[4], y[5] */
1001 a = *py;
1002 b = *(py + 1);
1003
1004 #ifndef ARM_MATH_BIG_ENDIAN
1005
1006 c0 = __PKHBT(a, b, 16);
1007
1008 #else
1009
1010 c0 = __PKHBT(b, a, 16);
1011
1012 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1013
1014 /* Read x[7], x[8], x[9] */
1015 a = *px;
1016 b = *(px + 1);
1017
1018 #ifndef ARM_MATH_BIG_ENDIAN
1019
1020 x3 = __PKHBT(a, b, 16);
1021 a = *(px + 2);
1022 x2 = __PKHBT(b, a, 16);
1023
1024 #else
1025
1026 x3 = __PKHBT(b, a, 16);
1027 a = *(px + 2);
1028 x2 = __PKHBT(a, b, 16);
1029
1030 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1031
1032 px += 2u;
1033
1034 /* Perform the multiply-accumulates */
1035 acc0 = __SMLAD(x0, c0, acc0);
1036 acc1 = __SMLAD(x1, c0, acc1);
1037 acc2 = __SMLAD(x3, c0, acc2);
1038 acc3 = __SMLAD(x2, c0, acc3);
1039 }
1040
1041 if(k == 3u)
1042 {
1043 /* Read y[4], y[5] */
1044 a = *py;
1045 b = *(py + 1);
1046
1047 #ifndef ARM_MATH_BIG_ENDIAN
1048
1049 c0 = __PKHBT(a, b, 16);
1050
1051 #else
1052
1053 c0 = __PKHBT(b, a, 16);
1054
1055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1056
1057 py += 2u;
1058
1059 /* Read x[7], x[8], x[9] */
1060 a = *px;
1061 b = *(px + 1);
1062
1063 #ifndef ARM_MATH_BIG_ENDIAN
1064
1065 x3 = __PKHBT(a, b, 16);
1066 a = *(px + 2);
1067 x2 = __PKHBT(b, a, 16);
1068
1069 #else
1070
1071 x3 = __PKHBT(b, a, 16);
1072 a = *(px + 2);
1073 x2 = __PKHBT(a, b, 16);
1074
1075 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1076
1077 /* Perform the multiply-accumulates */
1078 acc0 = __SMLAD(x0, c0, acc0);
1079 acc1 = __SMLAD(x1, c0, acc1);
1080 acc2 = __SMLAD(x3, c0, acc2);
1081 acc3 = __SMLAD(x2, c0, acc3);
1082
1083 c0 = (*py);
1084 /* Read y[6] */
1085 #ifdef ARM_MATH_BIG_ENDIAN
1086
1087 c0 = c0 << 16u;
1088 #else
1089
1090 c0 = c0 & 0x0000FFFF;
1091 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1092
1093 /* Read x[10] */
1094 b = *(px + 3);
1095
1096 #ifndef ARM_MATH_BIG_ENDIAN
1097
1098 x3 = __PKHBT(a, b, 16);
1099
1100 #else
1101
1102 x3 = __PKHBT(b, a, 16);
1103
1104 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1105
1106 px += 3u;
1107
1108 /* Perform the multiply-accumulates */
1109 acc0 = __SMLADX(x1, c0, acc0);
1110 acc1 = __SMLAD(x2, c0, acc1);
1111 acc2 = __SMLADX(x2, c0, acc2);
1112 acc3 = __SMLADX(x3, c0, acc3);
1113 }
1114
1115 /* Store the result in the accumulator in the destination buffer. */
1116 *pOut = (q15_t) (acc0 >> 15);
1117 /* Destination pointer is updated according to the address modifier, inc */
1118 pOut += inc;
1119
1120 *pOut = (q15_t) (acc1 >> 15);
1121 pOut += inc;
1122
1123 *pOut = (q15_t) (acc2 >> 15);
1124 pOut += inc;
1125
1126 *pOut = (q15_t) (acc3 >> 15);
1127 pOut += inc;
1128
1129 /* Increment the pointer pIn1 index, count by 4 (four output samples were computed in this pass) */
1130 count += 4u;
1131
1132 /* Update the inputA and inputB pointers for next MAC calculation */
1133 px = pIn1 + count;
1134 py = pIn2;
1135
1136
1137 /* Decrement the loop counter */
1138 blkCnt--;
1139 }
1140
1141 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1142 ** No loop unrolling is used. */
1143 blkCnt = blockSize2 % 0x4u;
1144
1145 while(blkCnt > 0u)
1146 {
1147 /* Accumulator is made zero for every iteration */
1148 sum = 0;
1149
1150 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1151 k = srcBLen >> 2u;
1152
1153 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1154 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1155 while(k > 0u)
1156 {
1157 /* Perform the multiply-accumulates */
1158 sum += ((q31_t) * px++ * *py++);
1159 sum += ((q31_t) * px++ * *py++);
1160 sum += ((q31_t) * px++ * *py++);
1161 sum += ((q31_t) * px++ * *py++);
1162
1163 /* Decrement the loop counter */
1164 k--;
1165 }
1166
1167 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1168 ** No loop unrolling is used. */
1169 k = srcBLen % 0x4u;
1170
1171 while(k > 0u)
1172 {
1173 /* Perform the multiply-accumulates */
1174 sum += ((q31_t) * px++ * *py++);
1175
1176 /* Decrement the loop counter */
1177 k--;
1178 }
1179
1180 /* Store the result in the accumulator in the destination buffer. */
1181 *pOut = (q15_t) (sum >> 15);
1182 /* Destination pointer is updated according to the address modifier, inc */
1183 pOut += inc;
1184
1185 /* Increment the pointer pIn1 index, count by 1 */
1186 count++;
1187
1188 /* Update the inputA and inputB pointers for next MAC calculation */
1189 px = pIn1 + count;
1190 py = pIn2;
1191
1192 /* Decrement the loop counter */
1193 blkCnt--;
1194 }
1195 }
1196 else
1197 {
1198 /* If the srcBLen is not a multiple of 4,
1199 * the blockSize2 loop cannot be unrolled by 4 */
1200 blkCnt = blockSize2;
1201
1202 while(blkCnt > 0u)
1203 {
1204 /* Accumulator is made zero for every iteration */
1205 sum = 0;
1206
1207 /* Loop over srcBLen */
1208 k = srcBLen;
1209
1210 while(k > 0u)
1211 {
1212 /* Perform the multiply-accumulate */
1213 sum += ((q31_t) * px++ * *py++);
1214
1215 /* Decrement the loop counter */
1216 k--;
1217 }
1218
1219 /* Store the result in the accumulator in the destination buffer. */
1220 *pOut = (q15_t) (sum >> 15);
1221 /* Destination pointer is updated according to the address modifier, inc */
1222 pOut += inc;
1223
1224 /* Increment the MAC count */
1225 count++;
1226
1227 /* Update the inputA and inputB pointers for next MAC calculation */
1228 px = pIn1 + count;
1229 py = pIn2;
1230
1231 /* Decrement the loop counter */
1232 blkCnt--;
1233 }
1234 }
1235
1236 /* --------------------------
1237 * Initializations of stage3
1238 * -------------------------*/
1239
1240 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
1241 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
1242 * ....
1243 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
1244 * sum += x[srcALen-1] * y[0]
1245 */
1246
1247 /* In this stage the MAC operations are decreased by 1 for every iteration.
1248 The count variable holds the number of MAC operations performed */
1249 count = srcBLen - 1u;
1250
1251 /* Working pointer of inputA */
1252 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1253 px = pSrc1;
1254
1255 /* Working pointer of inputB */
1256 py = pIn2;
1257
1258 /* -------------------
1259 * Stage3 process
1260 * ------------------*/
1261
1262 while(blockSize3 > 0u)
1263 {
1264 /* Accumulator is made zero for every iteration */
1265 sum = 0;
1266
1267 /* Apply loop unrolling and compute 4 MACs simultaneously. */
1268 k = count >> 2u;
1269
1270 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1271 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1272 while(k > 0u)
1273 {
1274 /* Perform the multiply-accumulates */
1275 sum += ((q31_t) * px++ * *py++);
1276 sum += ((q31_t) * px++ * *py++);
1277 sum += ((q31_t) * px++ * *py++);
1278 sum += ((q31_t) * px++ * *py++);
1279
1280 /* Decrement the loop counter */
1281 k--;
1282 }
1283
1284 /* If the count is not a multiple of 4, compute any remaining MACs here.
1285 ** No loop unrolling is used. */
1286 k = count % 0x4u;
1287
1288 while(k > 0u)
1289 {
1290 /* Perform the multiply-accumulates */
1291 sum += ((q31_t) * px++ * *py++);
1292
1293 /* Decrement the loop counter */
1294 k--;
1295 }
1296
1297 /* Store the result in the accumulator in the destination buffer. */
1298 *pOut = (q15_t) (sum >> 15);
1299 /* Destination pointer is updated according to the address modifier, inc */
1300 pOut += inc;
1301
1302 /* Update the inputA and inputB pointers for next MAC calculation */
1303 px = ++pSrc1;
1304 py = pIn2;
1305
1306 /* Decrement the MAC count */
1307 count--;
1308
1309 /* Decrement the loop counter */
1310 blockSize3--;
1311 }
1312
1313 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1314
1315 }
1316
1317 /**
1318 * @} end of Corr group
1319 */
Imprint / Impressum