]>
git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_correlate_opt_q7.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_opt_q7.c
10 * Description: Correlation of Q7 sequences.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
53 * @brief Correlation of Q7 sequences.
54 * @param[in] *pSrcA points to the first input sequence.
55 * @param[in] srcALen length of the first input sequence.
56 * @param[in] *pSrcB points to the second input sequence.
57 * @param[in] srcBLen length of the second input sequence.
58 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
59 * @param[in] *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
60 * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
65 * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
66 * In this case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.
69 * <b>Scaling and Overflow Behavior:</b>
72 * The function is implemented using a 32-bit internal accumulator.
73 * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
74 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
75 * This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
76 * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
83 void arm_correlate_opt_q7(
92 q7_t
*pOut
= pDst
; /* output pointer */
93 q15_t
*pScr1
= pScratch1
; /* Temporary pointer for scratch */
94 q15_t
*pScr2
= pScratch2
; /* Temporary pointer for scratch */
95 q7_t
*pIn1
; /* inputA pointer */
96 q7_t
*pIn2
; /* inputB pointer */
97 q15_t
*py
; /* Intermediate inputB pointer */
98 q31_t acc0
, acc1
, acc2
, acc3
; /* Accumulators */
99 uint32_t j
, k
= 0u, blkCnt
; /* loop counter */
100 int32_t inc
= 1; /* output pointer increment */
101 uint32_t outBlockSize
; /* loop counter */
102 q15_t x4
; /* Temporary input variable */
103 uint32_t tapCnt
; /* loop counter */
104 q31_t x1
, x2
, x3
, y1
; /* Temporary input variables */
106 /* The algorithm implementation is based on the lengths of the inputs. */
107 /* srcB is always made to slide across srcA. */
108 /* So srcBLen is always considered as shorter or equal to srcALen */
109 /* But CORR(x, y) is reverse of CORR(y, x) */
110 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
111 /* and the destination pointer modifier, inc is set to -1 */
112 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
113 /* But to improve the performance,
114 * we include zeroes in the output instead of zero padding either of the inputs */
115 /* If srcALen > srcBLen,
116 * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
117 /* If srcALen < srcBLen,
118 * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
119 if(srcALen
>= srcBLen
)
121 /* Initialization of inputA pointer */
124 /* Initialization of inputB pointer */
127 /* Number of output samples is calculated */
128 outBlockSize
= (2u * srcALen
) - 1u;
130 /* When srcALen > srcBLen, srcB would have to be zero padded
131 * to make the two lengths equal.
132 * Instead of padding, (outBlockSize - (srcALen + srcBLen - 1))
133 * output samples are set to zero */
134 j
= outBlockSize
- (srcALen
+ (srcBLen
- 1u));
136 /* Updating the pointer position to non zero value */
142 /* Initialization of inputA pointer */
145 /* Initialization of inputB pointer */
148 /* srcBLen is always considered as shorter or equal to srcALen */
153 /* CORR(x, y) = Reverse order(CORR(y, x)) */
154 /* Hence set the destination pointer to point to the last output sample */
155 pOut
= pDst
+ ((srcALen
+ srcBLen
) - 2u);
157 /* Destination address modifier is set to -1 */
163 /* Copy (srcBLen) samples in scratch buffer */
166 /* First part of the processing with loop unrolling copies 4 data points at a time.
167 ** a second loop below copies for the remaining 1 to 3 samples. */
170 /* copy second buffer in reversal manner */
171 x4
= (q15_t
) * pIn2
++;
173 x4
= (q15_t
) * pIn2
++;
175 x4
= (q15_t
) * pIn2
++;
177 x4
= (q15_t
) * pIn2
++;
180 /* Decrement the loop counter */
184 /* If the count is not a multiple of 4, copy remaining samples here.
185 ** No loop unrolling is used. */
190 /* copy second buffer in reversal manner for remaining samples */
191 x4
= (q15_t
) * pIn2
++;
194 /* Decrement the loop counter */
198 /* Fill (srcBLen - 1u) zeros in scratch buffer */
199 arm_fill_q15(0, pScr1
, (srcBLen
- 1u));
201 /* Update temporary scratch pointer */
202 pScr1
+= (srcBLen
- 1u);
204 /* Copy (srcALen) samples in scratch buffer */
207 /* First part of the processing with loop unrolling copies 4 data points at a time.
208 ** a second loop below copies for the remaining 1 to 3 samples. */
211 /* copy second buffer in reversal manner */
212 x4
= (q15_t
) * pIn1
++;
214 x4
= (q15_t
) * pIn1
++;
216 x4
= (q15_t
) * pIn1
++;
218 x4
= (q15_t
) * pIn1
++;
221 /* Decrement the loop counter */
225 /* If the count is not a multiple of 4, copy remaining samples here.
226 ** No loop unrolling is used. */
231 /* copy second buffer in reversal manner for remaining samples */
232 x4
= (q15_t
) * pIn1
++;
235 /* Decrement the loop counter */
239 #ifndef UNALIGNED_SUPPORT_DISABLE
241 /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
242 arm_fill_q15(0, pScr1
, (srcBLen
- 1u));
245 pScr1
+= (srcBLen
- 1u);
249 /* Apply loop unrolling and do 4 Copies simultaneously. */
250 k
= (srcBLen
- 1u) >> 2u;
252 /* First part of the processing with loop unrolling copies 4 data points at a time.
253 ** a second loop below copies for the remaining 1 to 3 samples. */
256 /* copy second buffer in reversal manner */
262 /* Decrement the loop counter */
266 /* If the count is not a multiple of 4, copy remaining samples here.
267 ** No loop unrolling is used. */
268 k
= (srcBLen
- 1u) % 0x4u
;
272 /* copy second buffer in reversal manner for remaining samples */
275 /* Decrement the loop counter */
279 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
281 /* Temporary pointer for second sequence */
284 /* Initialization of pScr2 pointer */
287 /* Actual correlation process starts here */
288 blkCnt
= (srcALen
+ srcBLen
- 1u) >> 2;
292 /* Initialize temporary scratch pointer as scratch1 */
295 /* Clear accumulators */
301 /* Read two samples from scratch1 buffer */
302 x1
= *__SIMD32(pScr1
)++;
304 /* Read next two samples from scratch1 buffer */
305 x2
= *__SIMD32(pScr1
)++;
307 tapCnt
= (srcBLen
) >> 2u;
312 /* Read two samples from smaller buffer */
313 y1
= _SIMD32_OFFSET(pScr2
);
315 /* multiply and accumulate */
316 acc0
= __SMLAD(x1
, y1
, acc0
);
317 acc2
= __SMLAD(x2
, y1
, acc2
);
319 /* pack input data */
320 #ifndef ARM_MATH_BIG_ENDIAN
321 x3
= __PKHBT(x2
, x1
, 0);
323 x3
= __PKHBT(x1
, x2
, 0);
326 /* multiply and accumulate */
327 acc1
= __SMLADX(x3
, y1
, acc1
);
329 /* Read next two samples from scratch1 buffer */
330 x1
= *__SIMD32(pScr1
)++;
332 /* pack input data */
333 #ifndef ARM_MATH_BIG_ENDIAN
334 x3
= __PKHBT(x1
, x2
, 0);
336 x3
= __PKHBT(x2
, x1
, 0);
339 acc3
= __SMLADX(x3
, y1
, acc3
);
341 /* Read next two samples from smaller buffer */
342 y1
= _SIMD32_OFFSET(pScr2
+ 2u);
344 acc0
= __SMLAD(x2
, y1
, acc0
);
346 acc2
= __SMLAD(x1
, y1
, acc2
);
348 acc1
= __SMLADX(x3
, y1
, acc1
);
350 x2
= *__SIMD32(pScr1
)++;
352 #ifndef ARM_MATH_BIG_ENDIAN
353 x3
= __PKHBT(x2
, x1
, 0);
355 x3
= __PKHBT(x1
, x2
, 0);
358 acc3
= __SMLADX(x3
, y1
, acc3
);
363 /* Decrement the loop counter */
369 /* Update scratch pointer for remaining samples of smaller length sequence */
373 /* apply the same as above for the remaining samples of the smaller length sequence */
374 tapCnt
= (srcBLen
) & 3u;
379 /* accumulate the results */
380 acc0
+= (*pScr1
++ * *pScr2
);
381 acc1
+= (*pScr1
++ * *pScr2
);
382 acc2
+= (*pScr1
++ * *pScr2
);
383 acc3
+= (*pScr1
++ * *pScr2
++);
387 /* Decrement the loop counter */
393 /* Store the result in the accumulator in the destination buffer. */
394 *pOut
= (q7_t
) (__SSAT(acc0
>> 7u, 8));
396 *pOut
= (q7_t
) (__SSAT(acc1
>> 7u, 8));
398 *pOut
= (q7_t
) (__SSAT(acc2
>> 7u, 8));
400 *pOut
= (q7_t
) (__SSAT(acc3
>> 7u, 8));
403 /* Initialization of inputB pointer */
411 blkCnt
= (srcALen
+ srcBLen
- 1u) & 0x3;
413 /* Calculate correlation for remaining samples of Bigger length sequence */
416 /* Initialize temporary scratch pointer as scratch1 */
419 /* Clear accumulators */
422 tapCnt
= (srcBLen
) >> 1u;
426 acc0
+= (*pScr1
++ * *pScr2
++);
427 acc0
+= (*pScr1
++ * *pScr2
++);
429 /* Decrement the loop counter */
433 tapCnt
= (srcBLen
) & 1u;
435 /* apply the same as above for the remaining samples of the smaller length sequence */
439 /* accumulate the results */
440 acc0
+= (*pScr1
++ * *pScr2
++);
442 /* Decrement the loop counter */
448 /* Store the result in the accumulator in the destination buffer. */
449 *pOut
= (q7_t
) (__SSAT(acc0
>> 7u, 8));
453 /* Initialization of inputB pointer */
463 * @} end of Corr group