]> git.gir.st - tmk_keyboard.git/blob - tmk_core/tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_fir_decimate_fast_q15.c
Merge commit '1fe4406f374291ab2e86e95a97341fd9c475fcb8'
[tmk_keyboard.git] / tmk_core / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_fir_decimate_fast_q15.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_decimate_fast_q15.c
9 *
10 * Description: Fast Q15 FIR Decimator.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40
41 #include "arm_math.h"
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @addtogroup FIR_decimate
49 * @{
50 */
51
52 /**
53 * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
54 * @param[in] *S points to an instance of the Q15 FIR decimator structure.
55 * @param[in] *pSrc points to the block of input data.
56 * @param[out] *pDst points to the block of output data
57 * @param[in] blockSize number of input samples to process per call.
58 * @return none
59 *
60 * \par Restrictions
61 * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
62 * In this case the input, output, and state buffers should be 32-bit aligned.
63 *
64 * <b>Scaling and Overflow Behavior:</b>
65 * \par
66 * This fast version uses a 32-bit accumulator with 2.30 format.
67 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
68 * Thus, if the accumulator result overflows it wraps around and distorts the result.
69 * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
70 * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
71 *
72 * \par
73 * Refer to the function <code>arm_fir_decimate_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
74 * Both the slow and the fast versions use the same instance structure.
75 * Use the function <code>arm_fir_decimate_init_q15()</code> to initialize the filter structure.
76 */
77
78 #ifndef UNALIGNED_SUPPORT_DISABLE
79
80 void arm_fir_decimate_fast_q15(
81 const arm_fir_decimate_instance_q15 * S,
82 q15_t * pSrc,
83 q15_t * pDst,
84 uint32_t blockSize)
85 {
86 q15_t *pState = S->pState; /* State pointer */
87 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
88 q15_t *pStateCurnt; /* Points to the current sample of the state */
89 q15_t *px; /* Temporary pointer for state buffer */
90 q15_t *pb; /* Temporary pointer coefficient buffer */
91 q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */
92 q31_t sum0; /* Accumulators */
93 q31_t acc0, acc1;
94 q15_t *px0, *px1;
95 uint32_t blkCntN3;
96 uint32_t numTaps = S->numTaps; /* Number of taps */
97 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
98
99
100 /* S->pState buffer contains previous frame (numTaps - 1) samples */
101 /* pStateCurnt points to the location where the new input data should be written */
102 pStateCurnt = S->pState + (numTaps - 1u);
103
104
105 /* Total number of output samples to be computed */
106 blkCnt = outBlockSize / 2;
107 blkCntN3 = outBlockSize - (2 * blkCnt);
108
109
110 while(blkCnt > 0u)
111 {
112 /* Copy decimation factor number of new input samples into the state buffer */
113 i = 2 * S->M;
114
115 do
116 {
117 *pStateCurnt++ = *pSrc++;
118
119 } while(--i);
120
121 /* Set accumulator to zero */
122 acc0 = 0;
123 acc1 = 0;
124
125 /* Initialize state pointer */
126 px0 = pState;
127
128 px1 = pState + S->M;
129
130
131 /* Initialize coeff pointer */
132 pb = pCoeffs;
133
134 /* Loop unrolling. Process 4 taps at a time. */
135 tapCnt = numTaps >> 2;
136
137 /* Loop over the number of taps. Unroll by a factor of 4.
138 ** Repeat until we've computed numTaps-4 coefficients. */
139 while(tapCnt > 0u)
140 {
141 /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */
142 c0 = *__SIMD32(pb)++;
143
144 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
145 x0 = *__SIMD32(px0)++;
146
147 x1 = *__SIMD32(px1)++;
148
149 /* Perform the multiply-accumulate */
150 acc0 = __SMLAD(x0, c0, acc0);
151
152 acc1 = __SMLAD(x1, c0, acc1);
153
154 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
155 c0 = *__SIMD32(pb)++;
156
157 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
158 x0 = *__SIMD32(px0)++;
159
160 x1 = *__SIMD32(px1)++;
161
162 /* Perform the multiply-accumulate */
163 acc0 = __SMLAD(x0, c0, acc0);
164
165 acc1 = __SMLAD(x1, c0, acc1);
166
167 /* Decrement the loop counter */
168 tapCnt--;
169 }
170
171 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
172 tapCnt = numTaps % 0x4u;
173
174 while(tapCnt > 0u)
175 {
176 /* Read coefficients */
177 c0 = *pb++;
178
179 /* Fetch 1 state variable */
180 x0 = *px0++;
181
182 x1 = *px1++;
183
184 /* Perform the multiply-accumulate */
185 acc0 = __SMLAD(x0, c0, acc0);
186 acc1 = __SMLAD(x1, c0, acc1);
187
188 /* Decrement the loop counter */
189 tapCnt--;
190 }
191
192 /* Advance the state pointer by the decimation factor
193 * to process the next group of decimation factor number samples */
194 pState = pState + S->M * 2;
195
196 /* Store filter output, smlad returns the values in 2.14 format */
197 /* so downsacle by 15 to get output in 1.15 */
198 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
199 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
200
201 /* Decrement the loop counter */
202 blkCnt--;
203 }
204
205
206
207 while(blkCntN3 > 0u)
208 {
209 /* Copy decimation factor number of new input samples into the state buffer */
210 i = S->M;
211
212 do
213 {
214 *pStateCurnt++ = *pSrc++;
215
216 } while(--i);
217
218 /*Set sum to zero */
219 sum0 = 0;
220
221 /* Initialize state pointer */
222 px = pState;
223
224 /* Initialize coeff pointer */
225 pb = pCoeffs;
226
227 /* Loop unrolling. Process 4 taps at a time. */
228 tapCnt = numTaps >> 2;
229
230 /* Loop over the number of taps. Unroll by a factor of 4.
231 ** Repeat until we've computed numTaps-4 coefficients. */
232 while(tapCnt > 0u)
233 {
234 /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */
235 c0 = *__SIMD32(pb)++;
236
237 /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
238 x0 = *__SIMD32(px)++;
239
240 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
241 c1 = *__SIMD32(pb)++;
242
243 /* Perform the multiply-accumulate */
244 sum0 = __SMLAD(x0, c0, sum0);
245
246 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
247 x0 = *__SIMD32(px)++;
248
249 /* Perform the multiply-accumulate */
250 sum0 = __SMLAD(x0, c1, sum0);
251
252 /* Decrement the loop counter */
253 tapCnt--;
254 }
255
256 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
257 tapCnt = numTaps % 0x4u;
258
259 while(tapCnt > 0u)
260 {
261 /* Read coefficients */
262 c0 = *pb++;
263
264 /* Fetch 1 state variable */
265 x0 = *px++;
266
267 /* Perform the multiply-accumulate */
268 sum0 = __SMLAD(x0, c0, sum0);
269
270 /* Decrement the loop counter */
271 tapCnt--;
272 }
273
274 /* Advance the state pointer by the decimation factor
275 * to process the next group of decimation factor number samples */
276 pState = pState + S->M;
277
278 /* Store filter output, smlad returns the values in 2.14 format */
279 /* so downsacle by 15 to get output in 1.15 */
280 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
281
282 /* Decrement the loop counter */
283 blkCntN3--;
284 }
285
286 /* Processing is complete.
287 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
288 ** This prepares the state buffer for the next function call. */
289
290 /* Points to the start of the state buffer */
291 pStateCurnt = S->pState;
292
293 i = (numTaps - 1u) >> 2u;
294
295 /* copy data */
296 while(i > 0u)
297 {
298 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
299 *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
300
301 /* Decrement the loop counter */
302 i--;
303 }
304
305 i = (numTaps - 1u) % 0x04u;
306
307 /* copy data */
308 while(i > 0u)
309 {
310 *pStateCurnt++ = *pState++;
311
312 /* Decrement the loop counter */
313 i--;
314 }
315 }
316
317 #else
318
319
320 void arm_fir_decimate_fast_q15(
321 const arm_fir_decimate_instance_q15 * S,
322 q15_t * pSrc,
323 q15_t * pDst,
324 uint32_t blockSize)
325 {
326 q15_t *pState = S->pState; /* State pointer */
327 q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
328 q15_t *pStateCurnt; /* Points to the current sample of the state */
329 q15_t *px; /* Temporary pointer for state buffer */
330 q15_t *pb; /* Temporary pointer coefficient buffer */
331 q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
332 q31_t sum0; /* Accumulators */
333 q31_t acc0, acc1;
334 q15_t *px0, *px1;
335 uint32_t blkCntN3;
336 uint32_t numTaps = S->numTaps; /* Number of taps */
337 uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
338
339
340 /* S->pState buffer contains previous frame (numTaps - 1) samples */
341 /* pStateCurnt points to the location where the new input data should be written */
342 pStateCurnt = S->pState + (numTaps - 1u);
343
344
345 /* Total number of output samples to be computed */
346 blkCnt = outBlockSize / 2;
347 blkCntN3 = outBlockSize - (2 * blkCnt);
348
349 while(blkCnt > 0u)
350 {
351 /* Copy decimation factor number of new input samples into the state buffer */
352 i = 2 * S->M;
353
354 do
355 {
356 *pStateCurnt++ = *pSrc++;
357
358 } while(--i);
359
360 /* Set accumulator to zero */
361 acc0 = 0;
362 acc1 = 0;
363
364 /* Initialize state pointer */
365 px0 = pState;
366
367 px1 = pState + S->M;
368
369
370 /* Initialize coeff pointer */
371 pb = pCoeffs;
372
373 /* Loop unrolling. Process 4 taps at a time. */
374 tapCnt = numTaps >> 2;
375
376 /* Loop over the number of taps. Unroll by a factor of 4.
377 ** Repeat until we've computed numTaps-4 coefficients. */
378 while(tapCnt > 0u)
379 {
380 /* Read the Read b[numTaps-1] coefficients */
381 c0 = *pb++;
382
383 /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
384 x0 = *px0++;
385 x1 = *px1++;
386
387 /* Perform the multiply-accumulate */
388 acc0 += x0 * c0;
389 acc1 += x1 * c0;
390
391 /* Read the b[numTaps-2] coefficient */
392 c0 = *pb++;
393
394 /* Read x[n-numTaps-2] for sample 0 and sample 1 */
395 x0 = *px0++;
396 x1 = *px1++;
397
398 /* Perform the multiply-accumulate */
399 acc0 += x0 * c0;
400 acc1 += x1 * c0;
401
402 /* Read the b[numTaps-3] coefficients */
403 c0 = *pb++;
404
405 /* Read x[n-numTaps-3] for sample 0 and sample 1 */
406 x0 = *px0++;
407 x1 = *px1++;
408
409 /* Perform the multiply-accumulate */
410 acc0 += x0 * c0;
411 acc1 += x1 * c0;
412
413 /* Read the b[numTaps-4] coefficient */
414 c0 = *pb++;
415
416 /* Read x[n-numTaps-4] for sample 0 and sample 1 */
417 x0 = *px0++;
418 x1 = *px1++;
419
420 /* Perform the multiply-accumulate */
421 acc0 += x0 * c0;
422 acc1 += x1 * c0;
423
424 /* Decrement the loop counter */
425 tapCnt--;
426 }
427
428 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
429 tapCnt = numTaps % 0x4u;
430
431 while(tapCnt > 0u)
432 {
433 /* Read coefficients */
434 c0 = *pb++;
435
436 /* Fetch 1 state variable */
437 x0 = *px0++;
438 x1 = *px1++;
439
440 /* Perform the multiply-accumulate */
441 acc0 += x0 * c0;
442 acc1 += x1 * c0;
443
444 /* Decrement the loop counter */
445 tapCnt--;
446 }
447
448 /* Advance the state pointer by the decimation factor
449 * to process the next group of decimation factor number samples */
450 pState = pState + S->M * 2;
451
452 /* Store filter output, smlad returns the values in 2.14 format */
453 /* so downsacle by 15 to get output in 1.15 */
454
455 *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
456 *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
457
458
459 /* Decrement the loop counter */
460 blkCnt--;
461 }
462
463 while(blkCntN3 > 0u)
464 {
465 /* Copy decimation factor number of new input samples into the state buffer */
466 i = S->M;
467
468 do
469 {
470 *pStateCurnt++ = *pSrc++;
471
472 } while(--i);
473
474 /*Set sum to zero */
475 sum0 = 0;
476
477 /* Initialize state pointer */
478 px = pState;
479
480 /* Initialize coeff pointer */
481 pb = pCoeffs;
482
483 /* Loop unrolling. Process 4 taps at a time. */
484 tapCnt = numTaps >> 2;
485
486 /* Loop over the number of taps. Unroll by a factor of 4.
487 ** Repeat until we've computed numTaps-4 coefficients. */
488 while(tapCnt > 0u)
489 {
490 /* Read the Read b[numTaps-1] coefficients */
491 c0 = *pb++;
492
493 /* Read x[n-numTaps-1] and sample */
494 x0 = *px++;
495
496 /* Perform the multiply-accumulate */
497 sum0 += x0 * c0;
498
499 /* Read the b[numTaps-2] coefficient */
500 c0 = *pb++;
501
502 /* Read x[n-numTaps-2] and sample */
503 x0 = *px++;
504
505 /* Perform the multiply-accumulate */
506 sum0 += x0 * c0;
507
508 /* Read the b[numTaps-3] coefficients */
509 c0 = *pb++;
510
511 /* Read x[n-numTaps-3] sample */
512 x0 = *px++;
513
514 /* Perform the multiply-accumulate */
515 sum0 += x0 * c0;
516
517 /* Read the b[numTaps-4] coefficient */
518 c0 = *pb++;
519
520 /* Read x[n-numTaps-4] sample */
521 x0 = *px++;
522
523 /* Perform the multiply-accumulate */
524 sum0 += x0 * c0;
525
526 /* Decrement the loop counter */
527 tapCnt--;
528 }
529
530 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
531 tapCnt = numTaps % 0x4u;
532
533 while(tapCnt > 0u)
534 {
535 /* Read coefficients */
536 c0 = *pb++;
537
538 /* Fetch 1 state variable */
539 x0 = *px++;
540
541 /* Perform the multiply-accumulate */
542 sum0 += x0 * c0;
543
544 /* Decrement the loop counter */
545 tapCnt--;
546 }
547
548 /* Advance the state pointer by the decimation factor
549 * to process the next group of decimation factor number samples */
550 pState = pState + S->M;
551
552 /* Store filter output, smlad returns the values in 2.14 format */
553 /* so downsacle by 15 to get output in 1.15 */
554 *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
555
556 /* Decrement the loop counter */
557 blkCntN3--;
558 }
559
560 /* Processing is complete.
561 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
562 ** This prepares the state buffer for the next function call. */
563
564 /* Points to the start of the state buffer */
565 pStateCurnt = S->pState;
566
567 i = (numTaps - 1u) >> 2u;
568
569 /* copy data */
570 while(i > 0u)
571 {
572 *pStateCurnt++ = *pState++;
573 *pStateCurnt++ = *pState++;
574 *pStateCurnt++ = *pState++;
575 *pStateCurnt++ = *pState++;
576
577 /* Decrement the loop counter */
578 i--;
579 }
580
581 i = (numTaps - 1u) % 0x04u;
582
583 /* copy data */
584 while(i > 0u)
585 {
586 *pStateCurnt++ = *pState++;
587
588 /* Decrement the loop counter */
589 i--;
590 }
591 }
592
593
594 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
595
596 /**
597 * @} end of FIR_decimate group
598 */
Imprint / Impressum