1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_decimate_fast_q15.c
10 * Description: Fast Q15 FIR Decimator.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
48 * @addtogroup FIR_decimate
53 * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
54 * @param[in] *S points to an instance of the Q15 FIR decimator structure.
55 * @param[in] *pSrc points to the block of input data.
56 * @param[out] *pDst points to the block of output data
57 * @param[in] blockSize number of input samples to process per call.
61 * If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.
62 * In this case the input, output, and state buffers should be 32-bit aligned.
64 * <b>Scaling and Overflow Behavior:</b>
66 * This fast version uses a 32-bit accumulator with 2.30 format.
67 * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
68 * Thus, if the accumulator result overflows it wraps around and distorts the result.
69 * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
70 * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
73 * Refer to the function <code>arm_fir_decimate_q15()</code> for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
74 * Both the slow and the fast versions use the same instance structure.
75 * Use the function <code>arm_fir_decimate_init_q15()</code> to initialize the filter structure.
78 #ifndef UNALIGNED_SUPPORT_DISABLE
80 void arm_fir_decimate_fast_q15(
81 const arm_fir_decimate_instance_q15
* S
,
86 q15_t
*pState
= S
->pState
; /* State pointer */
87 q15_t
*pCoeffs
= S
->pCoeffs
; /* Coefficient pointer */
88 q15_t
*pStateCurnt
; /* Points to the current sample of the state */
89 q15_t
*px
; /* Temporary pointer for state buffer */
90 q15_t
*pb
; /* Temporary pointer coefficient buffer */
91 q31_t x0
, x1
, c0
, c1
; /* Temporary variables to hold state and coefficient values */
92 q31_t sum0
; /* Accumulators */
96 uint32_t numTaps
= S
->numTaps
; /* Number of taps */
97 uint32_t i
, blkCnt
, tapCnt
, outBlockSize
= blockSize
/ S
->M
; /* Loop counters */
100 /* S->pState buffer contains previous frame (numTaps - 1) samples */
101 /* pStateCurnt points to the location where the new input data should be written */
102 pStateCurnt
= S
->pState
+ (numTaps
- 1u);
105 /* Total number of output samples to be computed */
106 blkCnt
= outBlockSize
/ 2;
107 blkCntN3
= outBlockSize
- (2 * blkCnt
);
112 /* Copy decimation factor number of new input samples into the state buffer */
117 *pStateCurnt
++ = *pSrc
++;
121 /* Set accumulator to zero */
125 /* Initialize state pointer */
131 /* Initialize coeff pointer */
134 /* Loop unrolling. Process 4 taps at a time. */
135 tapCnt
= numTaps
>> 2;
137 /* Loop over the number of taps. Unroll by a factor of 4.
138 ** Repeat until we've computed numTaps-4 coefficients. */
141 /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
142 c0
= *__SIMD32(pb
)++;
144 /* Read x[n-numTaps-1] and x[n-numTaps-2] samples */
145 x0
= *__SIMD32(px0
)++;
147 x1
= *__SIMD32(px1
)++;
149 /* Perform the multiply-accumulate */
150 acc0
= __SMLAD(x0
, c0
, acc0
);
152 acc1
= __SMLAD(x1
, c0
, acc1
);
154 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
155 c0
= *__SIMD32(pb
)++;
157 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
158 x0
= *__SIMD32(px0
)++;
160 x1
= *__SIMD32(px1
)++;
162 /* Perform the multiply-accumulate */
163 acc0
= __SMLAD(x0
, c0
, acc0
);
165 acc1
= __SMLAD(x1
, c0
, acc1
);
167 /* Decrement the loop counter */
171 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
172 tapCnt
= numTaps
% 0x4u
;
176 /* Read coefficients */
179 /* Fetch 1 state variable */
184 /* Perform the multiply-accumulate */
185 acc0
= __SMLAD(x0
, c0
, acc0
);
186 acc1
= __SMLAD(x1
, c0
, acc1
);
188 /* Decrement the loop counter */
192 /* Advance the state pointer by the decimation factor
193 * to process the next group of decimation factor number samples */
194 pState
= pState
+ S
->M
* 2;
196 /* Store filter output, smlad returns the values in 2.14 format */
197 /* so downscale by 15 to get output in 1.15 */
198 *pDst
++ = (q15_t
) (__SSAT((acc0
>> 15), 16));
199 *pDst
++ = (q15_t
) (__SSAT((acc1
>> 15), 16));
201 /* Decrement the loop counter */
209 /* Copy decimation factor number of new input samples into the state buffer */
214 *pStateCurnt
++ = *pSrc
++;
221 /* Initialize state pointer */
224 /* Initialize coeff pointer */
227 /* Loop unrolling. Process 4 taps at a time. */
228 tapCnt
= numTaps
>> 2;
230 /* Loop over the number of taps. Unroll by a factor of 4.
231 ** Repeat until we've computed numTaps-4 coefficients. */
234 /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
235 c0
= *__SIMD32(pb
)++;
237 /* Read x[n-numTaps-1] and x[n-numTaps-2] samples */
238 x0
= *__SIMD32(px
)++;
240 /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
241 c1
= *__SIMD32(pb
)++;
243 /* Perform the multiply-accumulate */
244 sum0
= __SMLAD(x0
, c0
, sum0
);
246 /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
247 x0
= *__SIMD32(px
)++;
249 /* Perform the multiply-accumulate */
250 sum0
= __SMLAD(x0
, c1
, sum0
);
252 /* Decrement the loop counter */
256 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
257 tapCnt
= numTaps
% 0x4u
;
261 /* Read coefficients */
264 /* Fetch 1 state variable */
267 /* Perform the multiply-accumulate */
268 sum0
= __SMLAD(x0
, c0
, sum0
);
270 /* Decrement the loop counter */
274 /* Advance the state pointer by the decimation factor
275 * to process the next group of decimation factor number samples */
276 pState
= pState
+ S
->M
;
278 /* Store filter output, smlad returns the values in 2.14 format */
279 /* so downscale by 15 to get output in 1.15 */
280 *pDst
++ = (q15_t
) (__SSAT((sum0
>> 15), 16));
282 /* Decrement the loop counter */
286 /* Processing is complete.
287 ** Now copy the last numTaps - 1 samples to the start of the state buffer.
288 ** This prepares the state buffer for the next function call. */
290 /* Points to the start of the state buffer */
291 pStateCurnt
= S
->pState
;
293 i
= (numTaps
- 1u) >> 2u;
298 *__SIMD32(pStateCurnt
)++ = *__SIMD32(pState
)++;
299 *__SIMD32(pStateCurnt
)++ = *__SIMD32(pState
)++;
301 /* Decrement the loop counter */
305 i
= (numTaps
- 1u) % 0x04u
;
310 *pStateCurnt
++ = *pState
++;
312 /* Decrement the loop counter */
320 void arm_fir_decimate_fast_q15(
321 const arm_fir_decimate_instance_q15
* S
,
326 q15_t
*pState
= S
->pState
; /* State pointer */
327 q15_t
*pCoeffs
= S
->pCoeffs
; /* Coefficient pointer */
328 q15_t
*pStateCurnt
; /* Points to the current sample of the state */
329 q15_t
*px
; /* Temporary pointer for state buffer */
330 q15_t
*pb
; /* Temporary pointer coefficient buffer */
331 q15_t x0
, x1
, c0
; /* Temporary variables to hold state and coefficient values */
332 q31_t sum0
; /* Accumulators */
336 uint32_t numTaps
= S
->numTaps
; /* Number of taps */
337 uint32_t i
, blkCnt
, tapCnt
, outBlockSize
= blockSize
/ S
->M
; /* Loop counters */
340 /* S->pState buffer contains previous frame (numTaps - 1) samples */
341 /* pStateCurnt points to the location where the new input data should be written */
342 pStateCurnt
= S
->pState
+ (numTaps
- 1u);
345 /* Total number of output samples to be computed */
346 blkCnt
= outBlockSize
/ 2;
347 blkCntN3
= outBlockSize
- (2 * blkCnt
);
351 /* Copy decimation factor number of new input samples into the state buffer */
356 *pStateCurnt
++ = *pSrc
++;
360 /* Set accumulator to zero */
364 /* Initialize state pointer */
370 /* Initialize coeff pointer */
373 /* Loop unrolling. Process 4 taps at a time. */
374 tapCnt
= numTaps
>> 2;
376 /* Loop over the number of taps. Unroll by a factor of 4.
377 ** Repeat until we've computed numTaps-4 coefficients. */
380 /* Read the b[numTaps-1] coefficient */
383 /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
387 /* Perform the multiply-accumulate */
391 /* Read the b[numTaps-2] coefficient */
394 /* Read x[n-numTaps-2] for sample 0 and sample 1 */
398 /* Perform the multiply-accumulate */
402 /* Read the b[numTaps-3] coefficients */
405 /* Read x[n-numTaps-3] for sample 0 and sample 1 */
409 /* Perform the multiply-accumulate */
413 /* Read the b[numTaps-4] coefficient */
416 /* Read x[n-numTaps-4] for sample 0 and sample 1 */
420 /* Perform the multiply-accumulate */
424 /* Decrement the loop counter */
428 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
429 tapCnt
= numTaps
% 0x4u
;
433 /* Read coefficients */
436 /* Fetch 1 state variable */
440 /* Perform the multiply-accumulate */
444 /* Decrement the loop counter */
448 /* Advance the state pointer by the decimation factor
449 * to process the next group of decimation factor number samples */
450 pState
= pState
+ S
->M
* 2;
452 /* Store filter output, smlad returns the values in 2.14 format */
453 /* so downscale by 15 to get output in 1.15 */
455 *pDst
++ = (q15_t
) (__SSAT((acc0
>> 15), 16));
456 *pDst
++ = (q15_t
) (__SSAT((acc1
>> 15), 16));
459 /* Decrement the loop counter */
465 /* Copy decimation factor number of new input samples into the state buffer */
470 *pStateCurnt
++ = *pSrc
++;
477 /* Initialize state pointer */
480 /* Initialize coeff pointer */
483 /* Loop unrolling. Process 4 taps at a time. */
484 tapCnt
= numTaps
>> 2;
486 /* Loop over the number of taps. Unroll by a factor of 4.
487 ** Repeat until we've computed numTaps-4 coefficients. */
490 /* Read the b[numTaps-1] coefficient */
493 /* Read x[n-numTaps-1] sample */
496 /* Perform the multiply-accumulate */
499 /* Read the b[numTaps-2] coefficient */
502 /* Read x[n-numTaps-2] sample */
505 /* Perform the multiply-accumulate */
508 /* Read the b[numTaps-3] coefficients */
511 /* Read x[n-numTaps-3] sample */
514 /* Perform the multiply-accumulate */
517 /* Read the b[numTaps-4] coefficient */
520 /* Read x[n-numTaps-4] sample */
523 /* Perform the multiply-accumulate */
526 /* Decrement the loop counter */
530 /* If the filter length is not a multiple of 4, compute the remaining filter taps */
531 tapCnt
= numTaps
% 0x4u
;
535 /* Read coefficients */
538 /* Fetch 1 state variable */
541 /* Perform the multiply-accumulate */
544 /* Decrement the loop counter */
548 /* Advance the state pointer by the decimation factor
549 * to process the next group of decimation factor number samples */
550 pState
= pState
+ S
->M
;
552 /* Store filter output, smlad returns the values in 2.14 format */
553 /* so downscale by 15 to get output in 1.15 */
554 *pDst
++ = (q15_t
) (__SSAT((sum0
>> 15), 16));
556 /* Decrement the loop counter */
560 /* Processing is complete.
561 ** Now copy the last numTaps - 1 samples to the start of the state buffer.
562 ** This prepares the state buffer for the next function call. */
564 /* Points to the start of the state buffer */
565 pStateCurnt
= S
->pState
;
567 i
= (numTaps
- 1u) >> 2u;
572 *pStateCurnt
++ = *pState
++;
573 *pStateCurnt
++ = *pState
++;
574 *pStateCurnt
++ = *pState
++;
575 *pStateCurnt
++ = *pState
++;
577 /* Decrement the loop counter */
581 i
= (numTaps
- 1u) % 0x04u
;
586 *pStateCurnt
++ = *pState
++;
588 /* Decrement the loop counter */
594 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
597 * @} end of FIR_decimate group