1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_fast_q31.c
10 * Description: Processing function for the Q31 Fast FIR filter.
12 * Target Processor: Cortex-M4/Cortex-M3
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
44 * @ingroup groupFilters
53 * @param[in] *S points to an instance of the Q31 structure.
54 * @param[in] *pSrc points to the block of input data.
55 * @param[out] *pDst points to the block of output data.
56 * @param[in] blockSize number of samples to process per call.
59 * <b>Scaling and Overflow Behavior:</b>
62 * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
63 * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
64 * These intermediate results are added to a 2.30 accumulator.
65 * Finally, the accumulator is saturated and converted to a 1.31 result.
66 * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
67 * In order to avoid overflows completely, the input signal must be scaled down by log2(numTaps) bits.
70 * Refer to the function <code>arm_fir_q31()</code> for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. Both the slow and the fast versions use the same instance structure.
71 * Use the function <code>arm_fir_init_q31()</code> to initialize the filter structure.
/*
 * arm_fir_fast_q31 - processing function for the Q31 fast FIR filter.
 *
 * What the visible code does:
 *  - reads pState, pCoeffs and numTaps from the instance S;
 *  - appends input samples from pSrc to the state buffer, starting at
 *    S->pState[numTaps - 1];
 *  - accumulates products with multAcc_32x32_keep32_R() into acc0..acc3
 *    (four outputs per pass) or into acc0 alone (leftover outputs);
 *  - stores each accumulator to pDst after a left shift by one bit;
 *  - finally copies (numTaps - 1) samples back to the start of the state
 *    buffer for the next call.
 *
 * NOTE(review): this listing appears to be missing lines (the rest of the
 * parameter list, braces, loop headers, coefficient/state reads and counter
 * updates). The comments below describe only the statements that are
 * visible; confirm against the complete file before relying on them.
 */
74 IAR_ONLY_LOW_OPTIMIZATION_ENTER
75 void arm_fir_fast_q31(
76 const arm_fir_instance_q31
* S
,
81 q31_t
*pState
= S
->pState
; /* Start of the filter state buffer (S->pState) */
82 q31_t
*pCoeffs
= S
->pCoeffs
; /* Start of the coefficient array (S->pCoeffs) */
83 q31_t
*pStateCurnt
; /* Write position for new input samples in the state buffer */
84 q31_t x0
, x1
, x2
, x3
; /* State samples used as multiply-accumulate operands */
85 q31_t c0
; /* Coefficient operand shared by all four accumulators */
86 q31_t
*px
; /* Temporary pointer for state */
87 q31_t
*pb
; /* Temporary pointer for coefficient buffer */
88 q31_t acc0
, acc1
, acc2
, acc3
; /* Accumulators, one per output sample in a group of four */
89 uint32_t numTaps
= S
->numTaps
; /* Number of filter coefficients in the filter */
90 uint32_t i
, tapCnt
, blkCnt
; /* Loop counters */
92 /* S->pState points to a buffer that holds (numTaps - 1) samples from the previous frame */
93 /* pStateCurnt points to the location where the new input data should be written, i.e. index (numTaps - 1) */
94 pStateCurnt
= &(S
->pState
[(numTaps
- 1u)]);
96 /* Apply loop unrolling and compute 4 output values simultaneously.
97 * The variables acc0 ... acc3 hold output values that are being computed:
99 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
100 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
101 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
102 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
104 blkCnt
= blockSize
>> 2;
106 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
107 ** a second loop below computes the remaining 1 to 3 samples. */
110 /* Copy four new input samples from pSrc into the state buffer, advancing both pointers */
111 *pStateCurnt
++ = *pSrc
++;
112 *pStateCurnt
++ = *pSrc
++;
113 *pStateCurnt
++ = *pSrc
++;
114 *pStateCurnt
++ = *pSrc
++;
116 /* Set all accumulators to zero */
122 /* Initialize state pointer */
125 /* Initialize coefficient pointer */
128 /* Read the first three samples from the state buffer:
129 * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
134 /* Loop unrolling. Process 4 taps at a time: tapCnt = numTaps / 4. */
135 tapCnt
= numTaps
>> 2;
140 /* Read the b[numTaps] coefficient */
143 /* Read x[n-numTaps-3] sample */
146 /* acc0 += b[numTaps] * x[n-numTaps] */
147 multAcc_32x32_keep32_R(acc0
, x0
, c0
);
149 /* acc1 += b[numTaps] * x[n-numTaps-1] */
150 multAcc_32x32_keep32_R(acc1
, x1
, c0
);
152 /* acc2 += b[numTaps] * x[n-numTaps-2] */
153 multAcc_32x32_keep32_R(acc2
, x2
, c0
);
155 /* acc3 += b[numTaps] * x[n-numTaps-3] */
156 multAcc_32x32_keep32_R(acc3
, x3
, c0
);
158 /* Read the b[numTaps-1] coefficient */
161 /* Read x[n-numTaps-4] sample */
164 /* Perform the multiply-accumulates with the state operands rotated by one: x1, x2, x3, x0 */
165 multAcc_32x32_keep32_R(acc0
, x1
, c0
);
166 multAcc_32x32_keep32_R(acc1
, x2
, c0
);
167 multAcc_32x32_keep32_R(acc2
, x3
, c0
);
168 multAcc_32x32_keep32_R(acc3
, x0
, c0
);
170 /* Read the b[numTaps-2] coefficient */
173 /* Read x[n-numTaps-5] sample */
176 /* Perform the multiply-accumulates with the state operands rotated by two: x2, x3, x0, x1 */
177 multAcc_32x32_keep32_R(acc0
, x2
, c0
);
178 multAcc_32x32_keep32_R(acc1
, x3
, c0
);
179 multAcc_32x32_keep32_R(acc2
, x0
, c0
);
180 multAcc_32x32_keep32_R(acc3
, x1
, c0
);
182 /* Read the b[numTaps-3] coefficients */
185 /* Read x[n-numTaps-6] sample */
188 /* Perform the multiply-accumulates with the state operands rotated by three: x3, x0, x1, x2 */
189 multAcc_32x32_keep32_R(acc0
, x3
, c0
);
190 multAcc_32x32_keep32_R(acc1
, x0
, c0
);
191 multAcc_32x32_keep32_R(acc2
, x1
, c0
);
192 multAcc_32x32_keep32_R(acc3
, x2
, c0
);
196 /* If the filter length is not a multiple of 4, compute the remaining filter taps; i = numTaps - 4 * tapCnt */
198 i
= numTaps
- (tapCnt
* 4u);
201 /* Read coefficients */
204 /* Fetch 1 state variable */
207 /* Perform the multiply-accumulates for one leftover tap: same coefficient, operands x0..x3 */
208 multAcc_32x32_keep32_R(acc0
, x0
, c0
);
209 multAcc_32x32_keep32_R(acc1
, x1
, c0
);
210 multAcc_32x32_keep32_R(acc2
, x2
, c0
);
211 multAcc_32x32_keep32_R(acc3
, x3
, c0
);
213 /* Reuse the present sample states for next sample */
218 /* Decrement the loop counter */
222 /* Advance the state pointer by 4 to process the next group of 4 samples */
225 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31 with a left shift by one bit.
226 ** Then store the 4 outputs in the destination buffer.
** NOTE(review): the conversion is a plain shift; no saturation step is visible here. */
227 *pDst
++ = (q31_t
) (acc0
<< 1);
228 *pDst
++ = (q31_t
) (acc1
<< 1);
229 *pDst
++ = (q31_t
) (acc2
<< 1);
230 *pDst
++ = (q31_t
) (acc3
<< 1);
232 /* Decrement the samples loop counter */
237 /* If the blockSize is not a multiple of 4, compute any remaining output samples here
238 ** (blkCnt = blockSize % 4). No loop unrolling is used. */
239 blkCnt
= blockSize
% 4u;
243 /* Copy one sample at a time into state buffer */
244 *pStateCurnt
++ = *pSrc
++;
246 /* Set the accumulator to zero */
249 /* Initialize state pointer */
252 /* Initialize Coefficient pointer */
257 /* Multiply the state sample at px by the coefficient at pb, accumulate into acc0, and advance both pointers */
260 multAcc_32x32_keep32_R(acc0
, (*px
++), (*(pb
++)));
264 /* The result is in 2.30 format. Convert to 1.31 with a left shift by one bit (no saturation step is visible here).
265 ** Then store the output in the destination buffer. */
266 *pDst
++ = (q31_t
) (acc0
<< 1);
268 /* Advance state pointer by 1 for the next sample */
271 /* Decrement the samples loop counter */
275 /* Processing is complete.
276 ** Now copy the last numTaps - 1 samples to the start of the state buffer.
277 ** This prepares the state buffer for the next function call. */
279 /* Point pStateCurnt back at the start of the state buffer; it is the copy destination */
280 pStateCurnt
= S
->pState
;
282 /* Number of samples to carry over: numTaps - 1 */
283 tapCnt
= (numTaps
- 1u);
285 /* Copy one q31_t sample from pState to pStateCurnt and advance both pointers */
288 *pStateCurnt
++ = *pState
++;
290 /* Decrement the loop counter */
296 IAR_ONLY_LOW_OPTIMIZATION_EXIT
298 * @} end of FIR group