1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
4 * $Date: 17. January 2013
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_sparse_q7.c
10 * Description: Q7 sparse FIR filter processing function.
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * ------------------------------------------------------------------- */
44 * @ingroup groupFilters
48 * @addtogroup FIR_Sparse
54 * @brief Processing function for the Q7 sparse FIR filter.
55 * @param[in] *S points to an instance of the Q7 sparse FIR structure.
56 * @param[in] *pSrc points to the block of input data.
57 * @param[out] *pDst points to the block of output data.
58 * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
59 * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
60 * @param[in] blockSize number of input samples to process per call.
63 * <b>Scaling and Overflow Behavior:</b>
65 * The function is implemented using a 32-bit internal accumulator.
66 * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
67 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
68 * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
69 * The accumulator is then converted to 18.7 format by discarding the low 7 bits.
70 * Finally, the result is saturated to 1.7 format.
73 void arm_fir_sparse_q7(
74 arm_fir_sparse_instance_q7
* S
,
82 q7_t
*pState
= S
->pState
; /* State pointer */
83 q7_t
*pCoeffs
= S
->pCoeffs
; /* Coefficient pointer */
84 q7_t
*px
; /* Scratch buffer pointer */
85 q7_t
*py
= pState
; /* Temporary pointers for state buffer */
86 q7_t
*pb
= pScratchIn
; /* Temporary pointers for scratch buffer */
87 q7_t
*pOut
= pDst
; /* Destination pointer */
88 int32_t *pTapDelay
= S
->pTapDelay
; /* Pointer to the array containing offset of the non-zero tap values. */
89 uint32_t delaySize
= S
->maxDelay
+ blockSize
; /* state length */
90 uint16_t numTaps
= S
->numTaps
; /* Filter order */
91 int32_t readIndex
; /* Read index of the state buffer */
92 uint32_t tapCnt
, blkCnt
; /* loop counters */
93 q7_t coeff
= *pCoeffs
++; /* Read the coefficient value */
94 q31_t
*pScr2
= pScratchOut
; /* Working pointer for scratch buffer of output values */
98 #ifndef ARM_MATH_CM0_FAMILY
100 /* Run the below code for Cortex-M4 and Cortex-M3 */
102 q7_t in1
, in2
, in3
, in4
;
104 /* blockSize input samples are copied into the state buffer */
105 /* StateIndex points to the starting position to write in the state buffer */
106 arm_circularWrite_q7(py
, (int32_t) delaySize
, &S
->stateIndex
, 1, pSrc
, 1,
109 /* Loop over the number of taps. */
112 /* Read Index, from where the state buffer should be read, is calculated. */
113 readIndex
= ((int32_t) S
->stateIndex
- (int32_t) blockSize
) - *pTapDelay
++;
115 /* Wraparound of readIndex */
118 readIndex
+= (int32_t) delaySize
;
121 /* Working pointer for state buffer is updated */
124 /* blockSize samples are read from the state buffer */
125 arm_circularRead_q7(py
, (int32_t) delaySize
, &readIndex
, 1, pb
, pb
,
126 (int32_t) blockSize
, 1, blockSize
);
128 /* Working pointer for the scratch buffer of state values */
131 /* Working pointer for scratch buffer of output values */
134 /* Loop over the blockSize. Unroll by a factor of 4.
135 * Compute 4 multiplications at a time. */
136 blkCnt
= blockSize
>> 2;
140 /* Perform multiplication and store in the scratch buffer */
141 *pScratchOut
++ = ((q31_t
) * px
++ * coeff
);
142 *pScratchOut
++ = ((q31_t
) * px
++ * coeff
);
143 *pScratchOut
++ = ((q31_t
) * px
++ * coeff
);
144 *pScratchOut
++ = ((q31_t
) * px
++ * coeff
);
146 /* Decrement the loop counter */
150 /* If the blockSize is not a multiple of 4,
151 * compute the remaining samples */
152 blkCnt
= blockSize
% 0x4u
;
156 /* Perform multiplication and store in the scratch buffer */
157 *pScratchOut
++ = ((q31_t
) * px
++ * coeff
);
159 /* Decrement the loop counter */
163 /* Load the coefficient value and
164 * increment the coefficient buffer for the next set of state values */
167 /* Read Index, from where the state buffer should be read, is calculated. */
168 readIndex
= ((int32_t) S
->stateIndex
- (int32_t) blockSize
) - *pTapDelay
++;
170 /* Wraparound of readIndex */
173 readIndex
+= (int32_t) delaySize
;
176 /* Loop over the number of taps. */
177 tapCnt
= (uint32_t) numTaps
- 1u;
181 /* Working pointer for state buffer is updated */
184 /* blockSize samples are read from the state buffer */
185 arm_circularRead_q7(py
, (int32_t) delaySize
, &readIndex
, 1, pb
, pb
,
186 (int32_t) blockSize
, 1, blockSize
);
188 /* Working pointer for the scratch buffer of state values */
191 /* Working pointer for scratch buffer of output values */
194 /* Loop over the blockSize. Unroll by a factor of 4.
195 * Compute 4 MACS at a time. */
196 blkCnt
= blockSize
>> 2;
200 /* Perform Multiply-Accumulate */
201 in
= *pScratchOut
+ ((q31_t
) * px
++ * coeff
);
203 in
= *pScratchOut
+ ((q31_t
) * px
++ * coeff
);
205 in
= *pScratchOut
+ ((q31_t
) * px
++ * coeff
);
207 in
= *pScratchOut
+ ((q31_t
) * px
++ * coeff
);
210 /* Decrement the loop counter */
214 /* If the blockSize is not a multiple of 4,
215 * compute the remaining samples */
216 blkCnt
= blockSize
% 0x4u
;
220 /* Perform Multiply-Accumulate */
221 in
= *pScratchOut
+ ((q31_t
) * px
++ * coeff
);
224 /* Decrement the loop counter */
228 /* Load the coefficient value and
229 * increment the coefficient buffer for the next set of state values */
232 /* Read Index, from where the state buffer should be read, is calculated. */
233 readIndex
= ((int32_t) S
->stateIndex
-
234 (int32_t) blockSize
) - *pTapDelay
++;
236 /* Wraparound of readIndex */
239 readIndex
+= (int32_t) delaySize
;
242 /* Decrement the tap loop counter */
246 /* All the output values are in pScratchOut buffer.
247 Convert them into 1.7 format, saturate and store in the destination buffer. */
248 /* Loop over the blockSize. */
249 blkCnt
= blockSize
>> 2;
253 in1
= (q7_t
) __SSAT(*pScr2
++ >> 7, 8);
254 in2
= (q7_t
) __SSAT(*pScr2
++ >> 7, 8);
255 in3
= (q7_t
) __SSAT(*pScr2
++ >> 7, 8);
256 in4
= (q7_t
) __SSAT(*pScr2
++ >> 7, 8);
258 *__SIMD32(pOut
)++ = __PACKq7(in1
, in2
, in3
, in4
);
260 /* Decrement the blockSize loop counter */
264 /* If the blockSize is not a multiple of 4,
265 remaining samples are processed in the below loop */
266 blkCnt
= blockSize
% 0x4u
;
270 *pOut
++ = (q7_t
) __SSAT(*pScr2
++ >> 7, 8);
272 /* Decrement the blockSize loop counter */
278 /* Run the below code for Cortex-M0 */
280 /* blockSize input samples are copied into the state buffer */
281 /* StateIndex points to the starting position to write in the state buffer */
282 arm_circularWrite_q7(py
, (int32_t) delaySize
, &S
->stateIndex
, 1, pSrc
, 1,
285 /* Loop over the number of taps. */
288 /* Read Index, from where the state buffer should be read, is calculated. */
289 readIndex
= ((int32_t) S
->stateIndex
- (int32_t) blockSize
) - *pTapDelay
++;
291 /* Wraparound of readIndex */
294 readIndex
+= (int32_t) delaySize
;
297 /* Working pointer for state buffer is updated */
300 /* blockSize samples are read from the state buffer */
301 arm_circularRead_q7(py
, (int32_t) delaySize
, &readIndex
, 1, pb
, pb
,
302 (int32_t) blockSize
, 1, blockSize
);
304 /* Working pointer for the scratch buffer of state values */
307 /* Working pointer for scratch buffer of output values */
310 /* Loop over the blockSize */
315 /* Perform multiplication and store in the scratch buffer */
316 *pScratchOut
++ = ((q31_t
) * px
++ * coeff
);
318 /* Decrement the loop counter */
322 /* Load the coefficient value and
323 * increment the coefficient buffer for the next set of state values */
326 /* Read Index, from where the state buffer should be read, is calculated. */
327 readIndex
= ((int32_t) S
->stateIndex
- (int32_t) blockSize
) - *pTapDelay
++;
329 /* Wraparound of readIndex */
332 readIndex
+= (int32_t) delaySize
;
335 /* Loop over the number of taps. */
336 tapCnt
= (uint32_t) numTaps
- 1u;
340 /* Working pointer for state buffer is updated */
343 /* blockSize samples are read from the state buffer */
344 arm_circularRead_q7(py
, (int32_t) delaySize
, &readIndex
, 1, pb
, pb
,
345 (int32_t) blockSize
, 1, blockSize
);
347 /* Working pointer for the scratch buffer of state values */
350 /* Working pointer for scratch buffer of output values */
353 /* Loop over the blockSize */
358 /* Perform Multiply-Accumulate */
359 in
= *pScratchOut
+ ((q31_t
) * px
++ * coeff
);
362 /* Decrement the loop counter */
366 /* Load the coefficient value and
367 * increment the coefficient buffer for the next set of state values */
370 /* Read Index, from where the state buffer should be read, is calculated. */
372 ((int32_t) S
->stateIndex
- (int32_t) blockSize
) - *pTapDelay
++;
374 /* Wraparound of readIndex */
377 readIndex
+= (int32_t) delaySize
;
380 /* Decrement the tap loop counter */
384 /* All the output values are in pScratchOut buffer.
385 Convert them into 1.7 format, saturate and store in the destination buffer. */
386 /* Loop over the blockSize. */
391 *pOut
++ = (q7_t
) __SSAT(*pScr2
++ >> 7, 8);
393 /* Decrement the blockSize loop counter */
397 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
402 * @} end of FIR_Sparse group