]> git.gir.st - tmk_keyboard.git/blob - tool/mbed/mbed-sdk/libraries/dsp/cmsis_dsp/FilteringFunctions/arm_fir_sparse_q7.c
Squashed 'tmk_core/' changes from 7967731..b9e0ea0
[tmk_keyboard.git] / tool / mbed / mbed-sdk / libraries / dsp / cmsis_dsp / FilteringFunctions / arm_fir_sparse_q7.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.
3 *
4 * $Date: 17. January 2013
5 * $Revision: V1.4.1
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_sparse_q7.c
9 *
10 * Description: Q7 sparse FIR filter processing function.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * ------------------------------------------------------------------- */
40 #include "arm_math.h"
41
42
43 /**
44 * @ingroup groupFilters
45 */
46
47 /**
48 * @addtogroup FIR_Sparse
49 * @{
50 */
51
52
53 /**
54 * @brief Processing function for the Q7 sparse FIR filter.
55 * @param[in] *S points to an instance of the Q7 sparse FIR structure.
56 * @param[in] *pSrc points to the block of input data.
57 * @param[out] *pDst points to the block of output data
58 * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
59 * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
60 * @param[in] blockSize number of input samples to process per call.
61 * @return none.
62 *
63 * <b>Scaling and Overflow Behavior:</b>
64 * \par
65 * The function is implemented using a 32-bit internal accumulator.
66 * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
67 * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
68 * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
69 * The accumulator is then converted to 18.7 format by discarding the low 7 bits.
70 * Finally, the result is truncated to 1.7 format.
71 */
72
73 void arm_fir_sparse_q7(
74 arm_fir_sparse_instance_q7 * S,
75 q7_t * pSrc,
76 q7_t * pDst,
77 q7_t * pScratchIn,
78 q31_t * pScratchOut,
79 uint32_t blockSize)
80 {
81
82 q7_t *pState = S->pState; /* State pointer */
83 q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
84 q7_t *px; /* Scratch buffer pointer */
85 q7_t *py = pState; /* Temporary pointers for state buffer */
86 q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
87 q7_t *pOut = pDst; /* Destination pointer */
88 int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
89 uint32_t delaySize = S->maxDelay + blockSize; /* state length */
90 uint16_t numTaps = S->numTaps; /* Filter order */
91 int32_t readIndex; /* Read index of the state buffer */
92 uint32_t tapCnt, blkCnt; /* loop counters */
93 q7_t coeff = *pCoeffs++; /* Read the coefficient value */
94 q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
95 q31_t in;
96
97
98 #ifndef ARM_MATH_CM0_FAMILY
99
100 /* Run the below code for Cortex-M4 and Cortex-M3 */
101
102 q7_t in1, in2, in3, in4;
103
104 /* BlockSize of Input samples are copied into the state buffer */
105 /* StateIndex points to the starting position to write in the state buffer */
106 arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
107 blockSize);
108
109 /* Loop over the number of taps. */
110 tapCnt = numTaps;
111
112 /* Read Index, from where the state buffer should be read, is calculated. */
113 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
114
115 /* Wraparound of readIndex */
116 if(readIndex < 0)
117 {
118 readIndex += (int32_t) delaySize;
119 }
120
121 /* Working pointer for state buffer is updated */
122 py = pState;
123
124 /* blockSize samples are read from the state buffer */
125 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
126 (int32_t) blockSize, 1, blockSize);
127
128 /* Working pointer for the scratch buffer of state values */
129 px = pb;
130
131 /* Working pointer for scratch buffer of output values */
132 pScratchOut = pScr2;
133
134 /* Loop over the blockSize. Unroll by a factor of 4.
135 * Compute 4 multiplications at a time. */
136 blkCnt = blockSize >> 2;
137
138 while(blkCnt > 0u)
139 {
140 /* Perform multiplication and store in the scratch buffer */
141 *pScratchOut++ = ((q31_t) * px++ * coeff);
142 *pScratchOut++ = ((q31_t) * px++ * coeff);
143 *pScratchOut++ = ((q31_t) * px++ * coeff);
144 *pScratchOut++ = ((q31_t) * px++ * coeff);
145
146 /* Decrement the loop counter */
147 blkCnt--;
148 }
149
150 /* If the blockSize is not a multiple of 4,
151 * compute the remaining samples */
152 blkCnt = blockSize % 0x4u;
153
154 while(blkCnt > 0u)
155 {
156 /* Perform multiplication and store in the scratch buffer */
157 *pScratchOut++ = ((q31_t) * px++ * coeff);
158
159 /* Decrement the loop counter */
160 blkCnt--;
161 }
162
163 /* Load the coefficient value and
164 * increment the coefficient buffer for the next set of state values */
165 coeff = *pCoeffs++;
166
167 /* Read Index, from where the state buffer should be read, is calculated. */
168 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
169
170 /* Wraparound of readIndex */
171 if(readIndex < 0)
172 {
173 readIndex += (int32_t) delaySize;
174 }
175
176 /* Loop over the number of taps. */
177 tapCnt = (uint32_t) numTaps - 1u;
178
179 while(tapCnt > 0u)
180 {
181 /* Working pointer for state buffer is updated */
182 py = pState;
183
184 /* blockSize samples are read from the state buffer */
185 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
186 (int32_t) blockSize, 1, blockSize);
187
188 /* Working pointer for the scratch buffer of state values */
189 px = pb;
190
191 /* Working pointer for scratch buffer of output values */
192 pScratchOut = pScr2;
193
194 /* Loop over the blockSize. Unroll by a factor of 4.
195 * Compute 4 MACS at a time. */
196 blkCnt = blockSize >> 2;
197
198 while(blkCnt > 0u)
199 {
200 /* Perform Multiply-Accumulate */
201 in = *pScratchOut + ((q31_t) * px++ * coeff);
202 *pScratchOut++ = in;
203 in = *pScratchOut + ((q31_t) * px++ * coeff);
204 *pScratchOut++ = in;
205 in = *pScratchOut + ((q31_t) * px++ * coeff);
206 *pScratchOut++ = in;
207 in = *pScratchOut + ((q31_t) * px++ * coeff);
208 *pScratchOut++ = in;
209
210 /* Decrement the loop counter */
211 blkCnt--;
212 }
213
214 /* If the blockSize is not a multiple of 4,
215 * compute the remaining samples */
216 blkCnt = blockSize % 0x4u;
217
218 while(blkCnt > 0u)
219 {
220 /* Perform Multiply-Accumulate */
221 in = *pScratchOut + ((q31_t) * px++ * coeff);
222 *pScratchOut++ = in;
223
224 /* Decrement the loop counter */
225 blkCnt--;
226 }
227
228 /* Load the coefficient value and
229 * increment the coefficient buffer for the next set of state values */
230 coeff = *pCoeffs++;
231
232 /* Read Index, from where the state buffer should be read, is calculated. */
233 readIndex = ((int32_t) S->stateIndex -
234 (int32_t) blockSize) - *pTapDelay++;
235
236 /* Wraparound of readIndex */
237 if(readIndex < 0)
238 {
239 readIndex += (int32_t) delaySize;
240 }
241
242 /* Decrement the tap loop counter */
243 tapCnt--;
244 }
245
246 /* All the output values are in pScratchOut buffer.
247 Convert them into 1.15 format, saturate and store in the destination buffer. */
248 /* Loop over the blockSize. */
249 blkCnt = blockSize >> 2;
250
251 while(blkCnt > 0u)
252 {
253 in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
254 in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
255 in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
256 in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
257
258 *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4);
259
260 /* Decrement the blockSize loop counter */
261 blkCnt--;
262 }
263
264 /* If the blockSize is not a multiple of 4,
265 remaining samples are processed in the below loop */
266 blkCnt = blockSize % 0x4u;
267
268 while(blkCnt > 0u)
269 {
270 *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
271
272 /* Decrement the blockSize loop counter */
273 blkCnt--;
274 }
275
276 #else
277
278 /* Run the below code for Cortex-M0 */
279
280 /* BlockSize of Input samples are copied into the state buffer */
281 /* StateIndex points to the starting position to write in the state buffer */
282 arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
283 blockSize);
284
285 /* Loop over the number of taps. */
286 tapCnt = numTaps;
287
288 /* Read Index, from where the state buffer should be read, is calculated. */
289 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
290
291 /* Wraparound of readIndex */
292 if(readIndex < 0)
293 {
294 readIndex += (int32_t) delaySize;
295 }
296
297 /* Working pointer for state buffer is updated */
298 py = pState;
299
300 /* blockSize samples are read from the state buffer */
301 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
302 (int32_t) blockSize, 1, blockSize);
303
304 /* Working pointer for the scratch buffer of state values */
305 px = pb;
306
307 /* Working pointer for scratch buffer of output values */
308 pScratchOut = pScr2;
309
310 /* Loop over the blockSize */
311 blkCnt = blockSize;
312
313 while(blkCnt > 0u)
314 {
315 /* Perform multiplication and store in the scratch buffer */
316 *pScratchOut++ = ((q31_t) * px++ * coeff);
317
318 /* Decrement the loop counter */
319 blkCnt--;
320 }
321
322 /* Load the coefficient value and
323 * increment the coefficient buffer for the next set of state values */
324 coeff = *pCoeffs++;
325
326 /* Read Index, from where the state buffer should be read, is calculated. */
327 readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
328
329 /* Wraparound of readIndex */
330 if(readIndex < 0)
331 {
332 readIndex += (int32_t) delaySize;
333 }
334
335 /* Loop over the number of taps. */
336 tapCnt = (uint32_t) numTaps - 1u;
337
338 while(tapCnt > 0u)
339 {
340 /* Working pointer for state buffer is updated */
341 py = pState;
342
343 /* blockSize samples are read from the state buffer */
344 arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
345 (int32_t) blockSize, 1, blockSize);
346
347 /* Working pointer for the scratch buffer of state values */
348 px = pb;
349
350 /* Working pointer for scratch buffer of output values */
351 pScratchOut = pScr2;
352
353 /* Loop over the blockSize */
354 blkCnt = blockSize;
355
356 while(blkCnt > 0u)
357 {
358 /* Perform Multiply-Accumulate */
359 in = *pScratchOut + ((q31_t) * px++ * coeff);
360 *pScratchOut++ = in;
361
362 /* Decrement the loop counter */
363 blkCnt--;
364 }
365
366 /* Load the coefficient value and
367 * increment the coefficient buffer for the next set of state values */
368 coeff = *pCoeffs++;
369
370 /* Read Index, from where the state buffer should be read, is calculated. */
371 readIndex =
372 ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
373
374 /* Wraparound of readIndex */
375 if(readIndex < 0)
376 {
377 readIndex += (int32_t) delaySize;
378 }
379
380 /* Decrement the tap loop counter */
381 tapCnt--;
382 }
383
384 /* All the output values are in pScratchOut buffer.
385 Convert them into 1.15 format, saturate and store in the destination buffer. */
386 /* Loop over the blockSize. */
387 blkCnt = blockSize;
388
389 while(blkCnt > 0u)
390 {
391 *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
392
393 /* Decrement the blockSize loop counter */
394 blkCnt--;
395 }
396
397 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
398
399 }
400
401 /**
402 * @} end of FIR_Sparse group
403 */
Imprint / Impressum