optimize mul_8c / mul_84
[Chiptunes.git] / foo.S
1 /* REGISTER NAMES */
2 #define zero r16 // cleared once in main and afterwards only read as a constant 0 operand
3 #define acc r17 // output sample; written to OCR0AL at the start of `sample`
4 #define i0 r18 // i0..i3 form one 32-bit counter (i0 = least significant byte),
5 #define i1 r19 //   incremented by one at the end of every `sample`
6 #define i2 r20
7 #define i3 r21
8 #define n r22 // rebuilt in every `sample` from i2 (shifted left by 2) and the top two bits of i1
9 #define s r23 // rebuilt in every `sample` from bit 0 of i3 (as bit 7) and i2 shifted right by 1
10 #define _ r24 // scratch register; blocks give it a local name (`tmp`, `one`, `a2`)
11 ; r25
12 #define x r26 //==Xlo==Mh
13 #define t r27 //==Xhi==Ml
14 ; r28
15 ; r29
16 ; r30 Zlo
17 ; r31 Zhi
18 ; aliases:
19 #define Xlo r26
20 #define Xhi r27
21 #define Mh r26 //mod3 vars
22 #define Ml r27 // -"-
23
24 /* I/O REGISTERS */
; Addresses used as OUT/SBI/CBI operands; the notes describe how this file uses each one.
25 OCR0AL = 0x26 ; receives acc at the start of every sample
26 DDRB = 0x01 ; port B data direction (main: PORTB0 pwm, PORTB2 debug)
27 PORTB = 0x02 ; bit 2 is set and cleared around `sample` to measure its runtime
28 PUEB = 0x03 ; pull-up enable; zeroed in main
29 SPL = 0x3D ; stack pointer, low byte
30 SPH = 0x3E ; stack pointer, high byte
31 CCP = 0x3C ; written by main right before the protected registers
32 CLKPSR = 0x36 ; clock prescaler
33 WDTCSR = 0x31 ; watchdog control
34 SMCR = 0x3A ; sleep mode control
35 TCCR0A = 0x2E ; timer/counter0 control A
36 TCCR0B = 0x2D ; timer/counter0 control B
37 TIMSK0 = 0x2B ; timer/counter0 interrupt mask
38 TIFR0 = 0x2A ; timer/counter0 interrupt flags
39
40 .section .data
; Multiplier table read by g: two rows of eight bytes.
; g indexes it with (t AND 7) and adds 8 (second row) when (i2 AND 3) is non-zero.
; Every value that occurs here has its own path in the decision tree inside g.
41 data:
42 .byte 0x84, 0x9d, 0xb0, 0x69, 0x9d, 0x84, 0x69, 0x58
43 .byte 0x75, 0x8c, 0xb0, 0x69, 0x8c, 0x75, 0x69, 0x58
44
45 .section .text
; Interrupt vector table: each used slot holds a single RJMP to its handler.
46 .org 0x0000 ; RESET interrupt
47 RJMP main
48 .org 0x0008 ; TIM0_OVF interrupt
49 RJMP sample
50
; mod3(Mh:Ml) -> Ml   (Ml is the same register as t)
;
; Reduces the 16-bit value Mh:Ml modulo 3 by repeatedly folding the value
; onto itself: 256, 16 and 4 are all congruent to 1 modulo 3, so adding the
; high part of a number to its low part keeps the remainder unchanged.
;
; In:       Mh = high byte, Ml = low byte, zero = 0
; Out:      Ml = (Mh:Ml) mod 3, in the range 0..2
; Clobbers: Mh, tmp (`_`), SREG
;
; Fix: the final correction used BRPL, which skipped the subtraction for
; Ml >= 3 and performed it for Ml < 3 (yielding 3..5 or 0xFD..0xFF).
; BRLO skips the subtraction only when Ml is already below 3.
mod3:
#define tmp _
	ADD	Ml, Mh		; byte fold: Ml = low 8 bits of Mh + Ml
	CLR	Mh		; CLR leaves the carry flag untouched
	ADC	Mh, zero	; Mh = carry of the addition (bit 8 of the sum)
	MOV	tmp, Ml
	SWAP	tmp
	ANDI	tmp, 0x0f	; tmp = bits 4..7 of the sum
	SWAP	Mh		; move the carry from bit 0 to bit 4
	OR	tmp, Mh		; tmp = bits 4..8 of the sum
	ANDI	Ml, 0x0f
	ADD	Ml, tmp		; nibble fold: Ml is now at most 45
	MOV	tmp, Ml
	LSR	tmp
	LSR	tmp
	ANDI	Ml, 0x03
	ADD	Ml, tmp		; bit-pair fold: Ml is now at most 13
	MOV	tmp, Ml
	LSR	tmp
	LSR	tmp
	ANDI	Ml, 0x03
	ADD	Ml, tmp		; bit-pair fold again: Ml is now at most 5
	CPI	Ml, 3
	BRLO	skip		; Ml below 3: already fully reduced
	SUBI	Ml, 3		; Ml in 3..5 becomes 0..2
skip:
	RET
#undef tmp
79
80 ; definitions to make the mul-tree in g readable:
; a1/a2 are bound while the macros are defined, so the macro bodies refer to
; x (low accumulator byte) and `_` (high accumulator byte); t holds the multiplier.
81 #define a1 x
82 #define a2 _
83 .macro always _bit ; nop; for when a test() is not necessary (see tree)
84 .endm
85 .macro never _bit ; nop; for when a test() is not necessary (see tree)
86 .endm
87 .macro test _bit,_jmpto ; jump to _jmpto when bit _bit of t is set
88 SBRC t, \_bit
89 RJMP \_jmpto
90 .endm
91 .macro i_test _bit,_jmpto ; inverted test (for reordered 0x8_): jump when the bit is clear
92 SBRS t, \_bit
93 RJMP \_jmpto
94 .endm
95 .macro shift16 ; shift the 16-bit accumulator a2:a1 right by one
96 LSR a2
97 ROR a1
98 .endm
99 .macro shift8 ; top three bits do not need to be correct, so save cycles by not carrying
100 LSR a1
101 .endm
102 .macro shift0 ; nop; last shift is common
103 .endm
104 .macro add16 ; a2:a1 += i1:i0
105 ADD a1, i0
106 ADC a2, i1
107 .endm
108 .macro add8 ; ditto with carrying: only the low byte is added
109 ADD a1, i0
110 .endm
111 #undef a2
112 #undef a1
113
114 g: ; g(i, t) -> t
; Looks up a multiplier in data[] and multiplies the counter bytes i1:i0 by it
; with an unrolled shift-and-add decision tree.
; In:       t = table index (only bits 0..2 are used), i0..i2 = counter bytes, zero = 0
; Out:      t = low accumulator byte after eight shift steps; per the shift8
;           macro note, the top three bits of that byte are not exact
; Clobbers: `_` (tmp / a2), x (Xlo / a1), SREG
115 #define tmp _
116 ANDI t, 0x07 ; keep the index inside one table row
117 MOV tmp, i2
118 ANDI tmp, 3
119 CPSE tmp, zero ; skip the next instruction when (i2 AND 3) is zero
120 SUBI t, -8 ; otherwise add 8: select the second row of data[]
121 #undef tmp
122
123 ;TODO: check correctness!
124 #define tmp _
125 MOV tmp, t ; NOTE: must move value away from `t`, as that is also hi(X)
126 LDI Xhi, hi8(data) ; hi(data) always zero, but still need to clear the register
127 LDI Xlo, lo8(data)
128 ADD Xlo, tmp ;<-- the offset (formerly `t`) into data[]
129 ;ADC Xhi, zero ; data == 0x40 t <= 0x10, so can never overflow
130 LD tmp, X ; tmp = data[offset]
131 MOV t, tmp ; t now holds the multiplier tested by the tree below
132 #undef tmp
133
134 #define a1 x
135 #define a2 _
136 #define a0 t
137 CLR a2 ; accumulator a2:a1 = 0
138 CLR a1
139
140 /* decision tree multiplication saves cycles and (hopefully) reduces code size
141 _xxx?
142 / \
143 _xx?0 _xx1?
144 | |
145 _x?00 _x?01
146 / \ / \
147 _?000 _?100 _?001 _?101
148 / \ / \ | / \
149 _0000 _1000 _0100 _1100 _1001 _0101 _1101
150 | | | | | | |
151 ... ... ... ... ... ... ...
152 | | | | | | |
153 B0 58 84 8C 69 75 9D */
; Multiplier bits are handled from bit 0 upwards: a set bit adds i to the
; accumulator, then the accumulator is shifted right once. Only bits that
; differ between table values are tested (test / i_test); `always` and
; `never` are empty markers for bits that are fixed on the current path.
; The label names spell the low multiplier bits decided so far.
154 test 0, m____1
155 m____0: shift16
156 never 1
157 m___00: shift16
158 test 2, m__100
159 m__000: shift16
160 test 3, m_1000
161 m_0000: shift16
162 always 4
163 add16 $ shift16
164 always 5
165 add8 $ shift8
166 never 6
167 shift8
168 always 7
169 add8 $ shift0
170 RJMP end_mul ; calc'd 0xb0
171
172 m_1000: add16 $ shift16
173 always 4
174 add16 $ shift16
175 never 5
176 shift8
177 always 6
178 add8 $ shift8
179 never 7
180 shift0
181 RJMP end_mul ; calc'd 0x58
182
183 m__100: add16 $ shift16
184 i_test 3, m_0100 ; bit 3 clear: skip the extra add below
185 m_1100: add16
186 m_0100: shift16 ; shared tail of the 0x8c and 0x84 paths
187 never 4
188 shift16
189 never 5
190 shift8
191 never 6
192 shift8
193 always 7
194 add8 $ shift0
195 RJMP end_mul ; calc'd 0x8c / 0x84
196
197 m____1: add16 $ shift16
198 never 1
199 m___01: shift16
200 test 2, m__101
201 m__001: shift16
202 always 3
203 m_1001: add16 $ shift16
204 never 4
205 shift16
206 always 5
207 add8 $ shift8
208 always 6
209 add8 $ shift8
210 never 7
211 shift0
212 RJMP end_mul ; calc'd 0x69
213
214 m__101: add16 $ shift16
215 test 3, m_1101
216 m_0101: shift16
217 always 4
218 add16 $ shift16
219 always 5
220 add8 $ shift8
221 always 6
222 add8 $ shift8
223 never 7
224 shift0
225 RJMP end_mul ; calc'd 0x75
226
227 m_1101: add16 $ shift16
228 always 4
229 add16 $ shift16
230 never 5
231 shift8
232 never 6
233 shift8
234 always 7
235 add8 $ shift0
236 ; calc'd 0x9d
; last path falls through into end_mul
237
238 end_mul:
239 LSR a1 ;final shift is a common operation for all
240
241 MOV t, a1 ;;TODO: use a1 in main() directly
242 #undef a0
243 #undef a1
244 #undef a2
245 RET ; TODO: replace CALL/RET with IJMP?
246
; main: reset entry point (target of the RESET vector), followed by the idle loop.
;
; Clears the working registers, initialises the stack pointer, port B, the
; clock prescaler and timer/counter0, produces the first (dummy) sample and
; then sleeps between timer overflow interrupts.
;
; Fixes relative to the previous version:
;  * `RJMP sample` jumped into the interrupt handler with an empty stack, so
;    the RETI at its end popped a return address from above RAMEND. The
;    handler is now entered with RCALL, which pushes the address of `loop`.
;    SEI is no longer executed first: RETI sets the global interrupt flag,
;    and leaving interrupts off until then keeps a timer overflow from
;    nesting into the first pass.
;  * SLEEP only enters a sleep mode when the sleep enable bit of SMCR is set.
;    SMCR is now written with 1 (enable bit set, mode bits zero).
;    NOTE(review): the bit layout is taken from the ATtiny4/5/9/10 datasheet;
;    confirm it matches the actual target device.
main: ; setup routine
	CLR	zero
	CLR	i0
	CLR	i1
	CLR	i2
	CLR	i3
	CLR	acc		; we output a dummy sample before the actual first one

#define one _
	LDI	one, 1
	LDI	x, 0x5f		; RAMEND
	OUT	SPL, x		; init stack ptr
	OUT	SPH, zero	; (high byte)
	OUT	PUEB, zero	; disable pullups
	LDI	x, 0x05
	OUT	DDRB, x		; PORTB0:pwm, PORTB2:debug
	LDI	x, 0xd8
	OUT	CCP, x		; change protected ioregs
	OUT	CLKPSR, one	; clock prescaler 1/2 (4Mhz)
	OUT	WDTCSR, zero	; turn off watchdog ;;TODO: incomplete - see datasheet pg48
	OUT	SMCR, one	; enable SLEEP; mode bits stay 0 (idle, faster response time)

	;set timer/counter0 to 8bit fastpwm, non-inverting, no prescaler
	LDI	x, 0x81
	OUT	TCCR0A, x
	LDI	x, 0x09
	OUT	TCCR0B, x
	OUT	TIMSK0, one	; enable tim0_ovf
	OUT	TIFR0, one	; writing 1 clears an already pending tim0_ovf flag
#undef one
	RCALL	sample		; dummy sample; RETI returns to loop and enables interrupts

loop:
	SLEEP			; wait for interrupt
	RJMP	loop
283
284 sample:
; TIM0_OVF handler (target of the vector at 0x0008): writes the sample that
; was computed on the previous pass, then computes the next one from the
; counter i3:i2:i1:i0 as the sum of four voices and increments the counter.
; No register and no SREG is saved; see the TODO below.
285 ; potential TODO: softcounter in r25 to only update duty cycle every n iterations
286 ; potential TODO: save/restore status register (SREG=0x3f) (only if something in mainloop)
287
288 OUT OCR0AL, acc ; start by outputting a sample, because routine has variable runtime
289 SBI PORTB, 2 ; to measure runtime
290
; n = low byte of (i shifted right by 14)
291 MOV n, i2
292 LSL n
293 LSL n
294 #define tmp _
295 MOV tmp, i1
296 SWAP tmp
297 ANDI tmp, 0x0f
298 LSR tmp
299 LSR tmp
300 OR n, tmp
301 #undef tmp
; s = low byte of (i shifted right by 17)
302 MOV s, i3
303 LSR s
304 ROR s ; bit 0 of i3 arrives in bit 7 via the carry
305 ANDI s, 0x80
306 #define tmp _
307 MOV tmp, i2
308 LSR tmp
309 OR s, tmp
310 #undef tmp
311
312 ; voice 1:
313 MOV t, n
314 RCALL g
315 SWAP t
316 ANDI t, 1 ; keep bit 4 of the value returned by g
317 MOV acc, t
318
319 ; voice 2:
; t = low byte of (i shifted right by 13), then XOR n
320 #define tmp _
321 MOV tmp, i2
322 LSL tmp
323 LSL tmp
324 LSL tmp
325 MOV t, i1
326 SWAP t
327 ANDI t, 0xf
328 LSR t
329 OR t, tmp
330 #undef tmp
331 EOR t, n
332 RCALL g
333 LSR t
334 LSR t
335 ANDI t, 3 ; keep bits 2..3 of the value returned by g
336 AND t, s
337 ADD acc, t
338
339 ; voice 3:
; Mh:Ml = low 16 bits of (i shifted right by 11)
340 MOV Ml, i2
341 SWAP Ml
342 ANDI Ml, 0xf0
343 LSL Ml
344 #define tmp _
345 MOV tmp, i1
346 LSR tmp
347 LSR tmp
348 LSR tmp
349 OR Ml, tmp
350 #undef tmp
351 MOV Mh, i3
352 SWAP Mh
353 ANDI Mh, 0xf0
354 LSL Mh
355 #define tmp _
356 MOV tmp, i2
357 LSR tmp
358 LSR tmp
359 LSR tmp
360 OR Mh, tmp
361 #undef tmp
362 RCALL mod3 ; result comes back in t (same register as Ml)
363 ADD t, n
364 RCALL g
365 LSR t
366 LSR t
367 ANDI t, 3 ; keep bits 2..3 of the value returned by g
368 MOV x, s
369 INC x
; tmp = x * (1/4 + 1/16 + 1/64 + 1/256), i.e. roughly (s + 1) / 3;
; each ROR pulls the carry of the preceding ADD back in as bit 7
370 #define tmp _
371 MOV tmp, x
372 LSR tmp
373 LSR tmp
374 ADD tmp, x
375 ROR tmp
376 LSR tmp
377 ADD tmp, x
378 ROR tmp
379 LSR tmp
380 ADD tmp, x
381 ROR tmp
382 LSR tmp
383 AND t, tmp
384 #undef tmp
385 ADD acc, t
386
387 ; voice 4:
; Mh:Ml = low 16 bits of (i shifted right by 10)
388 MOV Ml, i2
389 SWAP Ml
390 ANDI Ml, 0xf0
391 LSL Ml
392 LSL Ml
393 #define tmp _
394 MOV tmp, i1
395 LSR tmp
396 LSR tmp
397 OR Ml, tmp
398 #undef tmp
399 MOV Mh, i3
400 SWAP Mh
401 ANDI Mh, 0xf0
402 LSL Mh
403 LSL Mh
404 #define tmp _
405 MOV tmp, i2
406 LSR tmp
407 LSR tmp
408 OR Mh, tmp
409 #undef tmp
410 RCALL mod3 ; result comes back in t (same register as Ml)
411 SUB t, n
412 NEG t ; t = n - (value returned by mod3)
413 SUBI t, -8 ; t += 8
414 RCALL g
415 LSR t
416 ANDI t, 3 ; keep bits 1..2 of the value returned by g
417 INC s ; s is rebuilt at the start of the next pass
; tmp = s * 51/256, i.e. roughly (old s + 1) / 5
418 #define tmp _
419 MOV tmp, s
420 LSR tmp
421 ADD tmp, s
422 ROR tmp
423 LSR tmp
424 LSR tmp
425 ADD tmp, s
426 ROR tmp
427 ADD tmp, s
428 ROR tmp
429 LSR tmp
430 LSR tmp
431 AND t, tmp
432 #undef tmp
433 ADD acc, t
434
435 SWAP acc ; acc<<4, to be passed to OCR0AL
436
; 32-bit increment of i: subtracting -1 with borrow adds 1 with carry
437 SUBI i0, -1
438 SBCI i1, -1
439 SBCI i2, -1
440 SBCI i3, -1
441
442 CBI PORTB, 2 ; end runtime measurement
443 ;TODO: to reduce jitter: clear pending tim0_ovf (TIFR0[TOV0] <- 1) ?
444 RETI ; reenables interrupts
Imprint / Impressum