add note about decision tree multiplication
[Chiptunes.git] / foo.S
CommitLineData
/* REGISTER NAMES */
/* Symbolic names for the working registers. They are plain preprocessor
   macros, so every occurrence of the short name below is replaced by the
   register on the right before the assembler sees the file. */
#define acc  r16    // sample being accumulated; written to OCR0AL by sample
#define i0   r17    // 32-bit sample counter i, byte 0 (least significant)
#define i1   r18    // sample counter i, byte 1
#define i2   r19    // sample counter i, byte 2
#define i3   r20    // sample counter i, byte 3 (most significant)
#define n    r21    // note index shared by all voices, recomputed in sample
#define s    r22    // mask value built from i2/i3, recomputed in sample
#define t    r23    // argument/result of g and result of mod3; ==Ml
#define x    r24    // general scratch; ==a1==Mh
#define _    r25    // general scratch; ==a2
#define Xlo  r26    // X pointer low byte, set per lookup in g
#define Xhi  r27    // X pointer high byte, loaded once in main
#define one  r28    // holds the constant 1 (loaded in main)
; r29 has no symbolic name
; r30 Zlo
; r31 Zhi
; aliases:
#define Ml   t      // mod3 operand, low byte (also the mod3 result)
#define Mh   x      // mod3 operand, high byte
#define a1   x      // multiplication accumulator in g, low byte
#define a2   _      // multiplication accumulator in g, high byte
da32ed67 23
f180febe
TG
/* I/O REGISTERS */
/* Absolute I/O addresses used with OUT/SBI/CBI below.
   NOTE(review): values look like the ATtiny4/5/9/10 register map --
   confirm against the datasheet of the actual target. */

/* port B */
DDRB    = 0x01      // data direction (main: PB0 = pwm, PB2 = debug)
PORTB   = 0x02      // output latch (PB2 toggled when DEBUG is defined)
PUEB    = 0x03      // pull-up enable (cleared in main)

/* timer/counter 0 */
OCR0AL  = 0x26      // receives one sample per sample interrupt
TIFR0   = 0x2A      // interrupt flags (overflow flag cleared in sample)
TIMSK0  = 0x2B      // interrupt mask (overflow interrupt enabled in main)
TCCR0B  = 0x2D      // control register B
TCCR0A  = 0x2E      // control register A

/* system control */
WDTCSR  = 0x31      // watchdog control (written with zero in main)
CLKPSR  = 0x36      // clock prescaler
OSCCAL  = 0x39      // oscillator calibration
SMCR    = 0x3A      // sleep mode control
CCP     = 0x3C      // unlock signature for protected registers
SPL     = 0x3D      // stack pointer, low byte
SPH     = 0x3E      // stack pointer, high byte

/* memory layout */
RAMEND  = 0x5F      // initial stack pointer value
FLASHM  = 0x4000    // added to flash labels to address them through X
4466dd8b 42
.section .text

; Interrupt vector table. Only the RESET and TIM0_OVF entries are used,
; so the space between them holds the first instructions of the reset
; path: the four instructions below end right where the TIM0_OVF entry
; begins (byte address 0x0008).
.org 0x0000                     ; RESET interrupt
    CLR     i0                  ; sample counter starts at zero
    CLR     i1                  ; (i3 is cleared at the top of main)
    CLR     i2
    RJMP    main
.org 0x0008                     ; TIM0_OVF interrupt
    RJMP    sample
4466dd8b 51
f99fd6f3
TG
; Multiplication coefficients looked up by g, two rows of eight bytes.
; g indexes this table with the low three bits of its argument and adds 8
; (second row) whenever the low two bits of i2 are not both zero.
; Only the seven distinct values B0 58 84 8C 69 75 9D occur; the decision
; tree in g is specialised for exactly these.
notes:
    .byte   0x84, 0x9d, 0xb0, 0x69, 0x9d, 0x84, 0x69, 0x58  ; row 0: (i2 & 3) == 0
    .byte   0x75, 0x8c, 0xb0, 0x69, 0x8c, 0x75, 0x69, 0x58  ; row 1: (i2 & 3) != 0
55
mod3: ; mod3(Mh.Ml) -> t
    ; Reduces the 16-bit value Mh:Ml modulo 3. The remainder (0..2) is
    ; returned in Ml, which is the same register as t.
    ; Method: 256, 16 and 4 all leave remainder 1 when divided by 3, so
    ; summing the base-256, base-16 and base-4 digits keeps the remainder
    ; while shrinking the value.
    ; Clobbers: Mh, the scratch register _ and the status flags.
    #define fold _
    ADD     Ml, Mh              ; sum of both bytes ...
    CLR     Mh                  ; (CLR leaves the carry flag untouched)
    ADC     Mh, Mh              ; ... with the carry kept as bit 8 in Mh

    MOV     fold, Ml
    SWAP    fold
    ANDI    fold, 0x0f          ; fold = bits 7..4 of the sum
    SWAP    Mh                  ; carry bit moves from bit 0 to bit 4
    OR      fold, Mh            ; fold = bits 8..4 of the sum
    ANDI    Ml, 0x0f
    ADD     Ml, fold            ; low nibble + upper part, at most 46

    MOV     fold, Ml
    LSR     fold
    LSR     fold                ; fold = value / 4
    ANDI    Ml, 0x03
    ADD     Ml, fold            ; base-4 fold, at most 14

    MOV     fold, Ml
    LSR     fold
    LSR     fold
    ANDI    Ml, 0x03
    ADD     Ml, fold            ; second base-4 fold, at most 5

    CPI     Ml, 3
    BRLO    mod3_done           ; already 0..2 (BRLO is the same opcode as BRCS)
    SUBI    Ml, 3               ; 3..5 becomes 0..2
  mod3_done:
    RET
    #undef fold
f180febe
TG
84
; Helper macros that make the multiplication tree in g readable.
; Each coefficient bit in the tree is written as one marker (always, never,
; test or i_test) followed by the matching add and shift steps.

; always / never: markers for a bit whose value is already known at that
; point of the tree. They emit no code (see tree).
.macro always _bit
.endm
.macro never _bit
.endm

; test: jump to _jmpto when bit _bit of t is set, fall through otherwise.
.macro test _bit,_jmpto
    SBRC    t, \_bit
    RJMP    \_jmpto
.endm

; i_test: inverted test (for reordered 0x8_); jumps when the bit is clear.
.macro i_test _bit,_jmpto
    SBRS    t, \_bit
    RJMP    \_jmpto
.endm

; shift16: shift the 16-bit accumulator a2:a1 right by one.
.macro shift16
    LSR     a2
    ROR     a1
.endm

; shift8: shift only a1. The top three bits of the result do not need to
; be correct, so cycles are saved by not carrying a bit in from a2.
.macro shift8
    LSR     a1
.endm

; shift0: emits nothing; the last shift is common to all paths and is
; done once at end_mul.
.macro shift0
.endm

; add16: add the 16-bit value i1:i0 to the accumulator a2:a1.
.macro add16
    ADD     a1, i0
    ADC     a2, i1
.endm

; add8: add only the low bytes; as with shift8, the carry into a2 is
; dropped on purpose.
.macro add8
    ADD     a1, i0
.endm
114
g: ; g(i, t) -> t
    ; Looks up a coefficient in notes and multiplies it with i1:i0.
    ;   in:  t  = note index (only bits 2..0 are used)
    ;        i0, i1 = multiplicand, i2 bits 1..0 select the notes row
    ;   out: t  = bits 15..8 of (i1:i0) * coefficient; because of the
    ;        shortened 8-bit steps only the low five bits of t are
    ;        meaningful, the top three are not
    ;   clobbers: a1, a2 (that is x and _), Xlo and the status flags
    CLR     a1

    #define row _
    #define zero a1
    ANDI    t, 0x07             ; column 0..7
    MOV     row, i2
    ANDI    row, 3
    CPSE    row, zero           ; skip the next instruction if (i2 & 3) == 0
    SUBI    t, -8               ; otherwise use the second row
    #undef zero
    #undef row

    LDI     Xlo, lo8(notes)
    ADD     Xlo, t              ; NOTE: cannot overflow, since RAMEND == 0x5F
    LD      t, X                ; t = coefficient (Xhi was set up in main)

    CLR     a2

    /* decision tree multiplication:
       there is only a limited number of coefficients, so the code is
       heavily optimized for those only, and computes only the bits that
       are actually needed. this reduces the cycle count from 38 for the
       (optimized) classic approach to 31, while the instruction count
       increases from 38 to 100. in the end it turned out that there
       would have been enough cycles to spare to just use the standard
       algorithm.

       Coefficient bits are examined from bit 0 upwards; a '?' marks the
       bit tested at that level.

                                 _xxx?
                            /            \
                       _xx?0              _xx?1
                         |                  |
                       _x?00              _x?01
                     /       \           /      \
                 _?000       _?100    _?001      _?101
                 /   \       /   \      |        /   \
             _0000 _1000 _0100 _1100  _1001  _0101 _1101
               |     |     |     |      |      |     |
              ...   ...   ...   ...    ...    ...   ...
               |     |     |     |      |      |     |
               B0    58    84    8C     69     75    9D
              27cy  28cy  26cy  28cy   26cy   31cy  30cy */
                test    0, m____1
    m____0:     shift16
                never   1
    m___00:     shift16
                test    2, m__100
    m__000:     shift16
                test    3, m_1000
    m_0000:     shift16
                always  4
                add16 $ shift16
                always  5
                add8 $ shift8
                never   6
                shift8
                always  7
                add8 $ shift0
                RJMP    end_mul         ; computed t * 0xb0

    m_1000:     add16 $ shift16
                always  4
                add16 $ shift16
                never   5
                shift8
                always  6
                add8 $ shift8
                never   7
                shift0
                RJMP    end_mul         ; computed t * 0x58

    m__100:     add16 $ shift16
                i_test  3, m_0100
    m_1100:     add16
    m_0100:     shift16
                never   4
                shift16
                never   5
                shift8
                never   6
                shift8
                always  7
                add8 $ shift0
                RJMP    end_mul         ; computed t * 0x8c / 0x84

    m____1:     add16 $ shift16
                never   1
    m___01:     shift16
                test    2, m__101
    m__001:     shift16
                always  3
    m_1001:     add16 $ shift16
                never   4
                shift16
                always  5
                add8 $ shift8
                always  6
                add8 $ shift8
                never   7
                shift0
                RJMP    end_mul         ; computed t * 0x69

    m__101:     add16 $ shift16
                test    3, m_1101
    m_0101:     shift16
                always  4
                add16 $ shift16
                always  5
                add8 $ shift8
                always  6
                add8 $ shift8
                never   7
                shift0
                RJMP    end_mul         ; computed t * 0x75

    m_1101:     add16 $ shift16
                always  4
                add16 $ shift16
                never   5
                shift8
                never   6
                shift8
                always  7
                add8 $ shift0
                                        ; computed t * 0x9d, falls through

    end_mul:
    LSR     a1                  ; final shift is a common operation for all paths

    MOV     t, a1               ;;TODO: use a1 in loop: directly
    RET
61fab018 245
main: ; setup routine
    ; Initialises registers, stack, port, clock and timer 0, then idles in
    ; loop while the TIM0_OVF handler (sample) does all the work.
    ; NOTE: clr i0..i2 moved to .org 0x0
    CLR     i3
    CLR     acc                 ; we output a dummy sample before the actual first one
    LDI     Xhi, hi8(FLASHM + notes) ; never changes
    LDI     one, 1              ; mostly for clearing TIM0_OVF bit

    #define zero i0
    ; i0 was cleared in the reset vector and is not incremented before the
    ; first interrupt, so it can serve as a zero source here.
    LDI     x, RAMEND
    OUT     SPL, x              ; init stack ptr
    OUT     SPH, zero           ; (high byte of the stack ptr)
    OUT     PUEB, zero          ; disable pullups
    LDI     x, 0x05
    OUT     DDRB, x             ; PORTB0:pwm, PORTB2:debug
    LDI     x, 0xd8
    OUT     CCP, x              ; change protected ioregs
    OUT     CLKPSR, one         ; clock prescaler 1/2 (4Mhz)
    LDI     x, 0xa7             ; determined by trial-and-error (->PORTB2)
    OUT     OSCCAL, x           ; set oscillator calibration
    OUT     WDTCSR, zero        ; turn off watchdog

    ;set timer/counter0 to 8bit fastpwm, non-inverting, no prescaler
    LDI     x, 0x81
    OUT     TCCR0A, x
    LDI     x, 0x09
    OUT     TCCR0B, x
    OUT     TIMSK0, one         ; enable tim0_ovf

    ; SLEEP is executed as a no-op unless the sleep enable bit (bit 0 of
    ; SMCR) is set, which would turn the loop below into a busy loop.
    ; Writing 1 sets that bit and leaves the mode bits at zero (idle mode),
    ; in which timer 0 keeps running and its overflow interrupt wakes the
    ; CPU. Done here, well after the CCP write, so the protected-register
    ; sequence above is not disturbed.
    OUT     SMCR, one           ; enable sleep, idle mode
    SEI
    #undef zero

loop:
    SLEEP                       ; wait for interrupt
    RJMP    loop
279
sample:
    ; TIM0_OVF handler: outputs the sample computed during the previous
    ; invocation, then computes the next one as the sum of four voices and
    ; advances the 32-bit counter i (i3:i2:i1:i0).
    ; No registers or flags are saved; the interrupted code is only the
    ; SLEEP/RJMP loop, which keeps nothing live in them.
    #define scr _
    OUT     OCR0AL, acc         ; start by outputting a sample, because routine has variable runtime
#ifdef DEBUG
    SBI     PORTB, 2            ; to measure runtime
#endif // DEBUG

    ; n = low byte of (i >> 14) = (i2 << 2) | (i1 >> 6)
    MOV     n, i2
    LSL     n
    LSL     n
    MOV     scr, i1
    SWAP    scr
    ANDI    scr, 0x0f
    LSR     scr
    LSR     scr
    OR      n, scr

    ; s = low byte of (i >> 17) = (i3 << 7) | (i2 >> 1)
    MOV     s, i3
    LSR     s
    ROR     s                   ; bit 0 of i3 ends up in bit 7
    ANDI    s, 0x80
    MOV     scr, i2
    LSR     scr
    OR      s, scr

    ; voice 1: acc = bit 4 of g(n)
    MOV     t, n
    RCALL   g
    SWAP    t
    ANDI    t, 1
    MOV     acc, t

    ; voice 2: acc += (g(n ^ (i >> 13)) >> 2) & 3 & s
    MOV     scr, i2
    LSL     scr
    LSL     scr
    LSL     scr
    MOV     t, i1
    SWAP    t
    ANDI    t, 0xf
    LSR     t
    OR      t, scr              ; t = low byte of (i >> 13)
    EOR     t, n
    RCALL   g
    LSR     t
    LSR     t
    ANDI    t, 3
    AND     t, s
    ADD     acc, t

    ; voice 3: acc += (g(n + mod3(i >> 11)) >> 2) & 3 & (about (s+1)/3)
    MOV     Ml, i2
    SWAP    Ml
    ANDI    Ml, 0xf0
    LSL     Ml
    MOV     scr, i1
    LSR     scr
    LSR     scr
    LSR     scr
    OR      Ml, scr             ; Ml = (i2 << 5) | (i1 >> 3)
    MOV     Mh, i3
    SWAP    Mh
    ANDI    Mh, 0xf0
    LSL     Mh
    MOV     scr, i2
    LSR     scr
    LSR     scr
    LSR     scr
    OR      Mh, scr             ; Mh = (i3 << 5) | (i2 >> 3)
    RCALL   mod3
    ADD     t, n
    RCALL   g
    LSR     t
    LSR     t
    ANDI    t, 3
    MOV     x, s
    INC     x
    ; scr = x * 85 / 256 (roughly x / 3), built from shifts and adds;
    ; each ROR pulls the carry of the preceding ADD back in as bit 7
    MOV     scr, x
    LSR     scr
    LSR     scr
    ADD     scr, x
    ROR     scr
    LSR     scr
    ADD     scr, x
    ROR     scr
    LSR     scr
    ADD     scr, x
    ROR     scr
    LSR     scr
    AND     t, scr
    ADD     acc, t

    ; voice 4: acc += (g(8 + n - mod3(i >> 10)) >> 1) & 3 & (about (s+1)/5)
    MOV     Ml, i2
    SWAP    Ml
    ANDI    Ml, 0xf0
    LSL     Ml
    LSL     Ml
    MOV     scr, i1
    LSR     scr
    LSR     scr
    OR      Ml, scr             ; Ml = (i2 << 6) | (i1 >> 2)
    MOV     Mh, i3
    SWAP    Mh
    ANDI    Mh, 0xf0
    LSL     Mh
    LSL     Mh
    MOV     scr, i2
    LSR     scr
    LSR     scr
    OR      Mh, scr             ; Mh = (i3 << 6) | (i2 >> 2)
    RCALL   mod3
    SUB     t, n
    NEG     t                   ; t = n - remainder
    SUBI    t, -8               ; t += 8
    RCALL   g
    LSR     t
    ANDI    t, 3
    INC     s                   ; s is rebuilt from i on the next interrupt
    ; scr = s * 51 / 256 (roughly s / 5), same shift-and-add scheme
    MOV     scr, s
    LSR     scr
    ADD     scr, s
    ROR     scr
    LSR     scr
    LSR     scr
    ADD     scr, s
    ROR     scr
    ADD     scr, s
    ROR     scr
    LSR     scr
    LSR     scr
    AND     t, scr
    ADD     acc, t

    SWAP    acc                 ; acc<<4, to be passed to OCR0AL

    ; i += 1, written as a 32-bit subtraction of -1
    SUBI    i0, -1
    SBCI    i1, -1
    SBCI    i2, -1
    SBCI    i3, -1

#ifdef DEBUG
    CBI     PORTB, 2            ; end runtime measurement
#endif // DEBUG
    OUT     TIFR0, one          ; clear pending interrupt (routine takes two intr.cycles)
    #undef scr
    RETI                        ; reenables interrupts
Imprint / Impressum