From: Tobias Girstmair
Date: Mon, 2 Mar 2020 08:26:03 +0000 (+0100)
Subject: optimize multiplication for space
X-Git-Tag: attiny4
X-Git-Url: https://git.gir.st/Chiptunes.git/commitdiff_plain/dd3193aba3b9fc9d5805c257722850dfeeb62d9f

optimize multiplication for space

AVR Memory Usage
----------------
Device: attiny4

Program:     474 bytes (92.6% Full)
(.text + .data + .bootloader)

Data:          0 bytes (0.0% Full)
(.data + .bss + .noinit)
---

diff --git a/Makefile b/Makefile
index 2d1a3d9..9d65907 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 .PHONY: all clean flash
-CHIP ?= 9
+CHIP ?= 4
 
 all: foo.elf
 
diff --git a/foo.S b/foo.S
index d89dec5..8fbb0c5 100644
--- a/foo.S
+++ b/foo.S
@@ -82,36 +82,6 @@ mod3: ; mod3(Mh.Ml) -> t
 	RET
 #undef tmp
 
-; definitions to mul-tree readable:
-.macro always _bit ; nop; for when a test() is not necessary (see tree)
-.endm
-.macro never _bit ; nop; for when a test() is not necessary (see tree)
-.endm
-.macro test _bit,_jmpto
-	SBRC t, \_bit
-	RJMP \_jmpto
-.endm
-.macro i_test _bit,_jmpto ; inverted test (for reordered 0x8_)
-	SBRS t, \_bit
-	RJMP \_jmpto
-.endm
-.macro shift16
-	LSR a2
-	ROR a1
-.endm
-.macro shift8 ; top three bits don't need to be corrrect, so save cycles by not carrying
-	LSR a1
-.endm
-.macro shift0 ; nop; last shift is common
-.endm
-.macro add16
-	ADD a1, i0
-	ADC a2, i1
-.endm
-.macro add8 ; ditto with carrying
-	ADD a1, i0
-.endm
-
 g: ; g(i, t) -> t
 	CLR a1
 
@@ -131,114 +101,57 @@ g: ; g(i, t) -> t
 
 	CLR a2
 
-	/* decision tree multiplication:
-	there is only a limited number of coefficients, so we heavily
-	optimize for those only, and only compute the bits we
-	actually need. this reduces cycle count from 38 for the
-	(optimized) classic approach to 31. instruction count
-	increases from 38 to 100. in the end it turned out that we
-	would've had enough cycles to spare to just use the standard
-	algorithm.
-	                  _xxx?
-	               /          \
-	         _xx?0              _xx1?
-	           |                  |
-	         _x?00              _x?01
-	       /       \           /     \
-	   _?000       _?100    _?001    _?101
-	   /   \       /   \      |      /   \
-	_0000 _1000 _0100 _1100 _1001 _0101 _1101
-	  |     |     |     |     |     |     |
-	 ...   ...   ...   ...   ...   ...   ...
-	  |     |     |     |     |     |     |
-	 B0    58    84    8C    69    75    9D
-	27cy  28cy  26cy  28cy  26cy  31cy  30cy */
-	test 0, m____1
-	m____0: shift16
-	never 1
-	m___00: shift16
-	test 2, m__100
-	m__000: shift16
-	test 3, m_1000
-	m_0000: shift16
-	always 4
-	add16 $ shift16
-	always 5
-	add8 $ shift8
-	never 6
-	shift8
-	always 7
-	add8 $ shift0
-	RJMP end_mul ; calc'd 0xb0
-
-	m_1000: add16 $ shift16
-	always 4
-	add16 $ shift16
-	never 5
-	shift8
-	always 6
-	add8 $ shift8
-	never 7
-	shift0
-	RJMP end_mul ; calc'd 0x58
-
-	m__100: add16 $ shift16
-	i_test 3, m_0100
-	m_1100: add16
-	m_0100: shift16
-	never 4
-	shift16
-	never 5
-	shift8
-	never 6
-	shift8
-	always 7
-	add8 $ shift0
-	RJMP end_mul ; calc'd 0x8c / 0x84
-
-	m____1: add16 $ shift16
-	never 1
-	m___01: shift16
-	test 2, m__101
-	m__001: shift16
-	always 3
-	m_1001: add16 $ shift16
-	never 4
-	shift16
-	always 5
-	add8 $ shift8
-	always 6
-	add8 $ shift8
-	never 7
-	shift0
-	RJMP end_mul ; calc'd 0x69
-
-	m__101: add16 $ shift16
-	test 3, m_1101
-	m_0101: shift16
-	always 4
-	add16 $ shift16
-	always 5
-	add8 $ shift8
-	always 6
-	add8 $ shift8
-	never 7
-	shift0
-	RJMP end_mul ; calc'd 0x75
-
-	m_1101: add16 $ shift16
-	always 4
-	add16 $ shift16
-	never 5
-	shift8
-	never 6
-	shift8
-	always 7
-	add8 $ shift0
-	; calc'd 0x9d
-
-	end_mul:
-	LSR a1 ;final shift is a common operation for all
+	; beginning of multiplication:
+	LSR t
+	BRCC skip1
+	ADD a1, i0
+	ADC a2, i1
+	skip1:
+	LSR a2
+	ROR a1
+	LSR t
+	; BRCC skip2 -- this bit is always zero
+	; ADD a1, i0
+	; ADC a2, i1
+	;skip2:
+	LSR a2
+	ROR a1
+	LSR t
+	BRCC skip3
+	ADD a1, i0
+	ADC a2, i1
+	skip3:
+	LSR a2
+	ROR a1
+	LSR t
+	BRCC skip4
+	ADD a1, i0
+	ADC a2, i1
+	skip4:
+	LSR a2
+	ROR a1
+	LSR t
+	BRCC skip5
+	ADD a1, i0
+	ADC a2, i1
+	skip5:
+	LSR a2
+	ROR a1
+	LSR t
+	BRCC skip6 ;sbrc t, NNN
+	ADD a1, i0
+	skip6:
+	LSR a1
+	LSR t
+	BRCC skip7
+	ADD a1, i0
+	skip7:
+	LSR a1
+	LSR t
+	BRCC skip8
+	ADD a1, i0
+	skip8:
+	LSR a1
 
 	MOV t, a1 ;;TODO: use a1 in loop: directly
 	RET
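
For reference, a minimal C model of the shift-and-add loop added above (a sketch, not code from the repository): it assumes, as the register names suggest, that i1:i0 holds a 16-bit operand, t the 8-bit multiplier, and a2:a1 the running accumulator; the names g_model, acc, lo and the test values in main() are illustrative only. Each loop round mirrors one LSR t / BRCC / ADD,ADC / LSR a2,ROR a1 group of the unrolled assembly; the last three rounds switch to 8-bit adds and shifts, so, as the removed shift8 comment notes, only the low five bits of the result are guaranteed (and only as long as the 16-bit adds do not overflow).

#include <stdint.h>
#include <stdio.h>

/* Model of the unrolled multiply in g(): per bit of t, conditionally add i,
 * then shift the accumulator right.  Rounds 0-4 keep the full 16-bit
 * accumulator (ADD/ADC, LSR a2/ROR a1); rounds 5-7 touch only the low byte
 * (ADD a1,i0 / LSR a1), so bits 5-7 of the result may be wrong. */
static uint8_t g_model(uint16_t i, uint8_t t)
{
    uint16_t acc = 0;                   /* a2:a1 */

    for (int bit = 0; bit < 8; bit++) {
        int set = t & 1;                /* carry out of LSR t */
        t >>= 1;
        if (bit < 5) {
            if (set)
                acc += i;               /* ADD a1, i0 / ADC a2, i1 */
            acc >>= 1;                  /* LSR a2 / ROR a1 */
        } else {
            uint8_t lo = (uint8_t)acc;  /* only a1 is kept up to date */
            if (set)
                lo += (uint8_t)i;       /* ADD a1, i0 (carry discarded) */
            lo >>= 1;                   /* LSR a1 */
            acc = (uint16_t)((acc & 0xff00u) | lo);
        }
    }
    return (uint8_t)acc;                /* MOV t, a1 */
}

int main(void)
{
    /* arbitrary test values, not taken from the chiptune data */
    uint16_t i = 0x0123;
    uint8_t  t = 0x75;
    printf("model: %02x  exact: %02x\n",
           (unsigned)g_model(i, t), (unsigned)(((unsigned)i * t) >> 8));
    return 0;
}

As the removed comment explains, the generic loop costs a few more cycles than the decision tree (38 vs. 31) but far fewer instructions (38 vs. 100), which is the trade the subject line "optimize multiplication for space" is after.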