summaryrefslogtreecommitdiff
path: root/thesis/reg-alloc.tex
blob: 0fc764ede61342f88128d792acd8d26f4558c59b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
\section{Optimising register allocation}
\label{sec:reg-alloc}

\begin{figure}[b!]
	\small
	\begin{subfigure}{.5\linewidth}
		\begin{tikzpicture}
			\begin{axis}
				[ xbar
				, xmin=0
				, width=.9\linewidth
				, height=1.1\linewidth
				, symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3}
				, ytick=data
				, scaled x ticks=real:1
				, xtick scale label code/.code={}
				, ytick pos=left
				]
				\addplot coordinates {
					(378618,S0)
					(270274,A ptr.)
					(218018,A0)
					(166284,Heap ptr.)
					(152821,A1)
					(110390,B0)
					(107481,A2)
					(102640,B ptr.)
					( 64496,B1)
					( 55526,S1)
					( 41092,B2)
					( 25924,B3)
					( 15699,B4)
					( 14930,Heap ctr.)
					(  9413,A3)
				};
				\draw ($({axis cs:85000,S0})-(0,2em)$) -- ($({axis cs:85000,A3})+(0,2em)$);
			\end{axis}
		\end{tikzpicture}
		\subcaption{In the Clean compiler}
	\end{subfigure}%
	\begin{subfigure}{.5\linewidth}
		\begin{tikzpicture}
			\begin{axis}
				[ xbar
				, xmin=0
				, width=.9\linewidth
				, height=1.1\linewidth
				, symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3}
				, ytick=data
				, ytick pos=left
				]
				\addplot coordinates {
					(387,S0)
					(372,A ptr.)
					(283,B1)
					(257,A3)
					(217,B0)
					(207,A0)
					(185,A2)
					(185,A1)
					(137,S1)
					(115,B ptr.)
					(113,Heap ptr.)
					( 24,Heap ctr.)
					( 13,B2)
					(  2,B3)
					(  0,B4)
				};
				\draw ($({axis cs:155,S0})-(0,2em)$) -- ($({axis cs:155,A3})+(0,2em)$);
			\end{axis}
		\end{tikzpicture}
		\subcaption{In the RTS\label{fig:reg-usage:rts}}
	\end{subfigure}%
	\caption{Register usage with the Thumb-2 backend\label{fig:reg-usage}}
\end{figure}

% pi@rasppi:~/clean/src/compiler$ ~/count-regs.sh 
% r0:     15699
% r1:     25924
% r2:     41092
% r3:     64496
% r4:     110390
% r5:     14930
% r6:     218018
% r7:     152821
% r8:     107481
% r9:     270274
% sb:     0
% r10:    166284
% sl:     0
% r11:    9413
% fp:     0
% r12:    378618
% ip:     0
% r13:    0
% sp:     102640
% r14:    55526
% lr:     0
% r15:    0
% pc:     61145

% pi@rasppi:~/rts$ ~/count-regs.sh thumb2*.s
% BSTACK_0:       217
% BSTACK_1:       283
% BSTACK_2:       13
% BSTACK_3:       2
% BSTACK_4:       0
% BSTACK_PTR:     0
% ASTACK_0:       207
% ASTACK_1:       185
% ASTACK_2:       350
% ASTACK_3:       257
% ASTACK_PTR:     372
% HEAP_FREE:      24
% HEAP_PTR:       113
% SCRATCH_REG:    387
% LINK_REG:       0
% sp:     115
% lr:     137
% pc:     72

\begin{multicols}{2}

\subsection{Introduction}
\label{sec:reg-alloc:intro}
In Thumb-1, i.e., the 16-bit subset of Thumb-2, register fields are 3 bits wide,
	allowing the programmer to access \ual{r0} through \ual{r7}.
Special encoding variants are defined to access \ual{sp} (\ual{r13}) and \ual{pc} (\ual{r15}),
	though these instructions have limited functionality compared to those working on the eight low registers.
The 32-bit instructions defined by Thumb-2 have access to all sixteen registers.

This introduces an interesting code size optimisation vector.
If we put the most-used registers in the eight low registers, we may save code size.

\subsection{Usage in Clean}
\label{sec:reg-alloc:clean}
In Clean programs, the ARM registers are used as follows~\parencite[\texttt{armstartup.s}]{armcg}:

\begin{itemize}[itemsep=0pt]
	\item Five registers for the top of the B-stack,
		which we call B0 through B4.
	\item Four registers for the top of the A-stack,
		A0 through A3.
	\item Three registers for the A-stack, B-stack and heap pointers:
		A ptr., B ptr., Heap ptr., respectively.
		The B-stack is interleaved with the C-stack and uses SP as its pointer.
	\item One register for the number of free words on the heap, Heap ctr.
	\item Two scratch registers S0 and S1.
	\item The program counter PC.
\end{itemize}

In \cref{fig:reg-usage}, we count for each register the number of instructions it occurs in, in the code generated for the Clean compiler~\parencite{cocl}.
When a register is used multiple times in the same instruction, this counts as one occurrence.
The vertical line at indicates the number of occurrences that a variable should have to justify putting it in the lower registers,
	if we were to put the most-used variables in the lower registers
	(e.g., there are eight registers with over 85,000 occurrences in the Clean compiler).

The RTS shows an entirely different pattern (\cref{fig:reg-usage:rts}),
	because many registers have another meaning in the RTS than in generated code.
For large programs, register usage in the RTS is negligible.
For small programs it may not be negligible,
	but these programs are not likely to need rigorous optimisation for code size,
	considering that they will be small anyway.

\subsection{Optimisation}
\label{sec:reg-alloc:optimisation}
\Cref{fig:reg-usage} suggests that we put all variables that occur more than 85,000 times in the low registers.
However, the B-stack pointer has to be SP:
	the B- and C-stacks are combined into one
	(an optimisation suggested early on by \cite{execspecs} \todo{exact ref.})
	and the C-stack pointer should be the system pointer to allow for an efficient foreign function interface.

% Before optimisation:
% pi@rasppi:~/clean/src/compiler$ ls -l a.out 
% -rwxr-xr-x 1 pi pi 9987016 Dec  7 21:46 a.out
% pi@rasppi:~/clean/src/compiler$ strip --strip-all a.out 
% pi@rasppi:~/clean/src/compiler$ ls -l a.out 
% -rwxr-xr-x 1 pi pi 5034316 Dec  7 21:46 a.out

% After optimisation as suggested by John:
%  1 r9 <-> r5
%  2 r0 -> r10
%  3 r1 -> r11
%  4 r10 -> r1
%  5 r11 -> r12
%  6 r12 -> r0
% static char *register_name[15]={"sp","r1","r5","r12","r8","r7","r6","r4","r3","r2","r11","r10","r9","r0","r14"};
% static int register_order[15]={13,1,5,12,8,7,6,4,3,2,11,10,9,0,14};
% pi@rasppi:~/clean/src/compiler$ ls -l a.out 
% -rwxr-xr-x 1 pi pi 7591584 Dec  7 22:16 a.out

\end{multicols}