diff options
author | Camil Staps | 2016-12-07 23:27:05 +0100 |
---|---|---|
committer | Camil Staps | 2016-12-07 23:27:05 +0100 |
commit | 8defd59c2fae56045085218aed98761ecab0587d (patch) | |
tree | 75ecfab1887b9bf099d8f8453061020a61d4e2c6 | |
parent | Finish load offsets (diff) |
Text about register allocation optimisation
-rw-r--r-- | thesis/preamble.tex | 3 | ||||
-rw-r--r-- | thesis/reg-alloc.tex | 192 | ||||
-rw-r--r-- | thesis/thesis.bib | 11 | ||||
-rw-r--r-- | thesis/thesis.tex | 1 |
4 files changed, 206 insertions, 1 deletions
diff --git a/thesis/preamble.tex b/thesis/preamble.tex index 7650d20..9738740 100644 --- a/thesis/preamble.tex +++ b/thesis/preamble.tex @@ -55,8 +55,9 @@ \newmintinline[bash]{bash}{style=bw} \usepackage{subcaption} +\usepackage{pgfplots} \usepackage{tikz} -\usetikzlibrary{positioning} +\usetikzlibrary{positioning,calc} \let\oldtexttt\texttt \def\texttt#1{\oldtexttt{\footnotesize #1}} diff --git a/thesis/reg-alloc.tex b/thesis/reg-alloc.tex new file mode 100644 index 0000000..711e94f --- /dev/null +++ b/thesis/reg-alloc.tex @@ -0,0 +1,192 @@ +\section{Optimising register allocation} +\label{sec:reg-alloc} + +\begin{figure}[b!] + \small + \begin{subfigure}{.5\linewidth} + \begin{tikzpicture} + \begin{axis} + [ xbar + , xmin=0 + , width=.9\linewidth + , height=1.1\linewidth + , symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3} + , ytick=data + , scaled x ticks=real:1 + , xtick scale label code/.code={} + , ytick pos=left + ] + \addplot coordinates { + (378618,S0) + (270274,A ptr.) + (218018,A0) + (166284,Heap ptr.) + (152821,A1) + (110390,B0) + (107481,A2) + (102640,B ptr.) + ( 64496,B1) + ( 55526,S1) + ( 41092,B2) + ( 25924,B3) + ( 15699,B4) + ( 14930,Heap ctr.) + ( 9413,A3) + }; + \draw ($({axis cs:100000,S0})-(0,1.6em)$) -- ($({axis cs:100000,A3})+(0,1.6em)$); + \end{axis} + \end{tikzpicture} + \subcaption{In the Clean compiler} + \end{subfigure}% + \begin{subfigure}{.5\linewidth} + \begin{tikzpicture} + \begin{axis} + [ xbar + , xmin=0 + , width=.9\linewidth + , height=1.1\linewidth + , symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3} + , ytick=data + , ytick pos=left + ] + \addplot coordinates { + (387,S0) + (372,A ptr.) + (283,B1) + (257,A3) + (217,B0) + (207,A0) + (185,A2) + (185,A1) + (137,S1) + (115,B ptr.) + (113,Heap ptr.) + ( 24,Heap ctr.) + ( 13,B2) + ( 2,B3) + ( 0,B4) + }; + \draw ($({axis cs:150,S0})-(0,1.6em)$) -- ($({axis cs:150,A3})+(0,1.6em)$); + \end{axis} + \end{tikzpicture} + \subcaption{In the RTS\label{fig:reg-usage:rts}} + \end{subfigure}% + \caption{Register usage with the Thumb-2 backend\label{fig:reg-usage}} +\end{figure} + +% pi@rasppi:~/clean/src/compiler$ ~/count-regs.sh +% r0: 15699 +% r1: 25924 +% r2: 41092 +% r3: 64496 +% r4: 110390 +% r5: 14930 +% r6: 218018 +% r7: 152821 +% r8: 107481 +% r9: 270274 +% sb: 0 +% r10: 166284 +% sl: 0 +% r11: 9413 +% fp: 0 +% r12: 378618 +% ip: 0 +% r13: 0 +% sp: 102640 +% r14: 55526 +% lr: 0 +% r15: 0 +% pc: 61145 + +% pi@rasppi:~/rts$ ~/count-regs.sh thumb2*.s +% BSTACK_0: 217 +% BSTACK_1: 283 +% BSTACK_2: 13 +% BSTACK_3: 2 +% BSTACK_4: 0 +% BSTACK_PTR: 0 +% ASTACK_0: 207 +% ASTACK_1: 185 +% ASTACK_2: 350 +% ASTACK_3: 257 +% ASTACK_PTR: 372 +% HEAP_FREE: 24 +% HEAP_PTR: 113 +% SCRATCH_REG: 387 +% LINK_REG: 0 +% sp: 115 +% lr: 137 +% pc: 72 + +\begin{multicols}{2} + +\subsection{Introduction} +\label{sec:reg-alloc:intro} +In Thumb-1, i.e., the 16-bit subset of Thumb-2, register fields are 3 bits wide, + allowing the programmer to access \ual{r0} through \ual{r7}. +Special encoding variants are defined to access \ual{sp} (\ual{r13}) and \ual{pc} (\ual{r15}), + though these instructions have limited functionality compared to those working on the eight low registers. +The 32-bit instructions defined by Thumb-2 have access to all sixteen registers. + +This introduces an interesting code size optimisation vector. +If we put the most-used registers in the eight low registers, we may save code size. + +\subsection{Usage in Clean} +\label{sec:reg-alloc:clean} +In Clean programs, the ARM registers are used as follows~\parencite[\texttt{armstartup.s}]{armcg}: + +\begin{itemize}[itemsep=0pt] + \item Five registers for the top of the B-stack, + which we call B0 through B4. + \item Four registers for the top of the A-stack, + A0 through A3. + \item Three registers for the A-stack, B-stack and heap pointers: + A ptr., B ptr., Heap ptr., respectively. + The B-stack is interleaved with the C-stack and uses SP as its pointer. + \item One register for the number of free words on the heap, Heap ctr. + \item Two scratch registers S0 and S1. + \item The program counter PC. +\end{itemize} + +In \cref{fig:reg-usage}, we count for each register the number of instructions it occurs in, in the code generated for the Clean compiler~\parencite{cocl}. +When a register is used multiple times in the same instruction, this counts as one occurrence. +The vertical line at indicates the number of occurrences that a variable should have to justify putting it in the lower registers, + if we were to put the most-used variables in the lower registers + (e.g., there are eight registers with over 100,000 occurrences in the Clean compiler). + +The RTS shows an entirely different pattern (\cref{fig:reg-usage:rts}), + because many registers have another meaning in the RTS than in generated code. +For large programs, register usage in the RTS is negligible. +For small programs it may not be negligible, + but these programs are not likely to need rigorous optimisation for code size, + considering that they will be small anyway. + +\subsection{Optimisation} +\label{sec:reg-alloc:optimisation} +\Cref{fig:reg-usage} suggests that we put all variables that occur more than 100,000 times in the low registers. +However, the B-stack pointer has to be SP: + the B- and C-stacks are combined into one + (an optimisation suggested early on by \cite{execspecs} \todo{exact ref.}) + and the C-stack pointer should be the system pointer to allow for an efficient foreign function interface. + +% Before optimisation: +% pi@rasppi:~/clean/src/compiler$ ls -l a.out +% -rwxr-xr-x 1 pi pi 9987016 Dec 7 21:46 a.out +% pi@rasppi:~/clean/src/compiler$ strip --strip-all a.out +% pi@rasppi:~/clean/src/compiler$ ls -l a.out +% -rwxr-xr-x 1 pi pi 5034316 Dec 7 21:46 a.out + +% After optimisation as suggested by John: +% 1 r9 <-> r5 +% 2 r0 -> r10 +% 3 r1 -> r11 +% 4 r10 -> r1 +% 5 r11 -> r12 +% 6 r12 -> r0 +% static char *register_name[15]={"sp","r1","r5","r12","r8","r7","r6","r4","r3","r2","r11","r10","r9","r0","r14"}; +% static int register_order[15]={13,1,5,12,8,7,6,4,3,2,11,10,9,0,14}; +% pi@rasppi:~/clean/src/compiler$ ls -l a.out +% -rwxr-xr-x 1 pi pi 7591584 Dec 7 22:16 a.out + +\end{multicols} diff --git a/thesis/thesis.bib b/thesis/thesis.bib index 30f945c..279fbd3 100644 --- a/thesis/thesis.bib +++ b/thesis/thesis.bib @@ -71,3 +71,14 @@ year=2016, urldate="2016-11-15" } + +@Online{cocl, + key="Clean compiler", + label="Clean compiler", + title="The Clean compiler", + subtitle="Bootstrap from intermediate ABC files, 32-bit.", + organization="Radboud University Nijmegen", + url="http://clean.cs.ru.nl/Download_Clean", + year=2016, + urldate="2016-12-07" +} diff --git a/thesis/thesis.tex b/thesis/thesis.tex index 9cc6482..2a4977d 100644 --- a/thesis/thesis.tex +++ b/thesis/thesis.tex @@ -52,6 +52,7 @@ The code generator proposed here can be used in situations where small code size \input{storing-pc} \input{code-addresses} \input{load-offsets} +\input{reg-alloc} \input{results} \ifdraft\else |