summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCamil Staps2016-12-07 23:27:05 +0100
committerCamil Staps2016-12-07 23:27:05 +0100
commit8defd59c2fae56045085218aed98761ecab0587d (patch)
tree75ecfab1887b9bf099d8f8453061020a61d4e2c6
parentFinish load offsets (diff)
Text about register allocation optimisation
-rw-r--r--thesis/preamble.tex3
-rw-r--r--thesis/reg-alloc.tex192
-rw-r--r--thesis/thesis.bib11
-rw-r--r--thesis/thesis.tex1
4 files changed, 206 insertions, 1 deletions
diff --git a/thesis/preamble.tex b/thesis/preamble.tex
index 7650d20..9738740 100644
--- a/thesis/preamble.tex
+++ b/thesis/preamble.tex
@@ -55,8 +55,9 @@
\newmintinline[bash]{bash}{style=bw}
\usepackage{subcaption}
+\usepackage{pgfplots}
\usepackage{tikz}
-\usetikzlibrary{positioning}
+\usetikzlibrary{positioning,calc}
\let\oldtexttt\texttt
\def\texttt#1{\oldtexttt{\footnotesize #1}}
diff --git a/thesis/reg-alloc.tex b/thesis/reg-alloc.tex
new file mode 100644
index 0000000..711e94f
--- /dev/null
+++ b/thesis/reg-alloc.tex
@@ -0,0 +1,192 @@
+\section{Optimising register allocation}
+\label{sec:reg-alloc}
+
+\begin{figure}[b!]
+ \small
+ \begin{subfigure}{.5\linewidth}
+ \begin{tikzpicture}
+ \begin{axis}
+ [ xbar
+ , xmin=0
+ , width=.9\linewidth
+ , height=1.1\linewidth
+ , symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3}
+ , ytick=data
+ , scaled x ticks=real:1
+ , xtick scale label code/.code={}
+ , ytick pos=left
+ ]
+ \addplot coordinates {
+ (378618,S0)
+ (270274,A ptr.)
+ (218018,A0)
+ (166284,Heap ptr.)
+ (152821,A1)
+ (110390,B0)
+ (107481,A2)
+ (102640,B ptr.)
+ ( 64496,B1)
+ ( 55526,S1)
+ ( 41092,B2)
+ ( 25924,B3)
+ ( 15699,B4)
+ ( 14930,Heap ctr.)
+ ( 9413,A3)
+ };
+ \draw ($({axis cs:100000,S0})-(0,1.6em)$) -- ($({axis cs:100000,A3})+(0,1.6em)$);
+ \end{axis}
+ \end{tikzpicture}
+ \subcaption{In the Clean compiler}
+ \end{subfigure}%
+ \begin{subfigure}{.5\linewidth}
+ \begin{tikzpicture}
+ \begin{axis}
+ [ xbar
+ , xmin=0
+ , width=.9\linewidth
+ , height=1.1\linewidth
+ , symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3}
+ , ytick=data
+ , ytick pos=left
+ ]
+ \addplot coordinates {
+ (387,S0)
+ (372,A ptr.)
+ (283,B1)
+ (257,A3)
+ (217,B0)
+ (207,A0)
+ (185,A2)
+ (185,A1)
+ (137,S1)
+ (115,B ptr.)
+ (113,Heap ptr.)
+ ( 24,Heap ctr.)
+ ( 13,B2)
+ ( 2,B3)
+ ( 0,B4)
+ };
+ \draw ($({axis cs:150,S0})-(0,1.6em)$) -- ($({axis cs:150,A3})+(0,1.6em)$);
+ \end{axis}
+ \end{tikzpicture}
+ \subcaption{In the RTS\label{fig:reg-usage:rts}}
+ \end{subfigure}%
+ \caption{Register usage with the Thumb-2 backend\label{fig:reg-usage}}
+\end{figure}
+
+% pi@rasppi:~/clean/src/compiler$ ~/count-regs.sh
+% r0: 15699
+% r1: 25924
+% r2: 41092
+% r3: 64496
+% r4: 110390
+% r5: 14930
+% r6: 218018
+% r7: 152821
+% r8: 107481
+% r9: 270274
+% sb: 0
+% r10: 166284
+% sl: 0
+% r11: 9413
+% fp: 0
+% r12: 378618
+% ip: 0
+% r13: 0
+% sp: 102640
+% r14: 55526
+% lr: 0
+% r15: 0
+% pc: 61145
+
+% pi@rasppi:~/rts$ ~/count-regs.sh thumb2*.s
+% BSTACK_0: 217
+% BSTACK_1: 283
+% BSTACK_2: 13
+% BSTACK_3: 2
+% BSTACK_4: 0
+% BSTACK_PTR: 0
+% ASTACK_0: 207
+% ASTACK_1: 185
+% ASTACK_2: 350
+% ASTACK_3: 257
+% ASTACK_PTR: 372
+% HEAP_FREE: 24
+% HEAP_PTR: 113
+% SCRATCH_REG: 387
+% LINK_REG: 0
+% sp: 115
+% lr: 137
+% pc: 72
+
+\begin{multicols}{2}
+
+\subsection{Introduction}
+\label{sec:reg-alloc:intro}
+In Thumb-1, i.e., the 16-bit subset of Thumb-2, register fields are 3 bits wide,
+ allowing the programmer to access \ual{r0} through \ual{r7}.
+Special encoding variants are defined to access \ual{sp} (\ual{r13}) and \ual{pc} (\ual{r15}),
+ though these instructions have limited functionality compared to those working on the eight low registers.
+The 32-bit instructions defined by Thumb-2 have access to all sixteen registers.
+
+This introduces an interesting code size optimisation vector.
+If we put the most-used registers in the eight low registers, we may save code size.
+
+\subsection{Usage in Clean}
+\label{sec:reg-alloc:clean}
+In Clean programs, the ARM registers are used as follows~\parencite[\texttt{armstartup.s}]{armcg}:
+
+\begin{itemize}[itemsep=0pt]
+ \item Five registers for the top of the B-stack,
+ which we call B0 through B4.
+ \item Four registers for the top of the A-stack,
+ A0 through A3.
+ \item Three registers for the A-stack, B-stack and heap pointers:
+ A ptr., B ptr., Heap ptr., respectively.
+ The B-stack is interleaved with the C-stack and uses SP as its pointer.
+ \item One register for the number of free words on the heap, Heap ctr.
+ \item Two scratch registers S0 and S1.
+ \item The program counter PC.
+\end{itemize}
+
+In \cref{fig:reg-usage}, we count for each register the number of instructions it occurs in, in the code generated for the Clean compiler~\parencite{cocl}.
+When a register is used multiple times in the same instruction, this counts as one occurrence.
+The vertical line at indicates the number of occurrences that a variable should have to justify putting it in the lower registers,
+ if we were to put the most-used variables in the lower registers
+ (e.g., there are eight registers with over 100,000 occurrences in the Clean compiler).
+
+The RTS shows an entirely different pattern (\cref{fig:reg-usage:rts}),
+ because many registers have another meaning in the RTS than in generated code.
+For large programs, register usage in the RTS is negligible.
+For small programs it may not be negligible,
+ but these programs are not likely to need rigorous optimisation for code size,
+ considering that they will be small anyway.
+
+\subsection{Optimisation}
+\label{sec:reg-alloc:optimisation}
+\Cref{fig:reg-usage} suggests that we put all variables that occur more than 100,000 times in the low registers.
+However, the B-stack pointer has to be SP:
+ the B- and C-stacks are combined into one
+ (an optimisation suggested early on by \cite{execspecs} \todo{exact ref.})
+ and the C-stack pointer should be the system pointer to allow for an efficient foreign function interface.
+
+% Before optimisation:
+% pi@rasppi:~/clean/src/compiler$ ls -l a.out
+% -rwxr-xr-x 1 pi pi 9987016 Dec 7 21:46 a.out
+% pi@rasppi:~/clean/src/compiler$ strip --strip-all a.out
+% pi@rasppi:~/clean/src/compiler$ ls -l a.out
+% -rwxr-xr-x 1 pi pi 5034316 Dec 7 21:46 a.out
+
+% After optimisation as suggested by John:
+% 1 r9 <-> r5
+% 2 r0 -> r10
+% 3 r1 -> r11
+% 4 r10 -> r1
+% 5 r11 -> r12
+% 6 r12 -> r0
+% static char *register_name[15]={"sp","r1","r5","r12","r8","r7","r6","r4","r3","r2","r11","r10","r9","r0","r14"};
+% static int register_order[15]={13,1,5,12,8,7,6,4,3,2,11,10,9,0,14};
+% pi@rasppi:~/clean/src/compiler$ ls -l a.out
+% -rwxr-xr-x 1 pi pi 7591584 Dec 7 22:16 a.out
+
+\end{multicols}
diff --git a/thesis/thesis.bib b/thesis/thesis.bib
index 30f945c..279fbd3 100644
--- a/thesis/thesis.bib
+++ b/thesis/thesis.bib
@@ -71,3 +71,14 @@
year=2016,
urldate="2016-11-15"
}
+
+@Online{cocl,
+ key="Clean compiler",
+ label="Clean compiler",
+ title="The Clean compiler",
+ subtitle="Bootstrap from intermediate ABC files, 32-bit.",
+ organization="Radboud University Nijmegen",
+ url="http://clean.cs.ru.nl/Download_Clean",
+ year=2016,
+ urldate="2016-12-07"
+}
diff --git a/thesis/thesis.tex b/thesis/thesis.tex
index 9cc6482..2a4977d 100644
--- a/thesis/thesis.tex
+++ b/thesis/thesis.tex
@@ -52,6 +52,7 @@ The code generator proposed here can be used in situations where small code size
\input{storing-pc}
\input{code-addresses}
\input{load-offsets}
+\input{reg-alloc}
\input{results}
\ifdraft\else