diff options
author | Camil Staps | 2017-01-09 09:38:23 +0100 |
---|---|---|
committer | Camil Staps | 2017-01-09 09:38:23 +0100 |
commit | 6c60b567ea734a077a4b416e667d598b63add7d3 (patch) | |
tree | a6c41b296e4c21100d3909bd8ea575c8dae488d8 /presentation | |
parent | Change order appendices, fix organisation section (diff) |
Presentation
Diffstat (limited to 'presentation')
-rw-r--r-- | presentation/pres.tex | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/presentation/pres.tex b/presentation/pres.tex
new file mode 100644
index 0000000..70aa594
--- /dev/null
+++ b/presentation/pres.tex
@@ -0,0 +1,222 @@
+\documentclass[british]{beamer} % Beamer slides: "Code generation for the Thumb-2 instruction set"
+
+\usepackage[british]{babel}
+\usepackage[babel=true]{csquotes} % provides \enquote, following the babel language
+
+\usepackage{tikz}
+\usetikzlibrary{calc} % needed for the $(...)-(...)$ coordinate arithmetic in the register-usage plot
+\usepackage{pgfplots}
+\pgfplotsset{compat=1.3} % set once in the preamble so that all three plots use the same compat level
+
+\title{Code generation for the Thumb-2 instruction set}
+\author[Camil Staps]{
+  Camil Staps\\[1em]{\small
+  \emph{Supervisors:}\\
+  prof.\ dr.\ dr.h.c.\ ir.\ M.J.~Plasmeijer\\
+  drs.\ J.H.G.~van Groningen}}
+\date{Monday 9\textsuperscript{th} January, 2017}
+
+\begin{document}
+
+\maketitle
+
+\section{Introduction}
+
+\begin{frame}{ARM and Thumb}
+  \begin{itemize}
+    \item Widely used in embedded systems
+    \item Three instruction sets:
+      \begin{itemize}
+        \item ARM (32-bit)
+        \item Thumb (16-bit)
+        \item Thumb-2 (mixture)
+      \end{itemize}
+    \item Thumb-2 advantages:
+      \begin{itemize}
+        \item Smaller code than ARM
+        \item Simpler devices than ARM
+        \item Faster than Thumb
+      \end{itemize}
+    \item Interesting differences for code generation:
+      \begin{itemize}
+        \item How to deal with instructions that do not exist in Thumb-2?
+        \item How to use as many 16-bit instructions as possible?
+      \end{itemize}
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Clean}
+  \begin{minipage}{.65\linewidth} % left column: bullet points
+    \begin{itemize}
+      \item Purely functional, lazy programming language
+      \item Compilation in several steps
+      \item Already had an ARM code generator
+      \item Made one for Thumb-2
+    \end{itemize}
+  \end{minipage}% no space between the minipages, so that .65 + .35 fit on one line
+  \begin{minipage}{.35\linewidth} % right column: compilation pipeline diagram
+    \centering
+    \footnotesize
+    \begin{tikzpicture}[every node/.style={rectangle,draw},every path/.style={draw},->]
+      \node (clean) {Clean};
+      \node[below of=clean] (core) {Core Clean};
+      \node[below of=core] (abc) {ABC-code};
+      \node[below of=abc] (abstr) {Abstract Von-Neumann};
+      \node[below of=abstr] (target) {Target machine};
+      \foreach \src/\dst in {clean/core,core/abc,abc/abstr,abstr/target} \path (\src) -- (\dst); % one path per step, so every step gets its own arrow tip
+    \end{tikzpicture}
+  \end{minipage}
+\end{frame}
+
+\section{Register allocation}
+
+\begin{frame}{Register allocation --- introduction}
+  \begin{itemize}
+    \item ARM has 16 registers; Thumb only lower eight
+    \item Higher registers can be accessed through 32-bit instructions in Thumb-2
+    \item Want to put frequently used registers in lower half
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Register allocation --- collecting data}
+  \begin{itemize}
+    \item Count register usage in the Clean compiler:
+  \end{itemize}
+
+  \begin{center}
+    \begin{tikzpicture}
+      \scriptsize
+      \begin{axis} % horizontal bar chart: one bar per register, sorted by usage count
+        [ xbar
+        , bar width=.5em
+        , xmin=0
+        , height=0.6\textheight
+        , symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3}
+        , ytick=data
+        , scaled x ticks=real:1
+        , xtick scale label code/.code={} % print no common scale factor next to the x axis
+        , axis lines*=left
+        ]
+        \addplot coordinates {
+          (378618,S0)
+          (270274,A ptr.)
+          (218018,A0)
+          (166284,Heap ptr.)
+          (152821,A1)
+          (110390,B0)
+          (107481,A2)
+          (102640,B ptr.)
+          ( 64496,B1)
+          ( 55526,S1)
+          ( 41092,B2)
+          ( 25924,B3)
+          ( 15699,B4)
+          ( 14930,Heap ctr.)
+          (  9413,A3)
+        };
+        \draw ($({axis cs:85000,S0})-(0,2em)$) -- ($({axis cs:85000,A3})+(0,2em)$); % vertical line at x = 85000: the eight most-used registers lie above it, the other seven below
+      \end{axis}
+    \end{tikzpicture}
+  \end{center}
+\end{frame}
+
+\begin{frame}{Register allocation --- the foreign function interface}
+  \begin{itemize}
+    \item ARM, Inc.\ describes how function calls should take place
+    \item Need to adhere to the specification to be able to link with C
+    \item Lowest four registers are not preserved upon a function call
+    \item Trade-off: code size vs.\ efficient foreign function interface
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Register allocation --- results}
+  \begin{center}
+    \begin{tikzpicture}
+      \begin{axis} % three horizontal bars at y = 0; each is labelled with the bracketed text given after its coordinate
+        [ xbar
+        , width=.7\textwidth
+        , height=.5\textheight
+        , xmin=0, xmax=6000000
+        , xlabel={Bytes}
+        , ytick=\empty
+        , reverse stacked plots
+        , ymin=-1, ymax=1
+        , axis lines*=left
+        , nodes near coords, nodes near coords align=west
+        , reverse legend
+        , every axis plot/.append style={point meta=explicit symbolic}
+        , legend style={at={(0.5,-0.5)},anchor=north,legend columns=-1}
+        ]
+        \addplot coordinates { (3827868,0) [81.6\%] }; % label: size relative to the 4692628 bytes of the last bar
+        \addplot coordinates { (4385964,0) [93.5\%] };
+        \addplot coordinates { (4692628,0) [100\%] };
+        \legend{Thumb (Code size), Thumb (FFI), ARM}
+      \end{axis}
+    \end{tikzpicture}
+  \end{center}
+\end{frame}
+
+\section{Results}
+
+\begin{frame}{Results}
+  \begin{itemize}
+    \item Benchmarks with Clean's \enquote{small examples}
+    \item Code size decrease: $\overline{x}=17.3\%, \sigma=6.3\text{pp.}$\\
+      Without tiny programs: $\overline{x}=21.0\%, \sigma=2.6\text{pp.}$
+    \item Running time increase: $\overline{x}=4.8\%, \sigma=3.1\text{pp.}$\\
+      Without tiny programs: $\overline{x}=3.7\%, \sigma=2.04\text{pp.}$
+    \item Subroutine calls are expensive,
+      so Thumb-2 performance is worse for highly recursive programs
+  \end{itemize}
+\end{frame}
+
+\begin{frame}{Matching programs and instruction sets?}
+  \begin{center}
+    \begin{tikzpicture}
+      \begin{axis} % scatter plot: size decrease (x) against running time increase (y), one mark per table row
+        [ width=.8\textwidth
+        , xlabel={Size decrease (\%)}
+        , ylabel={Running time increase (\%)}
+        , mark options={scale=2}
+        , scatter/classes={%
+            a={mark=x,draw=red},
+            b={mark=+,draw=black}}
+        ]
+        \addplot[scatter,only marks,scatter src=explicit symbolic]
+          table[meta=label,row sep=crcr] {
+            x    y    label\\
+            12.8 11.9 a\\
+            18.6  3.6 b\\
+             9.8  5.1 a\\
+            18.9  5.9 b\\
+            18.9  3.7 b\\
+            20.1  5.4 a\\
+            22.9  0.9 a\\
+             0.0  2.6 a\\
+            20.0  5.2 b\\
+            22.0  0.0 b\\
+            12.1  8.0 a\\
+          };
+      \end{axis}
+    \end{tikzpicture}
+  \end{center}
+\end{frame}
+
+\section{Further work}
+
+\begin{frame}{Further work}
+  \begin{itemize}
+    \item Branch optimisation (up to 1.2pp.\ code size win)
+    \item Subroutine calls need extra instructions on Thumb-2;
+      optimise this to save up to 4pp.\ on code size
+  \end{itemize}
+
+  Overall expected code size win: $\approx$25\% (comparable to GCC)
+
+  \begin{itemize}
+    \item Copying collector needs some fine-tuning
+    \item Two other garbage collectors not considered yet
+  \end{itemize}
+\end{frame}
+
+\end{document}