\documentclass[british]{beamer} \usepackage[british]{babel} \usepackage[babel=true]{csquotes} \usepackage{tikz} \usetikzlibrary{calc} \usepackage{pgfplots} \title{Code generation for the Thumb-2 instruction set} \author[Camil Staps]{ Camil Staps\\[1em]\small{ \emph{Supervisors:}\\ prof. dr. dr.h.c. ir. M.J. Plasmeijer\\ drs. J.H.G. van Groningen}} \date{Monday 9\textsuperscript{th} January, 2017} \begin{document} \maketitle \section{Introduction} \begin{frame}{ARM and Thumb} \begin{itemize} \item Widely used in embedded systems \item Three instruction sets: \begin{itemize} \item ARM (32-bit) \item Thumb (16-bit) \item Thumb-2 (mixture) \end{itemize} \item Thumb-2 advantages: \begin{itemize} \item Smaller code than ARM \item Simpler devices than ARM \item Faster than Thumb \end{itemize} \item Interesting differences for code generation: \begin{itemize} \item How to deal with instructions that do not exist in Thumb-2? \item How to use as many 16-bit instructions as possible? \end{itemize} \end{itemize} \end{frame} \begin{frame}{Clean} \begin{minipage}{.65\linewidth} \begin{itemize} \item Purely functional, lazy programming language \item Compilation in several steps \item Already had an ARM code generator \item Made one for Thumb-2 \end{itemize} \end{minipage}% \begin{minipage}{.35\linewidth} \centering \footnotesize \begin{tikzpicture}[every node/.style={rectangle,draw},every path/.style={draw},->] \node (clean) {Clean}; \node[below of=clean] (core) {Core Clean}; \node[below of=core] (abc) {ABC-code}; \node[below of=abc] (abstr) {Abstract Von-Neumann}; \node[below of=abstr] (target) {Target machine}; \path (clean) -- (core) -- (abc) -- (abstr) -- (target); \end{tikzpicture} \end{minipage} \end{frame} \section{Register allocation} \begin{frame}{Register allocation --- introduction} \begin{itemize} \item ARM has 16 registers; Thumb only lower eight \item Higher registers can be accessed through 32-bit instructions in Thumb-2 \item Want to put frequently used registers in lower half \end{itemize} \end{frame} \begin{frame}{Register allocation --- collecting data} \begin{itemize} \item Count register usage in the Clean compiler: \end{itemize} \begin{center} \begin{tikzpicture} \scriptsize \begin{axis} [ xbar , bar width=.5em , xmin=0 , height=0.6\textheight , symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3} , ytick=data , scaled x ticks=real:1 , xtick scale label code/.code={} , axis lines*=left , compat={1.3} ] \addplot coordinates { (378618,S0) (270274,A ptr.) (218018,A0) (166284,Heap ptr.) (152821,A1) (110390,B0) (107481,A2) (102640,B ptr.) ( 64496,B1) ( 55526,S1) ( 41092,B2) ( 25924,B3) ( 15699,B4) ( 14930,Heap ctr.) ( 9413,A3) }; \draw ($({axis cs:85000,S0})-(0,2em)$) -- ($({axis cs:85000,A3})+(0,2em)$); \end{axis} \end{tikzpicture} \end{center} \end{frame} \begin{frame}{Register allocation --- the foreign function interface} \begin{itemize} \item ARM, Inc. describes how function calls should take place \item Need to adhere to the specification to be able to link with C \item Lowest four registers are not preserved upon a function call \item Trade-off: code size vs. efficient foreign function interface \end{itemize} \end{frame} \begin{frame}{Register allocation --- results} \begin{center} \begin{tikzpicture} \begin{axis} [ xbar , width=.7\textwidth , height=.5\textheight , xmin=0, xmax=6000000 , xlabel={Bytes} , ytick=\empty , reverse stacked plots , ymin=-1, ymax=1 , axis lines*=left , nodes near coords, nodes near coords align=west , reverse legend , every axis plot/.append style={point meta=explicit symbolic} , legend style={at={(0.5,-0.5)},anchor=north,legend columns=-1} ] \addplot coordinates { (3827868,0) [81.6\%] }; \addplot coordinates { (4385964,0) [93.5\%] }; \addplot coordinates { (4692628,0) [100\%] }; \legend{Thumb (Code size), Thumb (FFI), ARM} \end{axis} \end{tikzpicture} \end{center} \end{frame} \section{Results} \begin{frame}{Results} \begin{itemize} \item Benchmarks with Clean's \enquote{small examples} \item Code size decrease: $\overline{x}=17.3\%, \sigma=6.3\text{pp.}$\\ Without tiny programs: $\overline{x}=21.0\%, \sigma=2.6\text{pp.}$ \item Running time increase: $\overline{x}=4.8\%, \sigma=3.1\text{pp.}$\\ Without tiny programs: $\overline{x}=3.7\%, \sigma=2.04\text{pp.}$ \item Subroutine calls are expensive, so Thumb-2 performance is worse for highly recursive programs \end{itemize} \end{frame} \begin{frame}{Matching programs and instruction sets?} \begin{center} \begin{tikzpicture} \begin{axis} [ width=.8\textwidth , xlabel={Size decrease (\%)} , ylabel={Running time increase (\%)} , mark options={scale=2} , scatter/classes={% a={mark=x,draw=red}, b={mark=+,draw=black}} ] \addplot[scatter,only marks,scatter src=explicit symbolic] table[meta=label,row sep=crcr] { x y label\\ 12.8 11.9 a\\ 18.6 3.6 b\\ 9.8 5.1 a\\ 18.9 5.9 b\\ 18.9 3.7 b\\ 20.1 5.4 a\\ 22.9 0.9 a\\ 0.0 2.6 a\\ 20.0 5.2 b\\ 22.0 0.0 b\\ 12.1 8.0 a\\ }; \end{axis} \end{tikzpicture} \end{center} \end{frame} \section{Further work} \begin{frame}{Further work} \begin{itemize} \item Branch optimisation (up to 1.2pp. code size win) \item Subroutine calls need extra instructions on Thumb-2; optimise this to save up to 4pp. on code size \end{itemize} Overall expected code size win: $\approx$25\% (comparable to GCC) \begin{itemize} \item Copying collector needs some fine-tuning \item Two other garbage collectors not considered yet \end{itemize} \end{frame} \end{document}