Presentation

author: Camil Staps 2017-01-09 09:38:23 +0100
committer: Camil Staps 2017-01-09 09:38:23 +0100
commit: 6c60b567ea734a077a4b416e667d598b63add7d3 (patch)
tree: a6c41b296e4c21100d3909bd8ea575c8dae488d8 /presentation
parent: Change order appendices, fix organisation section (diff)
1 files changed, 222 insertions, 0 deletions
diff --git a/presentation/pres.tex b/presentation/pres.tex
new file mode 100644
index 0000000..70aa594
--- /dev/null
+++ b/presentation/pres.tex
@@ -0,0 +1,222 @@
+\documentclass[british]{beamer}
+
+\usepackage[british]{babel}
+\usepackage[babel=true]{csquotes}
+
+\usepackage{tikz}
+\usetikzlibrary{calc}
+\usepackage{pgfplots}
+
+\title{Code generation for the Thumb-2 instruction set}
+\author[Camil Staps]{
+	Camil Staps\\[1em]\small{
+		\emph{Supervisors:}\\
+		prof. dr. dr.h.c. ir. M.J. Plasmeijer\\
+		drs. J.H.G. van Groningen}}
+\date{Monday 9\textsuperscript{th} January, 2017}
+
+\begin{document}
+
+\maketitle
+
+\section{Introduction}
+
+\begin{frame}{ARM and Thumb}
+	\begin{itemize}
+		\item Widely used in embedded systems
+		\item Three instruction sets:
+			\begin{itemize}
+				\item ARM (32-bit)
+				\item Thumb (16-bit)
+				\item Thumb-2 (mixture)
+			\end{itemize}
+		\item Thumb-2 advantages:
+			\begin{itemize}
+				\item Smaller code than ARM
+				\item Simpler devices than ARM
+				\item Faster than Thumb
+			\end{itemize}
+		\item Interesting differences for code generation:
+			\begin{itemize}
+				\item How to deal with instructions that do not exist in Thumb-2?
+				\item How to use as many 16-bit instructions as possible?
+			\end{itemize}
+	\end{itemize}
+\end{frame}
+
+\begin{frame}{Clean}
+	\begin{minipage}{.65\linewidth}
+		\begin{itemize}
+			\item Purely functional, lazy programming language
+			\item Compilation in several steps
+			\item Already had an ARM code generator
+			\item Made one for Thumb-2
+		\end{itemize}
+	\end{minipage}%
+	\begin{minipage}{.35\linewidth}
+		\centering
+		\footnotesize
+		\begin{tikzpicture}[every node/.style={rectangle,draw},every path/.style={draw},->]
+			\node (clean) {Clean};
+			\node[below of=clean] (core) {Core Clean};
+			\node[below of=core] (abc) {ABC-code};
+			\node[below of=abc] (abstr) {Abstract Von-Neumann};
+			\node[below of=abstr] (target) {Target machine};
+			\path (clean) -- (core) -- (abc) -- (abstr) -- (target);
+		\end{tikzpicture}
+	\end{minipage}
+\end{frame}
+
+\section{Register allocation}
+
+\begin{frame}{Register allocation --- introduction}
+	\begin{itemize}
+		\item ARM has 16 registers; Thumb only lower eight
+		\item Higher registers can be accessed through 32-bit instructions in Thumb-2
+		\item Want to put frequently used registers in lower half
+	\end{itemize}
+\end{frame}
+
+\begin{frame}{Register allocation --- collecting data}
+	\begin{itemize}
+		\item Count register usage in the Clean compiler:
+	\end{itemize}
+
+	\begin{center}
+	\begin{tikzpicture}
+		\scriptsize
+		\begin{axis}
+			[ xbar
+			, bar width=.5em
+			, xmin=0
+			, height=0.6\textheight
+			, symbolic y coords={S0,A ptr.,A0,Heap ptr.,A1,B0,A2,B ptr.,B1,S1,B2,B3,B4,Heap ctr.,A3}
+			, ytick=data
+			, scaled x ticks=real:1
+			, xtick scale label code/.code={}
+			, axis lines*=left
+			, compat={1.3}
+			]
+			\addplot coordinates {
+				(378618,S0)
+				(270274,A ptr.)
+				(218018,A0)
+				(166284,Heap ptr.)
+				(152821,A1)
+				(110390,B0)
+				(107481,A2)
+				(102640,B ptr.)
+				( 64496,B1)
+				( 55526,S1)
+				( 41092,B2)
+				( 25924,B3)
+				( 15699,B4)
+				( 14930,Heap ctr.)
+				(  9413,A3)
+			};
+			\draw ($({axis cs:85000,S0})-(0,2em)$) -- ($({axis cs:85000,A3})+(0,2em)$);
+		\end{axis}
+	\end{tikzpicture}
+\end{center}
+\end{frame}
+
+\begin{frame}{Register allocation --- the foreign function interface}
+	\begin{itemize}
+		\item ARM, Inc. describes how function calls should take place
+		\item Need to adhere to the specification to be able to link with C
+		\item Lowest four registers are not preserved upon a function call
+		\item Trade-off: code size vs. efficient foreign function interface
+	\end{itemize}
+\end{frame}
+
+\begin{frame}{Register allocation --- results}
+	\begin{center}
+		\begin{tikzpicture}
+			\begin{axis}
+				[ xbar
+				, width=.7\textwidth
+				, height=.5\textheight
+				, xmin=0, xmax=6000000
+				, xlabel={Bytes}
+				, ytick=\empty
+				, reverse stacked plots
+				, ymin=-1, ymax=1
+				, axis lines*=left
+				, nodes near coords, nodes near coords align=west
+				, reverse legend
+				, every axis plot/.append style={point meta=explicit symbolic}
+				, legend style={at={(0.5,-0.5)},anchor=north,legend columns=-1}
+				]
+				\addplot coordinates { (3827868,0) [81.6\%] };
+				\addplot coordinates { (4385964,0) [93.5\%] };
+				\addplot coordinates { (4692628,0) [100\%] };
+				\legend{Thumb (Code size), Thumb (FFI), ARM}
+			\end{axis}
+		\end{tikzpicture}
+	\end{center}
+\end{frame}
+
+\section{Results}
+
+\begin{frame}{Results}
+	\begin{itemize}
+		\item Benchmarks with Clean's \enquote{small examples}
+		\item Code size decrease: $\overline{x}=17.3\%, \sigma=6.3\text{pp.}$\\
+			Without tiny programs: $\overline{x}=21.0\%, \sigma=2.6\text{pp.}$
+		\item Running time increase: $\overline{x}=4.8\%, \sigma=3.1\text{pp.}$\\
+			Without tiny programs: $\overline{x}=3.7\%, \sigma=2.04\text{pp.}$
+		\item Subroutine calls are expensive,
+			so Thumb-2 performance is worse for highly recursive programs
+	\end{itemize}
+\end{frame}
+
+\begin{frame}{Matching programs and instruction sets?}
+	\begin{center}
+		\begin{tikzpicture}
+			\begin{axis}
+				[ width=.8\textwidth
+				, xlabel={Size decrease (\%)}
+				, ylabel={Running time increase (\%)}
+				, mark options={scale=2}
+				, scatter/classes={%
+					a={mark=x,draw=red},
+					b={mark=+,draw=black}}
+				]
+				\addplot[scatter,only marks,scatter src=explicit symbolic]
+					table[meta=label,row sep=crcr] {
+						x     y    label\\
+						12.8  11.9 a\\
+						18.6  3.6  b\\
+						9.8   5.1  a\\
+						18.9  5.9  b\\
+						18.9  3.7  b\\
+						20.1  5.4  a\\
+						22.9  0.9  a\\
+						0.0   2.6  a\\
+						20.0  5.2  b\\
+						22.0  0.0  b\\
+						12.1  8.0  a\\
+					};
+			\end{axis}
+		\end{tikzpicture}
+	\end{center}
+\end{frame}
+
+\section{Further work}
+
+\begin{frame}{Further work}
+	\begin{itemize}
+		\item Branch optimisation (up to 1.2pp. code size win)
+		\item Subroutine calls need extra instructions on Thumb-2;
+			optimise this to save up to 4pp. on code size
+	\end{itemize}
+
+	Overall expected code size win: $\approx$25\% (comparable to GCC)
+
+	\begin{itemize}
+		\item Copying collector needs some fine-tuning
+		\item Two other garbage collectors not considered yet
+	\end{itemize}
+\end{frame}
+
+\end{document}
author	Camil Staps	2017-01-09 09:38:23 +0100
committer	Camil Staps	2017-01-09 09:38:23 +0100
commit	6c60b567ea734a077a4b416e667d598b63add7d3 (patch)
tree	a6c41b296e4c21100d3909bd8ea575c8dae488d8 /presentation
parent	Change order appendices, fix organisation section (diff)