9 files changed, 274 insertions, 42 deletions
diff --git a/thesis/abbr.tex b/thesis/abbr.tex
index 6124917..0c172ef 100644
--- a/thesis/abbr.tex
+++ b/thesis/abbr.tex
@@ -1,11 +1,20 @@
 \section{Abbreviations}
 \label{sec:abbreviations}
 
-\begin{multicols}{2}
-
 \begin{description}
+	\item[HNF]
+		Head normal form, the state of a graph node that cannot be rewritten itself (though it children may).
+	\item[ISA]
+		Instruction set architecture.
+	\item[LSB(s)]
+		Least significant bit(s), the bit(s) with the lowest value in a word.
 	\item[PC]
 		Program counter.
+	\item[RTS]
+		Run-time system.
+		Clean's run-time system ensures that the \clean{Start} node is reduced and printed.
+		It also takes care of garbage collecting.
+	\item[UAL]
+		Unified Assembly Language.
+		%todo
 \end{description}
-
-\end{multicols}
diff --git a/thesis/approach.tex b/thesis/approach.tex
deleted file mode 100644
index 0034035..0000000
--- a/thesis/approach.tex
+++ /dev/null
@@ -1,17 +0,0 @@
-\section{Approach}
-\label{sec:approach}
-
-\begin{multicols}{2}
-
-\subsection{Interworking}
-\label{sec:approach:interworking}
-It is possible to mix ARM and Thumb instructions:
-\enquote{[t]he ARM and Thumb instruction sets are designed to \emph{interwork} freely}~\citep{armv7m}.
-We could therefore decide to generate code for some sections using ARM instructions, and for other using Thumb instructions.
-This could give better end results,
-	if the code generator would recognise which instruction set is best in what case.
-However, some architectures, like ARMv7-M, only support Thumb instructions.
-I choose to look at Thumb instructions only,
-	so that the widest range of architectures possible can be supported.
-
-\end{multicols}
diff --git a/thesis/clean.sty b/thesis/clean.sty
new file mode 100644
index 0000000..2451bbd
--- /dev/null
+++ b/thesis/clean.sty
@@ -0,0 +1,71 @@
+\RequirePackage{minted}
+\newmintinline[clean]{clean}{style=bw}
+
+\RequirePackage{tikz}
+\pgfkeys{/tikz/node args/.store in=\clean@args}
+\pgfkeys{/tikz/clean/.style={%
+		every node/.style={
+			clean node,
+			node args=0,
+			draw,
+			minimum height=1.5em
+		},
+		->,
+		node distance=1em
+}}
+
+\newdimen\clean@ya
+\newdimen\clean@yb
+\newdimen\clean@x
+\pgfdeclareshape{clean node}{
+	\inheritsavedanchors[from=rectangle]
+	\inheritanchor[from=rectangle]{west}
+	\inheritanchor[from=rectangle]{north}
+	\inheritanchor[from=rectangle]{east}
+	\inheritanchor[from=rectangle]{south}
+	\inheritanchor[from=rectangle]{north west}
+	\inheritanchor[from=rectangle]{north east}
+	\inheritanchor[from=rectangle]{south west}
+	\inheritanchor[from=rectangle]{south east}
+	\anchor{arg1}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 3pt}
+	\anchor{arg2}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 9pt}
+	\anchor{arg3}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 15pt}
+	\anchor{arg1.west}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 0pt}
+	\anchor{arg2.west}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 6pt}
+	\anchor{arg3.west}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 12pt}
+	\anchor{arg1.north}{\pgf@process{\northeast}\advance\pgf@x by 3pt}
+	\anchor{arg2.north}{\pgf@process{\northeast}\advance\pgf@x by 9pt}
+	\anchor{arg3.north}{\pgf@process{\northeast}\advance\pgf@x by 15pt}
+	\anchor{arg1.east}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 6pt}
+	\anchor{arg2.east}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 12pt}
+	\anchor{arg3.east}{\pgf@process{\northeast}\pgf@xa=\pgf@x\pgf@process{\pgf@anchor@rectangle@center}\pgf@x=\pgf@xa\advance\pgf@x by 18pt}
+	\anchor{arg1.south}{\pgf@process{\southwest}\pgf@ya=\pgf@y\pgf@process{\northeast}\pgf@y=\pgf@ya\advance\pgf@x by 3pt}
+	\anchor{arg2.south}{\pgf@process{\southwest}\pgf@ya=\pgf@y\pgf@process{\northeast}\pgf@y=\pgf@ya\advance\pgf@x by 9pt}
+	\anchor{arg3.south}{\pgf@process{\southwest}\pgf@ya=\pgf@y\pgf@process{\northeast}\pgf@y=\pgf@ya\advance\pgf@x by 15pt}
+	\anchor{center}{\pgfpointorigin}
+	\backgroundpath{%
+    \pgfpathrectanglecorners
+  	  {\pgfpointadd
+				{\southwest}
+				{\pgfpoint{\pgfkeysvalueof{/pgf/outer xsep}}{\pgfkeysvalueof{/pgf/outer ysep}}}}
+  	  {\pgfpointadd
+				{\pgfpoint{\clean@args *6pt}{0}}
+				{\pgfpointadd
+					{\northeast}
+					{\pgfpointscale{-1}{\pgfpoint{\pgfkeysvalueof{/pgf/outer xsep}}{\pgfkeysvalueof{/pgf/outer ysep}}}}}}
+		\pgfusepathqstroke
+		\ifnum\clean@args>0
+			\southwest\clean@ya=\pgf@y
+			\northeast\clean@yb=\pgf@y
+			\clean@x=\pgf@x
+			\foreach\arg in {1,...,\clean@args}{%
+				\pgfpathmoveto{\pgfpoint{\clean@x}{\clean@yb}}
+				\pgfpathlineto{\pgfpoint{\clean@x}{\clean@ya}}
+				\pgfusepathqstroke
+				\pgfpathcircle{\pgfpointadd{\pgfpoint{3pt}{.5\clean@yb}}{\pgfpoint{\clean@x}{.5\clean@ya}}}{1pt}
+				\pgfusepathqfillstroke
+				\global\advance\clean@x by 6pt
+			}
+		\fi
+	}
+}
diff --git a/thesis/differences.tex b/thesis/differences.tex
deleted file mode 100644
index c89e683..0000000
--- a/thesis/differences.tex
+++ /dev/null
@@ -1,4 +0,0 @@
-\section{Differences}
-\label{sec:differences}
-
-\input{fix-storing-pc}
diff --git a/thesis/fix-storing-pc.tex b/thesis/fix-storing-pc.tex
index fbc5853..31381f0 100644
--- a/thesis/fix-storing-pc.tex
+++ b/thesis/fix-storing-pc.tex
@@ -1,9 +1,12 @@
-\subsection{Storing the program counter}
+\section{Storing the program counter}
 \label{sec:fix-storing-pc}
+
+\begin{multicols}{2}
+
 The first issue I ran into is that of storing the program counter.
 Storing the program counter on the stack is something done commonly in many languages during a function call.
 
-\subsubsection{ARM code example}
+\subsection{ARM code example}
 Usually, an offset to the actual program counter at the moment of the store instruction is stored,
 to allow for a branch after that instruction, and have the callee return to the address after that branch.
 The ARM architecture accommodates for this: an instruction that reads the program counter, actually reads
@@ -11,9 +14,9 @@ The ARM architecture accommodates for this: an instruction that reads the progra
 The following ARM assembly example illustrates this (\texttt{armstartup.s:540-2},~\cite{armrts}):
 
 \begin{minted}{ual}
-	str     pc,[sp,#-4]!   @ 0x20  Store PC on the stack
-	bl      init_clean     @ 0x24  Branch to init_clean
-	tst     r4,r4          @ 0x28
+	str     pc,[sp,#-4]!
+	bl      init_clean
+	tst     r4,r4
 \end{minted}
 
 \begin{lrbox}{\ualbox}
@@ -31,7 +34,7 @@ The processor branches to \ual{init_clean},
 The program counter is then \ual{0x28}.
 For the next instruction cycle, the \ual{tst} command is executed.
 
-\subsubsection{Adapting for Thumb-2}
+\subsection{Adapting for Thumb-2}
 There are two reasons why the above cannot be used in Thumb-2 code.
 
 First, \ual{pc} is not allowed as the first operand to a \ual{str} instruction.
@@ -39,10 +42,10 @@ Hence, we need to first move \ual{pc} to an auxiliary register, and then push th
 We then get:
 
 \begin{minted}{ual}
-	add     lr,pc,#9       @ 0x20  Save PC+4+9 to LR
-	str     lr,[sp,#-4]!   @ 0x24  Store LR on the stack
-	bl      init_clean     @ 0x28  Branch to init_clean
-	tst     r4,r4          @ 0x2c
+	add     lr,pc,#9
+	str     lr,[sp,#-4]!
+	bl      init_clean
+	tst     r4,r4
 \end{minted}
 
 We store the value \ual{0x2d}.
@@ -63,7 +66,7 @@ However, in this case \ual{bl} is located at \ual{0x2a}, and since this is a 32
 In hand-written code, we can solve this by adding labels for the addresses we want to store on the stack.
 In generated code, we need to keep track of the current alignment and add either 9 or 11 to the read program counter.
 
-\subsubsection{Other solutions}
+\subsection{Other solutions}
 Another solution than the one we present here is the use of the link register.
 Branch instructions as \ual{bl} store the address of the next instruction in the link register.
 We could therefore imagine a setup where the callee gets the return address from that register rather than from the stack.
@@ -75,8 +78,10 @@ It is an easier solution to have the caller responsible for storing the return a
 	which is why this approach is taken in Clean's ARM code generator\cite{armcg}
 	and why I continue along those lines.
 
-\subsubsection{Complexity analysis}
+\subsection{Complexity analysis}
 To be done. %TODO
 
 For every occurrence: +1 word code size;
 for every function call: +1 instruction.
+
+\end{multicols}
diff --git a/thesis/intro.tex b/thesis/intro.tex
index 5be42af..0b75b49 100644
--- a/thesis/intro.tex
+++ b/thesis/intro.tex
@@ -1,6 +1,56 @@
 \section{Introduction}
 \label{sec:intro}
 
+\begin{figure*}[b]
+	\centering
+	\begin{subfigure}[b]{.2\linewidth}
+		\centering
+		\begin{tikzpicture}[clean]
+			\node (start) {\clean{Start}};
+		\end{tikzpicture}
+		\caption{Initially}
+	\end{subfigure}%
+	\begin{subfigure}[b]{.2\linewidth}
+		\centering
+		\begin{tikzpicture}[clean]
+			\node[node args=1] (fac) {\clean{fac}};
+			\node[below=of fac.arg1.south] (4) {\clean{4}};
+			\draw (fac.arg1) -- (4.north);
+		\end{tikzpicture}
+		\caption{Applying \clean{Start}}
+	\end{subfigure}%
+	\begin{subfigure}[b]{.3\linewidth}
+		\centering
+		\begin{tikzpicture}[clean]
+			\node[node args=2] (times) {\clean{*}};
+			\node[below=of times.arg1.south] (4) {\clean{4}};
+			\node[node args=1,right=of times.arg2.east] (fac) {\clean{fac}};
+			\node[node args=2,right=of fac.arg1.east] (minus) {\clean{-}};
+			\node[right=of minus.arg2.east] (1) {\clean{1}};
+			\draw (times.arg1) -- (4.north);
+			\draw (times.arg2) -- (fac.west);
+			\draw (fac.arg1) -- (minus.west);
+			\draw (minus.arg1) |- (4.east);
+			\draw (minus.arg2) -- (1.west);
+		\end{tikzpicture}
+		\caption{Applying \clean{fac n}}
+	\end{subfigure}%
+	\begin{subfigure}[b]{.2\linewidth}
+		\centering
+		\begin{tikzpicture}[clean]
+			\node[node args=2] (times) {\clean{*}};
+			\node[below=of times.arg1.south] (4) {\clean{4}};
+			\node[node args=1,right=of times.arg2.east] (fac) {\clean{fac}};
+			\node[below=of fac.arg1.south] (3) {\clean{3}};
+			\draw (times.arg1) -- (4.north);
+			\draw (times.arg2) -- (fac.west);
+			\draw (fac.arg1) -- (3.north);
+		\end{tikzpicture}
+		\caption{Applying \clean{-}}
+	\end{subfigure}
+	\caption{Rewriting a Clean node\label{fig:intro:rewriting}}
+\end{figure*}
+
 \begin{multicols}{2}
 	
 \subsection{Introduction}
@@ -24,20 +74,70 @@ Starting from ARMv6T2, an extension to the Thumb instruction set, known as Thumb
 This gives the ARM and Thumb instruction sets `almost identical functionality'~\citep[A1.2]{armv7ar},
 	whereas Thumb gives a smaller code size.
 
+Using the Unified Assembly Language (UAL), one can write assembly code for both the ARM and the Thumb instruction set and fix the target ISA only at assemble-time.
+
+\label{sec:intro:interworking}
+The ARM and Thumb instruction sets are designed to \emph{interwork}:
+	different parts of a program can be assembled for different instruction sets
+	and it is possible to switch instruction set when the program counter is written~\citep[A4.1]{armv7ar}.
+The Thumb-2 code generator proposed in this thesis does not produce ARM code,
+	though the existence of the interworking facility has effects on the techniques that can be used in it.
+This will be covered in \autoref{sec:two-bits}.
+
 \subsection{Clean}
 \label{sec:intro:clean}
 Clean is `a general purpose, state-of-the-art, pure and lazy functional programming language designed for making real-world applications'~\cite{clean}.
 It evolved from LEAN, an intermediate language based on graph rewriting~\citep{lean}.
-As such, one of the steps in the compilation of a Clean program is the translation to bytecode for the ABC-machine, an abstract graph rewriting machine~\citep{execspecs}.
-A code generator is used to generate code equivalent to the ABC-code for concrete machines.
-%todo
+A Clean program consists of a list of rewrite rules, for example:
+
+\begin{minted}{clean}
+	fac 0 = 1
+	fac n = n * (fac (n - 1))
+	Start = fac 4
+\end{minted}
+
+Executing a Clean program means rewriting the \clean{Start} rule until no rewriting is possible any more.
+The first rewriting steps of the above program are shown in \autoref{fig:intro:rewriting}.
+
+A Clean program is compiled to ABC-code, a platform-independent bytecode for an abstract graph rewriting machine~\citep{execspecs}.
+A code generator is used to generate machine code equivalent to the ABC-code for concrete machines.
+This, again, is done in several steps, so that only a relatively small part of the compilation process is target-dependent.
+For adapting the ARM code generator to work for Thumb-2 one needs to have only very basic knowledge of the ABC-machine.
+For this reason we will not discuss its internals here ---
+	they have been described in \cite{execspecs} and \cite{m680x0}.
+
+After compilation, a Clean program is linked together with the Clean run-time system.
+The RTS ensures that that the \clean{Start} node is reduced and printed and takes care of garbage collecting.
+This system is written in C and assembly, so making Clean available on Thumb-2 inherently means adapting the platform-dependent parts of the RTS as well.
+
+\subsection{A Thumb backend for Clean}
+\label{sec:intro:clean-thumb}
+In this thesis, we propose a Thumb backend for Clean.
+The ARM code generator and RTS were taken as a starting point.
+Thanks to the Unified Assembly Language, only little time was needed to make these systems assemble for the Thumb instruction set.
+
+The proposed backend does not use ARM and Thumb interworking.
+The reason for this is threefold.
+First, there are several processors, like the ARMv7-M series~\citep[A4.1]{armv7m}, that do support Thumb instructions but cannot run ARM code.
+By only using Thumb, we target the widest possible range of devices.
+Second, we doubt that using interworking can be done efficiently.
+In the run-time system, only minimal time overhead is introduced by using Thumb instructions.
+For generated code it would be complicated to detect if the ARM or Thumb instruction set would give better results,
+	and this would give significantly better results only in specific cases.
+Third, the problem discussed in \autoref{sec:two-bits} could be solved efficiently only without interworking.
+Using interworking would introduce overhead at every branch instruction,
+	since the solution to this problem would have to be adapted.
 
 \subsection{Terminology}
 \label{sec:intro:terminology}
 In this thesis, I will usually write `Thumb' where the Thumb-2 extension is meant.
 Only when the distinction with pre-ARMv6T2 Thumb is important will I distinguish between (early) Thumb and Thumb-2.
+As for `ARM', it should be clear from the context whether the architecture or the instruction set is meant.
 
 For brevity, a number of common abbreviations is used.
 An overview can be found in \autoref{sec:abbreviations}.
 
+\subsection{Organisation}
+\label{sec:intro:organisation}
+
 \end{multicols}
diff --git a/thesis/thesis.bib b/thesis/thesis.bib
index fc4fa09..24b604f 100644
--- a/thesis/thesis.bib
+++ b/thesis/thesis.bib
@@ -30,3 +30,10 @@
 	school="Radboud University Nijmegen",
 	year=1990
 }
+
+@mastersthesis{m680x0,
+	title="Implementing the ABC-machine on M680x0 based architectures",
+	author="J. van Groningen",
+	school="Radboud University Nijmegen",
+	year=1990
+}
diff --git a/thesis/thesis.tex b/thesis/thesis.tex
index 09c61a9..14ef5bd 100644
--- a/thesis/thesis.tex
+++ b/thesis/thesis.tex
@@ -22,6 +22,11 @@
 \usepackage{latexgit}
 
 \usepackage{ual}
+\usepackage{clean}
+
+\usepackage{subcaption}
+\usepackage{tikz}
+\usetikzlibrary{positioning}
 
 \let\oldtexttt\texttt
 \def\texttt#1{\oldtexttt{\footnotesize #1}}
@@ -39,6 +44,8 @@
 \let\oldsection\section
 \renewcommand{\section}{\clearpage\oldsection}
 
+\raggedcolumns
+
 \title{Code generation for Thumb-2 processors}
 \author{Camil Staps}
 \date{\gitcommitdate[formatDate]}
@@ -60,8 +67,9 @@
 \cleardoublepage
 
 \input{intro}
-\input{approach}
-\input{differences}
+\input{two-bits}
+%\input{approach}
+\input{fix-storing-pc}
 
 \cleardoublepage
 \appendix
diff --git a/thesis/two-bits.tex b/thesis/two-bits.tex
new file mode 100644
index 0000000..4b08d89
--- /dev/null
+++ b/thesis/two-bits.tex
@@ -0,0 +1,53 @@
+\section{Two bits}
+\label{sec:two-bits}
+
+\begin{multicols}{2}
+
+\subsection{Introduction}
+\label{sec:two-bits:intro}
+The ARM ISA only has 32-bit instructions, meaning that the two least significant bits of a code address are always zero.
+When a branch is attempted to an address which has one or both of these bits set, the processor will automatically align the address by clearing these bits.
+
+Thumb mixes 32 and 16-bit instructions.
+Instructions are halfword-aligned, so bit 1 is part of the address.
+Additionally, bit 0 is used to facilitate ARM and Thumb interworking.
+When the PC is written to with a value where the LSB is cleared,
+	the processor jumps and switches to ARM mode.
+When the LSB is set, it switches to Thumb mode~\citep[A2.3.2]{armv7ar}.
+
+\subsection{Usage in Clean}
+\label{sec:two-bits:clean}
+The fact that in ARM mode the two lowest bits of a value written to the PC are cleared automatically has been exploited in the ARM code generator:
+	these two bits are used to store information.
+Bit 1 is used to mark a node as having reached head normal form%
+	\footnote{A node is in \emph{head} or \emph{root normal form} when it cannot be rewritten itself (though its children may)}.
+Bit 0 is used in Clean's default garbage collector, the copying collector, to record whether a node has been copied to the other semispace already:
+	when cleared (the normal state of an address as stored on the heap), it has not been copied yet.
+
+Porting to Thumb, we need to find a way to store this information without jumping to the middle of an instruction or accidentally switching to ARM mode.
+
+\subsection{Solution}
+\label{sec:two-bits:solution}
+The solution for bit 1 is straightforward.
+By aligning all code addresses that we will ever want to jump to (the node entry addresses) on words rather than half-words, we ensure that bit 1 is always cleared.
+When a node has reached head normal form, setting bit 1 will give a corrupt address.
+Jumping to it would either jump after the first instruction (if it is 16-bit) or to the middle of the first instruction (if it is 32-bit).
+However, when a node is in HNF, there is no pointing in jumping to its entry address,
+	so the corrupted address is not a problem.
+
+The Thumb backend for Clean proposed in this thesis does not use interworking.
+However, we do need to make sure that whenever the PC is written to, the LSB is set.
+To that end, we store node entry addresses with the LSB on heap and stack.
+This only introduces problems in the copying collector,
+	which will then not copy any node (since having bit 0 set means having been copied already).
+Flipping the meaning of the LSB in the copying collector fixes this issue.
+
+\subsection{Comparison}
+\label{sec:two-bits:comparison}
+Flipping the meaning of the LSB in the garbage collector amounts to swapping \ual{bne} and \ual{beq} and similar changes that do not effect the program's efficiency.
+By word-aligning all node entry addresses we lose one alignment byte per node entry address on average
+	(assuming that half of the node entry points are word-aligned already).
+This increases code size slightly, but since many instructions that were 32-bit in ARM are now 16-bit, the overall code size is still smaller.
+Aligning node entries has no effect on the program's efficiency, since the \ual{nop} instruction that is inserted above it will never be executed.
+
+\end{multicols}