interpreter/lex.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

#include "lex.h"

#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include "mem.h"

/* True when `input` is a tab, space, line feed or carriage return. */
static inline bool is_space_char(char input) {
	switch (input) {
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			return true;
		default:
			return false;
	}
}

/* True when `input` is one of the decimal digits '0' through '9'. */
static inline bool is_int_char(char input) {
	/*
	 * Single range check: anything below '0' yields a negative difference,
	 * which becomes a huge unsigned value and so fails the comparison too.
	 */
	return (unsigned int) (input - '0') < 10u;
}

/*
 * The number of bytes that should be read to read an integer.
 *
 * Counts the leading decimal digits of the NUL-terminated string `input`.
 * The return type is only 8 bits wide, so the count saturates at UCHAR_MAX
 * instead of wrapping around: a run of 256 digits must not be reported as
 * a run of length 0.
 */
unsigned char lex_int_length(char *input) {
	unsigned char n = 0;
	while (n < UCHAR_MAX && is_int_char(input[n])) n++;
	return n;
}

/* True when `input` may appear in a name: 'a'..'z', 'A'..'Z' or '_'. */
static inline bool is_name_char(char input) {
	if (input == '_')
		return true;
	if (input >= 'a' && input <= 'z')
		return true;
	return input >= 'A' && input <= 'Z';
}

/*
 * The number of bytes that should be read to read a name.
 *
 * Counts the leading name characters (letters and '_') of the
 * NUL-terminated string `input`. The return type is only 8 bits wide, so
 * the count saturates at UCHAR_MAX instead of wrapping around: a run of
 * 256 name characters must not be reported as a run of length 0.
 */
unsigned char lex_name_length(char *input) {
	unsigned char n = 0;
	while (n < UCHAR_MAX && is_name_char(input[n])) n++;
	return n;
}

/*
 * Split the NUL-terminated string `input` into tokens and append them to
 * `list`.
 *
 * list:  token list to append to, or NULL to have a new one allocated with
 *        room for INITIAL_TOKEN_LIST_SIZE tokens. The list is grown by
 *        doubling, so the returned pointer may differ from the one passed
 *        in; callers must always continue with the return value.
 * input: text to tokenise. Whitespace and `//` comments (up to the end of
 *        the line or of the input) separate tokens and produce none.
 *
 * Tokens produced:
 *   - single characters ; : ( ) [ ] = ,
 *   - the keywords `code` and `import`, recognised only when immediately
 *     followed by a whitespace character
 *   - TOKEN_INT:  a run of decimal digits; `var` points to a heap-allocated
 *                 int holding the value
 *   - TOKEN_NAME: a run of letters and underscores; `var` points to a
 *                 heap-allocated NUL-terminated copy of the name
 *
 * Returns the (possibly reallocated) list. If `input` is empty or contains
 * only whitespace, `list` is returned untouched (which may be NULL). On an
 * unrecognised character, or an integer literal that does not fit in an
 * int, the list is released with free_token_list() and my_free() and NULL
 * is returned; the caller's old list pointer is then invalid.
 */
struct token_list *lex(struct token_list *list, char *input) {
	while (*input && is_space_char(*input)) input++;
	if (*input == 0)
		return list;

	if (!list) {
		list = my_calloc(1,
				sizeof(struct token_list) +
				INITIAL_TOKEN_LIST_SIZE * sizeof(struct token) + 1);
		list->length = INITIAL_TOKEN_LIST_SIZE;
		list->index = 0;
	}

	while (*input) {
		/* Bytes of input consumed by the token lexed in this iteration. */
		size_t consumed = 1;

		/* Separators never create a token, so handle them up front. */
		if (is_space_char(*input)) {
			input++;
			continue;
		}

		if (input[0] == '/' && input[1] == '/') {
			/*
			 * Stop on the newline *or* the terminator: a comment on the
			 * last line need not end with '\n'.
			 */
			while (*input && *input != '\n') input++;
			continue;
		}

		/* From here on a token is always produced (or lexing fails). */
		if (list->index >= list->length) {
			list = my_realloc(list,
					sizeof(struct token_list) +
					2 * list->length * sizeof(struct token) + 1);
			list->length *= 2;
		}

		list->elems[list->index].var = NULL;

		switch (*input) {
			case ';': list->elems[list->index].kind = TOKEN_SEMICOLON; break;
			case ':': list->elems[list->index].kind = TOKEN_COLON;     break;
			case '(': list->elems[list->index].kind = TOKEN_OPEN_P;    break;
			case ')': list->elems[list->index].kind = TOKEN_CLOSE_P;   break;
			case '[': list->elems[list->index].kind = TOKEN_OPEN_SQ;   break;
			case ']': list->elems[list->index].kind = TOKEN_CLOSE_SQ;  break;
			case '=': list->elems[list->index].kind = TOKEN_EQUALS;    break;
			case ',': list->elems[list->index].kind = TOKEN_COMMA;     break;
			default:
				/*
				 * The keyword checks rely on && short-circuiting: the first
				 * mismatch (including the NUL terminator) stops the chain,
				 * so nothing past the end of the string is read.
				 */
				if (input[0] == 'c' && input[1] == 'o' &&
						input[2] == 'd' && input[3] == 'e' &&
						is_space_char(input[4])) {
					list->elems[list->index].kind = TOKEN_CODE;
					consumed = 4;
				} else if (input[0] == 'i' && input[1] == 'm' &&
						input[2] == 'p' && input[3] == 'o' &&
						input[4] == 'r' && input[5] == 't' &&
						is_space_char(input[6])) {
					list->elems[list->index].kind = TOKEN_IMPORT;
					consumed = 6;
				} else if (is_int_char(*input)) {
					char *end;
					long value;

					/*
					 * input starts with a digit, so strtol consumes exactly
					 * the digit run (no sign, no leading whitespace) and
					 * `end` marks where it stops.
					 */
					errno = 0;
					value = strtol(input, &end, 10);
					if (errno == ERANGE || value > INT_MAX)
						goto fail;

					list->elems[list->index].kind = TOKEN_INT;
					list->elems[list->index].var = my_calloc(1, sizeof(int));
					*((int*) list->elems[list->index].var) = (int) value;
					consumed = (size_t) (end - input);
				} else if (is_name_char(*input)) {
					/* Counted in a size_t so long names cannot wrap to 0. */
					size_t len = 0;
					while (is_name_char(input[len])) len++;

					list->elems[list->index].kind = TOKEN_NAME;
					/* Zeroed buffer of len + 1 bytes: already terminated. */
					list->elems[list->index].var = my_calloc(1, len + 1);
					memcpy(list->elems[list->index].var, input, len);
					consumed = len;
				} else {
					goto fail;
				}
				break;
		}

		input += consumed;
		list->index++;
	}

	return list;

fail:
	free_token_list(list);
	my_free(list);
	return NULL;
}