Skip to content

Commit 257e141

Browse files
authored
Escape sequence (#7)
1 parent: 21a8cfe · commit: 257e141

3 files changed

Lines changed: 168 additions & 202 deletions

File tree

c-source.c

Lines changed: 77 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -14,10 +14,11 @@
1414
#define IS_LETTER(C) ((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z'))
1515
#define IS_DIGIT(C) (C >= '0' && C <= '9')
1616
#define IS_HEX(C) (IS_DIGIT(C) || (C >= 'A' && C <= 'F'))
17-
#define IS_WHITESPACE(C) (C == ' ' || C == 9 || C == 10 || C == 13)
17+
#define IS_WHITESPACE(C) (C == ' ' || C == '\t' || C == '\n' || C == '\r')
1818
#define IS_PUNCT(P, A, B) (*P == A && P[1] == B)
1919
#define IS_TYPE(KIND) (KIND >= KW_int && KIND <= KW_void)
2020
#define ALIGN(x) ((x + 3) & -4)
21+
// @TODO: refactor error
2122
#define COMPILE_ERROR(...) { printf(__VA_ARGS__); exit(1); }
2223
#define PUSH(REG, VAL) instruction(Push | (REG << 24), VAL)
2324
#define POP(REG) instruction(Pop | (REG << 8), 0)
@@ -37,10 +38,10 @@
3738
#define CALL_ATTRIB(IDX, ATTRIB) g_calls[((IDX) * CallSize) + ATTRIB]
3839
#define OP(op, dest, src1, src2) ((op) | (dest << 8) | (src1 << 16) | (src2 << 24))
3940

40-
#define MAX_PRINF_ARGS 8
41+
#define MAX_PRINF_ARGS (8)
4142
#define CHUNK_SIZE (1 << 27)
42-
#define MAX_SCOPE 128
43-
#define MAX_CALLS 1024
43+
#define MAX_SCOPE (128)
44+
#define MAX_CALLS (1024)
4445

4546
enum { Undefined, Global, Param, Local, Func, Const };
4647
enum { EAX = 1, EBX, ECX, EDX, ESP, EBP, IMME };
@@ -70,8 +71,7 @@ int strlen(char* p) {
7071
}
7172
#pragma endregion utils
7273

73-
#pragma region token
74-
74+
//---------------------------------- TOKEN ----------------------------------//
7575
enum {
7676
_TK_START = 128, // 0-127 is reserved for ascii
7777
TK_INT, // int
@@ -111,6 +111,7 @@ enum {
111111
};
112112

113113
// @TODO: implement struct. Use enum and array to mimic array of struct for now
114+
#define GET_TK_FIELD(IDX, ATTRIB) (g_token_buffer[((IDX) * _TkFieldCount) + ATTRIB])
114115
enum {
115116
TkFieldKind,
116117
TkFieldValue, // store the value of token if char or int
@@ -121,19 +122,16 @@ enum {
121122
_TkFieldCount,
122123
};
123124

124-
int* g_token_buffer, // global int array to hold token information
125-
g_token_idx; // global index of current token
126-
127-
#define GET_TK_FIELD(IDX, ATTRIB) (g_token_buffer[((IDX) * _TkFieldCount) + ATTRIB])
125+
int* g_token_buffer; // global int array to hold token information
128126

129127
void check_if_token_keyword(int token_idx) {
130128
char* keywords = "int\0 char\0 void\0 break\0 continue\0"
131129
"else\0 enum\0 if\0 return\0 while\0 "
132130
"printf\0 fopen\0 fgetc\0 calloc\0 memset\0 "
133131
"exit\0 ";
134132

135-
int start = GET_TK_FIELD(token_idx, TkFieldBegin);
136-
int token_len = GET_TK_FIELD(token_idx, TkFieldEnd) - start;
133+
char* start = GET_TK_FIELD(token_idx, TkFieldBegin);
134+
int token_len = (char*)GET_TK_FIELD(token_idx, TkFieldEnd) - start;
137135

138136
int idx = 0;
139137
while (idx < (_KW_END - KW_int)) {
@@ -148,12 +146,8 @@ void check_if_token_keyword(int token_idx) {
148146
return;
149147
}
150148

151-
#pragma endregion token
152-
153149
// @TODO: refactor
154-
155150
char *g_ram, *g_src;
156-
157151
int g_reserved, g_bss,
158152
g_tkIter,
159153
*g_syms, g_symCnt,
@@ -163,114 +157,98 @@ int g_reserved, g_bss,
163157
g_scopeId, *g_scopes, g_scopeCnt,
164158
*g_calls, g_callCnt;
165159

166-
void lex() {
160+
//---------------------------------- PARSER ----------------------------------//
161+
int parse_escape_sequence(int letter, int ln) {
162+
if (letter == '0') return '\0';
163+
if (letter == 'n') return '\n';
164+
if (letter == 'r') return '\r';
165+
if (letter == 't') return '\t';
166+
if (letter == '\\') return '\\';
167+
if (letter == '\'') return '\'';
168+
if (letter == '"') return '"';
169+
170+
COMPILE_ERROR("error:%d: unknown escape sequence '\\%c'\n", ln, letter);
171+
return 0;
172+
}
173+
174+
int lex(char* p) {
175+
int token_idx = 0;
167176
int ln = 1;
168-
char *p = g_src;
169177
while (*p) {
170178
if (*p == '#' || (*p == '/' && p[1] == '/')) { // handle '#' and comment '//'
171-
while (*p && *p != 10) ++p;
179+
while (*p && *p != '\n') ++p;
172180
} else if (IS_WHITESPACE(*p)) { // handle whitespace
173-
ln += (*p == 10); ++p;
181+
ln += (*p == '\n'); ++p;
174182
} else {
175-
GET_TK_FIELD(g_token_idx, TkFieldLine) = ln;
176-
GET_TK_FIELD(g_token_idx, TkFieldBegin) = p;
183+
GET_TK_FIELD(token_idx, TkFieldLine) = ln;
184+
GET_TK_FIELD(token_idx, TkFieldBegin) = (int)p;
177185

178186
if (IS_LETTER(*p) || *p == '_') { // handle token or keyword
179-
GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_IDENT;
187+
GET_TK_FIELD(token_idx, TkFieldKind) = TK_IDENT;
180188
++p;
181189
while (IS_LETTER(*p) || IS_DIGIT(*p) || *p == '_') {
182190
++p;
183191
}
184-
GET_TK_FIELD(g_token_idx, TkFieldEnd) = p;
185-
check_if_token_keyword(g_token_idx);
186-
g_token_idx += 1;
192+
GET_TK_FIELD(token_idx, TkFieldEnd) = (int)p;
193+
check_if_token_keyword(token_idx);
194+
token_idx += 1;
187195
} else if (*p == '0' && p[1] == 'x') { // handle hex number
188-
GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_INT;
196+
GET_TK_FIELD(token_idx, TkFieldKind) = TK_INT;
189197
int result = 0;
190198
p += 2; while(IS_HEX(*p)) {
191199
result = (result << 4) + ((*p < 'A') ? (*p - '0') : (*p - 55));
192200
++p;
193201
}
194-
GET_TK_FIELD(g_token_idx, TkFieldValue) = result;
195-
GET_TK_FIELD(g_token_idx++, TkFieldEnd) = p;
202+
GET_TK_FIELD(token_idx, TkFieldValue) = result;
203+
GET_TK_FIELD(token_idx++, TkFieldEnd) = p;
196204
} else if (IS_DIGIT(*p)) { // handle decimal number
197-
GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_INT;
205+
GET_TK_FIELD(token_idx, TkFieldKind) = TK_INT;
198206
int result = 0;
199207
while (IS_DIGIT(*p)) { result = result * 10 + (*p - '0'); ++p; }
200-
GET_TK_FIELD(g_token_idx, TkFieldValue) = result;
201-
GET_TK_FIELD(g_token_idx++, TkFieldEnd) = p;
208+
GET_TK_FIELD(token_idx, TkFieldValue) = result;
209+
GET_TK_FIELD(token_idx++, TkFieldEnd) = p;
202210
} else if (*p == '"') { // handle string
203-
GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_STRING;
211+
GET_TK_FIELD(token_idx, TkFieldKind) = TK_STRING;
204212
++p; while (*p != '"') { ++p; };
205-
GET_TK_FIELD(g_token_idx++, TkFieldEnd) = ++p;
206-
} else if (*p == 39) { // ascii '''
207-
GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_CHAR;
208-
GET_TK_FIELD(g_token_idx, TkFieldValue) = p[1];
209-
GET_TK_FIELD(g_token_idx++, TkFieldEnd) = (p += 3);
213+
GET_TK_FIELD(token_idx++, TkFieldEnd) = ++p;
214+
} else if (*p == '\'') {
215+
// @TODO: handle escape
216+
GET_TK_FIELD(token_idx, TkFieldKind) = TK_CHAR;
217+
int v = *(++p); // skip opening '
218+
if (v == '\\') {
219+
v = parse_escape_sequence(*(++p), ln);
220+
}
221+
GET_TK_FIELD(token_idx, TkFieldValue) = v;
222+
GET_TK_FIELD(token_idx++, TkFieldEnd) = (p += 2); // skip char and closing '
210223
} else {
211-
GET_TK_FIELD(g_token_idx, TkFieldKind) = *p;
224+
GET_TK_FIELD(token_idx, TkFieldKind) = *p;
212225

213-
if (IS_PUNCT(p, '=', '=')) { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_EQ; ++p; }
214-
else if (IS_PUNCT(p, '!', '=')) { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_NE; ++p; }
215-
else if (IS_PUNCT(p, '&', '&')) { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_AND; ++p; }
216-
else if (IS_PUNCT(p, '|', '|')) { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_OR; ++p; }
226+
if (IS_PUNCT(p, '=', '=')) { GET_TK_FIELD(token_idx, TkFieldKind) = TK_EQ; ++p; }
227+
else if (IS_PUNCT(p, '!', '=')) { GET_TK_FIELD(token_idx, TkFieldKind) = TK_NE; ++p; }
228+
else if (IS_PUNCT(p, '&', '&')) { GET_TK_FIELD(token_idx, TkFieldKind) = TK_AND; ++p; }
229+
else if (IS_PUNCT(p, '|', '|')) { GET_TK_FIELD(token_idx, TkFieldKind) = TK_OR; ++p; }
217230
else if (*p == '+') {
218-
if (p[1] == '+') { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_INC; ++p; }
219-
else if (p[1] == '=') { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_ADD_ASSIGN; ++p; }
231+
if (p[1] == '+') { GET_TK_FIELD(token_idx, TkFieldKind) = TK_INC; ++p; }
232+
else if (p[1] == '=') { GET_TK_FIELD(token_idx, TkFieldKind) = TK_ADD_ASSIGN; ++p; }
220233
} else if (*p == '-') {
221-
if (p[1] == '-') { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_DEC; ++p; }
222-
else if (p[1] == '=') { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_SUB_ASSIGN; ++p; }
234+
if (p[1] == '-') { GET_TK_FIELD(token_idx, TkFieldKind) = TK_DEC; ++p; }
235+
else if (p[1] == '=') { GET_TK_FIELD(token_idx, TkFieldKind) = TK_SUB_ASSIGN; ++p; }
223236
} else if (*p == '>') {
224-
if (p[1] == '=') { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_GE; ++p; }
225-
else if (p[1] == '>') { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_RSHIFT; ++p; }
237+
if (p[1] == '=') { GET_TK_FIELD(token_idx, TkFieldKind) = TK_GE; ++p; }
238+
else if (p[1] == '>') { GET_TK_FIELD(token_idx, TkFieldKind) = TK_RSHIFT; ++p; }
226239
} else if (*p == '<') {
227-
if (p[1] == '=') { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_LE; ++p; }
228-
else if (p[1] == '<') { GET_TK_FIELD(g_token_idx, TkFieldKind) = TK_LSHIFT; ++p; }
240+
if (p[1] == '=') { GET_TK_FIELD(token_idx, TkFieldKind) = TK_LE; ++p; }
241+
else if (p[1] == '<') { GET_TK_FIELD(token_idx, TkFieldKind) = TK_LSHIFT; ++p; }
229242
}
230243

231-
GET_TK_FIELD(g_token_idx++, TkFieldEnd) = ++p;
232-
}
233-
}
234-
}
235-
return;
236-
}
237-
238-
// debug
239-
void dump_tokens() {
240-
printf("-------- lex --------\n");
241-
int indent = 0, i = 0, ln = 0;
242-
while (i < g_token_idx) {
243-
int tkln = GET_TK_FIELD(i, TkFieldLine);
244-
int kind = GET_TK_FIELD(i, TkFieldKind);
245-
int start = GET_TK_FIELD(i, TkFieldBegin);
246-
int end = GET_TK_FIELD(i, TkFieldEnd);
247-
int len = end - start;
248-
if (kind == '{') { indent += 1; }
249-
else if (kind == '}') { indent -= 1; }
250-
if (ln != tkln) {
251-
printf("\n%-3d:%.*s", tkln, indent * 4, " ");
252-
ln = tkln;
253-
}
254-
char* names = "Int Char Void Break Cont Else Enum If "
255-
"Ret While Print Fopen Fgetc CallocMemsetExit ";
256-
printf("%.*s", len, start);
257-
if (kind >= KW_int) {
258-
printf("{");
259-
char *p = names + 6 * (kind - KW_int); int ii = 0;
260-
while (ii < 6) {
261-
if (*p == ' ') break;
262-
printf("%c", *p);
263-
++ii; ++p;
244+
GET_TK_FIELD(token_idx++, TkFieldEnd) = ++p;
264245
}
265-
printf("}");
266246
}
267-
printf(" ");
268-
++i;
269247
}
270-
printf("\n");
271-
return;
248+
return token_idx;
272249
}
273250

251+
//--------------------------------- CODEGEN ----------------------------------//
274252
void enter_scope() {
275253
if (g_scopeCnt >= MAX_SCOPE) {
276254
panic("scope overflow");
@@ -343,11 +321,8 @@ int primary_expr() {
343321
int i = 1;
344322
while (i < len) {
345323
int c = start[i];
346-
if (c == 92) { // '\'
347-
c = start[i += 1];
348-
if (c == 'n') { c = 10; }
349-
else if (c == '0') { c = 0; }
350-
else { COMPILE_ERROR("error:%d: unknown escape sequence '%c'\n", ln, c); }
324+
if (c == '\\') {
325+
c = parse_escape_sequence(start[i += 1], ln);
351326
}
352327
*((char*)g_bss++) = c;
353328
++i;
@@ -977,10 +952,10 @@ void obj() {
977952
return;
978953
}
979954

980-
void gen(int argc, char** argv) {
955+
void gen(int argc, char** argv, int token_count) {
981956
enter_scope();
982957

983-
while (g_tkIter < g_token_idx) {
958+
while (g_tkIter < token_count) {
984959
obj();
985960
}
986961

@@ -1102,16 +1077,18 @@ void dump_code() {
11021077
return;
11031078
}
11041079

1080+
#define FATAL_ERROR(fmt, ...) { printf("c.c: \033[31mfatal error\033[0m: " fmt "\ncompilation terminated.\n", ##__VA_ARGS__); exit(1); }
1081+
11051082
int main(int argc, char **argv) {
11061083
// @TODO: better error handling
11071084
if (argc == 1) {
1108-
printf("%s: fatal error: no input files\n compilation terminated.", *argv);
1085+
FATAL_ERROR("no input files");
11091086
return 1;
11101087
}
11111088

11121089
void* fp = fopen(argv[1], "r");
11131090
if (!fp) {
1114-
printf("%s: fatal error: %s : No such file or directory\n compilation terminated.", *argv, *(argv + 1));
1091+
FATAL_ERROR("%s: No such file or directory", *(argv + 1));
11151092
return 1;
11161093
}
11171094

@@ -1140,10 +1117,10 @@ int main(int argc, char **argv) {
11401117
g_src[src_len] = 0;
11411118

11421119
// lexing
1143-
lex();
1120+
int token_count = lex(g_src);
11441121

11451122
// code generation
1146-
gen(argc - 1, argv + 1);
1123+
gen(argc - 1, argv + 1, token_count);
11471124

11481125
// run
11491126
g_regs = g_ram + g_reserved - 4 * IMME;

0 commit comments

Comments
 (0)