-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathulexer.pas
237 lines (196 loc) · 5.75 KB
/
ulexer.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
unit ulexer;
{$mode objfpc}{$H+}
interface
uses Classes, SysUtils, ulist, uerror;
type TLPI_Lexer = class(TLPI_ErrorMessage)
public
function execute(input_string: AnsiString): TLightList;
end;
implementation
uses uconstants, utypes, uutils;
function TLPI_Lexer.execute(input_string: AnsiString): TLightList;
var c: Char;
line: Integer;
// preview next character
function next_ch: Char;
begin
Result := c;
end;
// return buffer and fill it with the next character
function get_ch: Char;
begin
if c = #10 then inc(line); // detect line feeds in Unix AND Windows
Result := c;
if input_string = '' then c := #0 // #0 is the internal symbol used for end of file / input
else
begin
c := input_string[1];
Delete(input_string, 1, 1);
end;
end;
procedure skip_whitespace;
begin
while (lpi_is_whitespace(next_ch)) do get_ch;
end;
// recognizes and reads a token (read = also deletes the token from the input stream)
procedure lpi_lex_read_token(tokenlist: TLightList);
var s: AnsiString;
function match_and_get_ch(ch: Char): Boolean;
begin
Result := false;
if UpperCase(next_ch) = ch then
begin
Result := true;
s := s + get_ch;
end;
end;
var id: Byte;
token: TToken;
nest_depth, current_line: Integer;
begin
s := get_ch;
if (lpi_is_alpha(s[1])) or (s[1] = '_') then // Identifier: keywords, variables, functions
begin
id := CtokenIdentifier;
while ((lpi_is_alpha(next_ch)) or (lpi_is_numeric(next_ch)) or (next_ch = '_')) do s := s + get_ch;
end
else
if (s[1] = '/') and (next_ch = '/') then // single line comment: // example comment
begin
id := CtokenComment;
get_ch; // skip second /
current_line := line;
// skip until next line or end of file
while (line = current_line) and (next_ch <> #0) do get_ch;
end
else
if lpi_is_operator(s[1]) then // Operator: + - * / > < = :
begin
id := CtokenOperator;
// only accumulate stuff like := <> >= etc, not -+ +- -- ++!
if not lpi_is_math_operator(s[1]) then
while (lpi_is_operator(next_ch) and (not lpi_is_math_operator(next_ch))) do s := s + get_ch;
end
else
if s[1] = '''' then // String: 'It''s working!'
begin
id := CtokenString;
s := ''; // clear string, we don't want the leading ' char in our internal string
while ((next_ch <> #0)) do
begin
case next_ch of
'''': begin
get_ch; // skip the closing ' and break unless we have the '' exception to continue
if next_ch <> '''' then Break;
end;
#10: LogError('String not terminated', line, Cunknown_operation);
end;
s := s + get_ch;
end;
end
else
if s[1] = '$' then // Hex Number: AA, F0 etc.
begin
id := CtokenSingle; // it might be a single $ instead
while (next_ch in ['0' .. '9', 'A' .. 'F']) do
begin
id := CtokenNumber; // found hex numbers, change type
s := s + get_ch;
end;
end
else
if lpi_is_numeric(s[1]) then // Number: 78 or 0.234 leading + and - are ignored and evaluated later
begin
id := CtokenNumber;
// skip any _ after the first number, allows for 100_000_000 notation
while (next_ch = '_') do get_ch;
while (lpi_is_numeric(next_ch) or (next_ch = '.')) do
begin
s := s + get_ch;
// skip any _ in the middle
while (next_ch = '_') do get_ch;
end;
// check for exponent notation e.g. 1.234E4
if match_and_get_ch('E') then
begin
// check for negative exponent or additional +
if not match_and_get_ch('-') then match_and_get_ch('+');
while lpi_is_numeric(next_ch) do s := s + get_ch;
end;
end
else
if s[1] = '{' then // multi line comment
begin
id := CtokenComment;
nest_depth := 1;
while ((nest_depth > 0) and (next_ch <> #0)) do
begin
get_ch;
if next_ch = '{' then inc(nest_depth);
if next_ch = '}' then dec(nest_depth);
end;
get_ch; // get ending }
end
else
if (s[1] = '(') and (next_ch = '*') then // "classic" multi line comment
begin
id := CtokenComment;
nest_depth := 1;
while ((nest_depth > 0) and (next_ch <> #0)) do
begin
s := get_ch;
if (s[1] = '(') and (next_ch = '*') then inc(nest_depth);
if (s[1] = '*') and (next_ch = ')') then dec(nest_depth);
end;
get_ch; // get ending )
end
else id := CtokenSingle; // single token: ( ) ;
// ignore comments, add everything else
if id <> CtokenComment then
begin
token := TToken.Create;
token.id := id;
token.s := s;
token.line := line;
tokenlist.Add(token);
end;
skip_whitespace; // skips any whitespace between two commands, before or after a linebreak doesn't matter
end;
var i: Integer;
tokenlist: TLightList;
token: TToken;
begin
Result := nil;
if ClpiDebugMode then
begin
messages.Add('');
messages.Add('*** Lexer ***');
end;
tokenlist := TLightList.Create;
line := 1; // start at line 1 of file or string
get_ch; // fill "next_ch" buffer for first time use
skip_whitespace; // skip the initial whitespace in the file, if any
while (next_ch <> #0) and (not isError) do lpi_lex_read_token(tokenlist); // #0 is our internal symbol for end of file / input, it is NOT allowed as character in the sourcecode!
if isError then
begin
// delete the token list...
for i := 0 to tokenlist.Count - 1 do
begin
token := TToken(tokenlist.Items(i));
FreeAndNil(token);
end;
FreeAndNil(tokenlist);
end
else
begin
// Debug: output our list of token...
if ClpiDebugMode then
for i := 0 to tokenlist.Count - 1 do
begin
with TToken(tokenlist.Items(i)) do
messages.Add(padstring(IntToStr(line), 2, ' ', True) + ' ' + padstring(tokentostr(id), 11) + s);
end;
end;
Result := tokenlist;
end;
end.