Context Navigation

Scanner.cs @ 15486

Visit:

Last change on this file since 15486 was 10412, checked in by gkronber, 11 years ago
#2026 also support C++-style comments
File size: 13.2 KB

Line
1
2	using System;
3	using System.IO;
4	using System.Collections;
5
6	namespace SyntaxAnalyzer {
7
8	public class Token {
9	public int kind; // token kind
10	public int pos; // token position in bytes in the source text (starting at 0)
11	public int charPos; // token position in characters in the source text (starting at 0)
12	public int col; // token column (starting at 1)
13	public int line; // token line (starting at 1)
14	public string val; // token value
15	public Token next; // ML 2005-03-11 Tokens are kept in linked list
16	}
17
18	//-----------------------------------------------------------------------------------
19	// Buffer
20	//-----------------------------------------------------------------------------------
21	public class Buffer {
22	// This Buffer supports the following cases:
23	// 1) seekable stream (file)
24	// a) whole stream in buffer
25	// b) part of stream in buffer
26	// 2) non seekable stream (network, console)
27
28	public const int EOF = char.MaxValue + 1;
29	const int MIN_BUFFER_LENGTH = 1024; // 1KB
30	const int MAX_BUFFER_LENGTH = MIN_BUFFER_LENGTH * 64; // 64KB
31	byte[] buf; // input buffer
32	int bufStart; // position of first byte in buffer relative to input stream
33	int bufLen; // length of buffer
34	int fileLen; // length of input stream (may change if the stream is no file)
35	int bufPos; // current position in buffer
36	Stream stream; // input stream (seekable)
37	bool isUserStream; // was the stream opened by the user?
38
39	public Buffer (Stream s, bool isUserStream) {
40	stream = s; this.isUserStream = isUserStream;
41
42	if (stream.CanSeek) {
43	fileLen = (int) stream.Length;
44	bufLen = Math.Min(fileLen, MAX_BUFFER_LENGTH);
45	bufStart = Int32.MaxValue; // nothing in the buffer so far
46	} else {
47	fileLen = bufLen = bufStart = 0;
48	}
49
50	buf = new byte[(bufLen>0) ? bufLen : MIN_BUFFER_LENGTH];
51	if (fileLen > 0) Pos = 0; // setup buffer to position 0 (start)
52	else bufPos = 0; // index 0 is already after the file, thus Pos = 0 is invalid
53	if (bufLen == fileLen && stream.CanSeek) Close();
54	}
55
56	protected Buffer(Buffer b) { // called in UTF8Buffer constructor
57	buf = b.buf;
58	bufStart = b.bufStart;
59	bufLen = b.bufLen;
60	fileLen = b.fileLen;
61	bufPos = b.bufPos;
62	stream = b.stream;
63	// keep destructor from closing the stream
64	b.stream = null;
65	isUserStream = b.isUserStream;
66	}
67
68	~Buffer() { Close(); }
69
70	protected void Close() {
71	if (!isUserStream && stream != null) {
72	stream.Close();
73	stream = null;
74	}
75	}
76
77	public virtual int Read () {
78	if (bufPos < bufLen) {
79	return buf[bufPos++];
80	} else if (Pos < fileLen) {
81	Pos = Pos; // shift buffer start to Pos
82	return buf[bufPos++];
83	} else if (stream != null && !stream.CanSeek && ReadNextStreamChunk() > 0) {
84	return buf[bufPos++];
85	} else {
86	return EOF;
87	}
88	}
89
90	public int Peek () {
91	int curPos = Pos;
92	int ch = Read();
93	Pos = curPos;
94	return ch;
95	}
96
97	// beg .. begin, zero-based, inclusive, in byte
98	// end .. end, zero-based, exclusive, in byte
99	public string GetString (int beg, int end) {
100	int len = 0;
101	char[] buf = new char[end - beg];
102	int oldPos = Pos;
103	Pos = beg;
104	while (Pos < end) buf[len++] = (char) Read();
105	Pos = oldPos;
106	return new String(buf, 0, len);
107	}
108
109	public int Pos {
110	get { return bufPos + bufStart; }
111	set {
112	if (value >= fileLen && stream != null && !stream.CanSeek) {
113	// Wanted position is after buffer and the stream
114	// is not seek-able e.g. network or console,
115	// thus we have to read the stream manually till
116	// the wanted position is in sight.
117	while (value >= fileLen && ReadNextStreamChunk() > 0);
118	}
119
120	if (value < 0 \|\| value > fileLen) {
121	throw new FatalError("buffer out of bounds access, position: " + value);
122	}
123
124	if (value >= bufStart && value < bufStart + bufLen) { // already in buffer
125	bufPos = value - bufStart;
126	} else if (stream != null) { // must be swapped in
127	stream.Seek(value, SeekOrigin.Begin);
128	bufLen = stream.Read(buf, 0, buf.Length);
129	bufStart = value; bufPos = 0;
130	} else {
131	// set the position to the end of the file, Pos will return fileLen.
132	bufPos = fileLen - bufStart;
133	}
134	}
135	}
136
137	// Read the next chunk of bytes from the stream, increases the buffer
138	// if needed and updates the fields fileLen and bufLen.
139	// Returns the number of bytes read.
140	private int ReadNextStreamChunk() {
141	int free = buf.Length - bufLen;
142	if (free == 0) {
143	// in the case of a growing input stream
144	// we can neither seek in the stream, nor can we
145	// foresee the maximum length, thus we must adapt
146	// the buffer size on demand.
147	byte[] newBuf = new byte[bufLen * 2];
148	Array.Copy(buf, newBuf, bufLen);
149	buf = newBuf;
150	free = bufLen;
151	}
152	int read = stream.Read(buf, bufLen, free);
153	if (read > 0) {
154	fileLen = bufLen = (bufLen + read);
155	return read;
156	}
157	// end of stream reached
158	return 0;
159	}
160	}
161
162	//-----------------------------------------------------------------------------------
163	// UTF8Buffer
164	//-----------------------------------------------------------------------------------
165	public class UTF8Buffer: Buffer {
166	public UTF8Buffer(Buffer b): base(b) {}
167
168	public override int Read() {
169	int ch;
170	do {
171	ch = base.Read();
172	// until we find a utf8 start (0xxxxxxx or 11xxxxxx)
173	} while ((ch >= 128) && ((ch & 0xC0) != 0xC0) && (ch != EOF));
174	if (ch < 128 \|\| ch == EOF) {
175	// nothing to do, first 127 chars are the same in ascii and utf8
176	// 0xxxxxxx or end of file character
177	} else if ((ch & 0xF0) == 0xF0) {
178	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
179	int c1 = ch & 0x07; ch = base.Read();
180	int c2 = ch & 0x3F; ch = base.Read();
181	int c3 = ch & 0x3F; ch = base.Read();
182	int c4 = ch & 0x3F;
183	ch = (((((c1 << 6) \| c2) << 6) \| c3) << 6) \| c4;
184	} else if ((ch & 0xE0) == 0xE0) {
185	// 1110xxxx 10xxxxxx 10xxxxxx
186	int c1 = ch & 0x0F; ch = base.Read();
187	int c2 = ch & 0x3F; ch = base.Read();
188	int c3 = ch & 0x3F;
189	ch = (((c1 << 6) \| c2) << 6) \| c3;
190	} else if ((ch & 0xC0) == 0xC0) {
191	// 110xxxxx 10xxxxxx
192	int c1 = ch & 0x1F; ch = base.Read();
193	int c2 = ch & 0x3F;
194	ch = (c1 << 6) \| c2;
195	}
196	return ch;
197	}
198	}
199
200	//-----------------------------------------------------------------------------------
201	// Scanner
202	//-----------------------------------------------------------------------------------
203	public class Scanner {
204	const char EOL = '\n';
205	const int eofSym = 0; /* pdt */
206	const int maxT = 30;
207	const int noSym = 30;
208
209
210	public Buffer buffer; // scanner buffer
211
212	Token t; // current token
213	int ch; // current input character
214	int pos; // byte position of current character
215	int charPos; // position by unicode characters starting with 0
216	int col; // column number of current character
217	int line; // line number of current character
218	int oldEols; // EOLs that appeared in a comment;
219	static readonly Hashtable start; // maps first token character to start state
220
221	Token tokens; // list of tokens already peeked (first token is a dummy)
222	Token pt; // current peek token
223
224	char[] tval = new char[128]; // text of current token
225	int tlen; // length of current token
226
227	static Scanner() {
228	start = new Hashtable(128);
229	for (int i = 65; i <= 90; ++i) start[i] = 1;
230	for (int i = 97; i <= 122; ++i) start[i] = 1;
231	start[60] = 2;
232	start[62] = 4;
233	start[46] = 15;
234	start[61] = 7;
235	start[124] = 8;
236	start[40] = 9;
237	start[41] = 10;
238	start[91] = 11;
239	start[93] = 12;
240	start[123] = 13;
241	start[125] = 14;
242	start[Buffer.EOF] = -1;
243
244	}
245
246	public Scanner (string fileName) {
247	try {
248	Stream stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read);
249	buffer = new Buffer(stream, false);
250	Init();
251	} catch (IOException) {
252	throw new FatalError("Cannot open file " + fileName);
253	}
254	}
255
256	public Scanner (Stream s) {
257	buffer = new Buffer(s, true);
258	Init();
259	}
260
261	void Init() {
262	pos = -1; line = 1; col = 0; charPos = -1;
263	oldEols = 0;
264	NextCh();
265	if (ch == 0xEF) { // check optional byte order mark for UTF-8
266	NextCh(); int ch1 = ch;
267	NextCh(); int ch2 = ch;
268	if (ch1 != 0xBB \|\| ch2 != 0xBF) {
269	throw new FatalError(String.Format("illegal byte order mark: EF {0,2:X} {1,2:X}", ch1, ch2));
270	}
271	buffer = new UTF8Buffer(buffer); col = 0; charPos = -1;
272	NextCh();
273	}
274	pt = tokens = new Token(); // first token is a dummy
275	}
276
277	void NextCh() {
278	if (oldEols > 0) { ch = EOL; oldEols--; }
279	else {
280	pos = buffer.Pos;
281	// buffer reads unicode chars, if UTF8 has been detected
282	ch = buffer.Read(); col++; charPos++;
283	// replace isolated '\r' by '\n' in order to make
284	// eol handling uniform across Windows, Unix and Mac
285	if (ch == '\r' && buffer.Peek() != '\n') ch = EOL;
286	if (ch == EOL) { line++; col = 0; }
287	}
288
289	}
290
291	void AddCh() {
292	if (tlen >= tval.Length) {
293	char[] newBuf = new char[2 * tval.Length];
294	Array.Copy(tval, 0, newBuf, 0, tval.Length);
295	tval = newBuf;
296	}
297	if (ch != Buffer.EOF) {
298	tval[tlen++] = (char) ch;
299	NextCh();
300	}
301	}
302
303
304
305	bool Comment0() {
306	int level = 1, pos0 = pos, line0 = line, col0 = col, charPos0 = charPos;
307	NextCh();
308	if (ch == '/') {
309	NextCh();
310	for(;;) {
311	if (ch == 13) {
312	level--;
313	if (level == 0) { oldEols = line - line0; NextCh(); return true; }
314	NextCh();
315	} else if (ch == Buffer.EOF) return false;
316	else NextCh();
317	}
318	} else {
319	buffer.Pos = pos0; NextCh(); line = line0; col = col0; charPos = charPos0;
320	}
321	return false;
322	}
323
324	bool Comment1() {
325	int level = 1, pos0 = pos, line0 = line, col0 = col, charPos0 = charPos;
326	NextCh();
327	if (ch == '*') {
328	NextCh();
329	for(;;) {
330	if (ch == '*') {
331	NextCh();
332	if (ch == '/') {
333	level--;
334	if (level == 0) { oldEols = line - line0; NextCh(); return true; }
335	NextCh();
336	}
337	} else if (ch == '/') {
338	NextCh();
339	if (ch == '*') {
340	level++; NextCh();
341	}
342	} else if (ch == Buffer.EOF) return false;
343	else NextCh();
344	}
345	} else {
346	buffer.Pos = pos0; NextCh(); line = line0; col = col0; charPos = charPos0;
347	}
348	return false;
349	}
350
351
352	void CheckLiteral() {
353	switch (t.val) {
354	case "PROBLEM": t.kind = 2; break;
355	case "CODE": t.kind = 3; break;
356	case "INIT": t.kind = 6; break;
357	case "NONTERMINALS": t.kind = 7; break;
358	case "TERMINALS": t.kind = 8; break;
359	case "RULES": t.kind = 9; break;
360	case "MAXIMIZE": t.kind = 10; break;
361	case "MINIMIZE": t.kind = 11; break;
362	case "END": t.kind = 12; break;
363	case "LOCAL": t.kind = 14; break;
364	case "SEM": t.kind = 15; break;
365	case "CONSTRAINTS": t.kind = 16; break;
366	case "IN": t.kind = 17; break;
367	case "SET": t.kind = 18; break;
368	case "RANGE": t.kind = 19; break;
369	case "EPS": t.kind = 23; break;
370	default: break;
371	}
372	}
373
374	Token NextToken() {
375	while (ch == ' ' \|\|
376	ch >= 9 && ch <= 10 \|\| ch == 13
377	) NextCh();
378	if (ch == '/' && Comment0() \|\|ch == '/' && Comment1()) return NextToken();
379	int recKind = noSym;
380	int recEnd = pos;
381	t = new Token();
382	t.pos = pos; t.col = col; t.line = line; t.charPos = charPos;
383	int state;
384	if (start.ContainsKey(ch)) { state = (int) start[ch]; }
385	else { state = 0; }
386	tlen = 0; AddCh();
387
388	switch (state) {
389	case -1: { t.kind = eofSym; break; } // NextCh already done
390	case 0: {
391	if (recKind != noSym) {
392	tlen = recEnd - t.pos;
393	SetScannerBehindT();
394	}
395	t.kind = recKind; break;
396	} // NextCh already done
397	case 1:
398	recEnd = pos; recKind = 1;
399	if (ch >= '0' && ch <= '9' \|\| ch >= 'A' && ch <= 'Z' \|\| ch >= 'a' && ch <= 'z') {AddCh(); goto case 1;}
400	else {t.kind = 1; t.val = new String(tval, 0, tlen); CheckLiteral(); return t;}
401	case 2:
402	if (ch == '<') {AddCh(); goto case 3;}
403	else {goto case 0;}
404	case 3:
405	{t.kind = 4; break;}
406	case 4:
407	if (ch == '>') {AddCh(); goto case 5;}
408	else {goto case 0;}
409	case 5:
410	{t.kind = 5; break;}
411	case 6:
412	{t.kind = 20; break;}
413	case 7:
414	{t.kind = 21; break;}
415	case 8:
416	{t.kind = 22; break;}
417	case 9:
418	{t.kind = 24; break;}
419	case 10:
420	{t.kind = 25; break;}
421	case 11:
422	{t.kind = 26; break;}
423	case 12:
424	{t.kind = 27; break;}
425	case 13:
426	{t.kind = 28; break;}
427	case 14:
428	{t.kind = 29; break;}
429	case 15:
430	recEnd = pos; recKind = 13;
431	if (ch == '.') {AddCh(); goto case 6;}
432	else {t.kind = 13; break;}
433
434	}
435	t.val = new String(tval, 0, tlen);
436	return t;
437	}
438
439	private void SetScannerBehindT() {
440	buffer.Pos = t.pos;
441	NextCh();
442	line = t.line; col = t.col; charPos = t.charPos;
443	for (int i = 0; i < tlen; i++) NextCh();
444	}
445
446	// get the next token (possibly a token already seen during peeking)
447	public Token Scan () {
448	if (tokens.next == null) {
449	return NextToken();
450	} else {
451	pt = tokens = tokens.next;
452	return tokens;
453	}
454	}
455
456	// peek for the next token, ignore pragmas
457	public Token Peek () {
458	do {
459	if (pt.next == null) {
460	pt.next = NextToken();
461	}
462	pt = pt.next;
463	} while (pt.kind > maxT); // skip pragmas
464
465	return pt;
466	}
467
468	// make sure that peeking starts at the current scan position
469	public void ResetPeek () { pt = tokens; }
470
471	} // end Scanner
472	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/HeuristicLab.Problems.GPDL/SyntaxAnalyzer/Scanner.cs @ 15486

Download in other formats: