Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GPDL/SyntaxAnalyzer/Scanner.cs @ 16030

Last change on this file since 16030 was 10412, checked in by gkronber, 11 years ago

#2026 also support C++-style comments

File size: 13.2 KB
Line 
1
2using System;
3using System.IO;
4using System.Collections;
5
6namespace SyntaxAnalyzer {
7
// A single lexical token produced by the Scanner.
// Tokens are chained through 'next' so that tokens which were already
// peeked can be handed out again later.
public class Token {
  // --- classification and text ---
  public int kind;     // token kind
  public string val;   // token value

  // --- location in the source text ---
  public int pos;      // token position in bytes in the source text (starting at 0)
  public int charPos;  // token position in characters in the source text (starting at 0)
  public int line;     // token line (starting at 1)
  public int col;      // token column (starting at 1)

  // --- linkage ---
  public Token next;   // ML 2005-03-11 Tokens are kept in linked list
}
17
18//-----------------------------------------------------------------------------------
19// Buffer
20//-----------------------------------------------------------------------------------
// Byte-oriented view of an input stream with a settable read position.
// Read() returns one byte per call (or EOF); subclasses may override Read()
// to decode multi-byte characters.
public class Buffer {
  // This Buffer supports the following cases:
  // 1) seekable stream (file)
  //    a) whole stream in buffer
  //    b) part of stream in buffer
  // 2) non seekable stream (network, console)

  public const int EOF = char.MaxValue + 1; // end-of-input marker, one past the largest char value
  const int MIN_BUFFER_LENGTH = 1024; // 1KB
  const int MAX_BUFFER_LENGTH = MIN_BUFFER_LENGTH * 64; // 64KB
  byte[] buf;         // input buffer
  int bufStart;       // position of first byte in buffer relative to input stream
  int bufLen;         // length of buffer
  int fileLen;        // length of input stream (may change if the stream is no file)
  int bufPos;         // current position in buffer
  Stream stream;      // input stream (seekable)
  bool isUserStream;  // was the stream opened by the user?

  // Creates a buffer over stream s.
  //   s            .. the input stream
  //   isUserStream .. true if the caller owns the stream; such a stream is
  //                   never closed by this class (see Close()).
  // Throws FatalError if a seekable stream is longer than Int32.MaxValue
  // bytes, because all positions in this class are stored as int.
  public Buffer (Stream s, bool isUserStream) {
    stream = s; this.isUserStream = isUserStream;

    if (stream.CanSeek) {
      long streamLength = stream.Length;
      if (streamLength > Int32.MaxValue) {
        // an unchecked (int) cast would silently wrap around and make the
        // input look empty or truncated
        throw new FatalError("input stream too large: " + streamLength + " bytes");
      }
      fileLen = (int) streamLength;
      bufLen = Math.Min(fileLen, MAX_BUFFER_LENGTH);
      bufStart = Int32.MaxValue; // nothing in the buffer so far
    } else {
      fileLen = bufLen = bufStart = 0;
    }

    buf = new byte[(bufLen>0) ? bufLen : MIN_BUFFER_LENGTH];
    if (fileLen > 0) Pos = 0; // setup buffer to position 0 (start)
    else bufPos = 0; // index 0 is already after the file, thus Pos = 0 is invalid
    // the whole seekable stream fits into the buffer: the stream is no longer needed
    if (bufLen == fileLen && stream.CanSeek) Close();
  }

  // Takes over the complete state of b, including its stream.
  protected Buffer(Buffer b) { // called in UTF8Buffer constructor
    buf = b.buf;
    bufStart = b.bufStart;
    bufLen = b.bufLen;
    fileLen = b.fileLen;
    bufPos = b.bufPos;
    stream = b.stream;
    // keep destructor from closing the stream
    b.stream = null;
    isUserStream = b.isUserStream;
  }

  ~Buffer() { Close(); }

  // Closes the stream unless it was supplied by the user; safe to call repeatedly.
  protected void Close() {
    if (!isUserStream && stream != null) {
      stream.Close();
      stream = null;
    }
  }

  // Returns the byte at the current position and advances by one,
  // or EOF if no more input is available.
  public virtual int Read () {
    if (bufPos < bufLen) {
      return buf[bufPos++];
    } else if (Pos < fileLen) {
      Pos = Pos; // shift buffer start to Pos
      return buf[bufPos++];
    } else if (stream != null && !stream.CanSeek && ReadNextStreamChunk() > 0) {
      return buf[bufPos++];
    } else {
      return EOF;
    }
  }

  // Returns what the next Read() would return without changing Pos.
  public int Peek () {
    int curPos = Pos;
    int ch = Read();
    Pos = curPos;
    return ch;
  }

  // Returns the text between two positions; Pos is restored afterwards.
  // Each value delivered by Read() is cast to one char.
  // beg .. begin, zero-based, inclusive, in byte
  // end .. end, zero-based, exclusive, in byte
  public string GetString (int beg, int end) {
    int len = 0;
    char[] chars = new char[end - beg];
    int oldPos = Pos;
    Pos = beg;
    while (Pos < end) chars[len++] = (char) Read();
    Pos = oldPos;
    return new String(chars, 0, len);
  }

  // Current read position relative to the start of the input stream.
  // The setter throws FatalError if the value is negative or beyond the
  // (currently known) end of the input.
  public int Pos {
    get { return bufPos + bufStart; }
    set {
      if (value >= fileLen && stream != null && !stream.CanSeek) {
        // Wanted position is after buffer and the stream
        // is not seek-able e.g. network or console,
        // thus we have to read the stream manually till
        // the wanted position is in sight.
        while (value >= fileLen && ReadNextStreamChunk() > 0);
      }

      if (value < 0 || value > fileLen) {
        throw new FatalError("buffer out of bounds access, position: " + value);
      }

      if (value >= bufStart && value < bufStart + bufLen) { // already in buffer
        bufPos = value - bufStart;
      } else if (stream != null) { // must be swapped in
        stream.Seek(value, SeekOrigin.Begin);
        bufLen = stream.Read(buf, 0, buf.Length);
        bufStart = value; bufPos = 0;
      } else {
        // set the position to the end of the file, Pos will return fileLen.
        bufPos = fileLen - bufStart;
      }
    }
  }

  // Read the next chunk of bytes from the stream, increases the buffer
  // if needed and updates the fields fileLen and bufLen.
  // Returns the number of bytes read.
  private int ReadNextStreamChunk() {
    int free = buf.Length - bufLen;
    if (free == 0) {
      // in the case of a growing input stream
      // we can neither seek in the stream, nor can we
      // foresee the maximum length, thus we must adapt
      // the buffer size on demand.
      byte[] newBuf = new byte[bufLen * 2];
      Array.Copy(buf, newBuf, bufLen);
      buf = newBuf;
      free = bufLen;
    }
    int read = stream.Read(buf, bufLen, free);
    if (read > 0) {
      fileLen = bufLen = (bufLen + read);
      return read;
    }
    // end of stream reached
    return 0;
  }
}
161
162//-----------------------------------------------------------------------------------
163// UTF8Buffer
164//-----------------------------------------------------------------------------------
// Buffer whose Read() decodes one UTF-8 sequence per call instead of
// returning a single byte.
public class UTF8Buffer: Buffer {
  public UTF8Buffer(Buffer b): base(b) {}

  // Returns the next decoded value, or EOF at the end of the input.
  public override int Read() {
    // drop bytes until we see a utf8 start (0xxxxxxx or 11xxxxxx) or EOF
    int lead = base.Read();
    while (lead >= 128 && (lead & 0xC0) != 0xC0 && lead != EOF) {
      lead = base.Read();
    }

    // 0xxxxxxx or the end of file marker: the first 127 chars are the
    // same in ascii and utf8, so there is nothing to decode
    if (lead < 128 || lead == EOF) {
      return lead;
    }

    int followers;  // number of 10xxxxxx bytes that belong to the lead byte
    int result;     // payload bits collected so far
    if ((lead & 0xF0) == 0xF0) {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      followers = 3;
      result = lead & 0x07;
    } else if ((lead & 0xE0) == 0xE0) {
      // 1110xxxx 10xxxxxx 10xxxxxx
      followers = 2;
      result = lead & 0x0F;
    } else {
      // 110xxxxx 10xxxxxx (the only remaining form: the top two bits are set here)
      followers = 1;
      result = lead & 0x1F;
    }

    // every following byte contributes its low six bits
    for (int i = 0; i < followers; i++) {
      int next = base.Read();
      result = (result << 6) | (next & 0x3F);
    }
    return result;
  }
}
199
200//-----------------------------------------------------------------------------------
201// Scanner
202//-----------------------------------------------------------------------------------
// Splits the input into Tokens. Scan() delivers the next token, Peek()
// looks ahead without consuming, ResetPeek() rewinds the look-ahead.
// Whitespace, "//" line comments and (nestable) "/* */" block comments
// are skipped between tokens.
public class Scanner {
  const char EOL = '\n';
  const int eofSym = 0; /* pdt */
  const int maxT = 30;
  const int noSym = 30;


  public Buffer buffer; // scanner buffer

  Token t;          // current token
  int ch;           // current input character
  int pos;          // byte position of current character
  int charPos;      // position by unicode characters starting with 0
  int col;          // column number of current character
  int line;         // line number of current character
  int oldEols;      // EOLs that appeared in a comment;
  static readonly Hashtable start; // maps first token character to start state

  Token tokens;     // list of tokens already peeked (first token is a dummy)
  Token pt;         // current peek token

  char[] tval = new char[128]; // text of current token
  int tlen;         // length of current token

  // Fills the table that maps the first character of a token (as an int
  // character code) to the state in which NextToken() starts.
  static Scanner() {
    start = new Hashtable(128);
    for (int i = 65; i <= 90; ++i) start[i] = 1;   // 'A'..'Z'
    for (int i = 97; i <= 122; ++i) start[i] = 1;  // 'a'..'z'
    start[60] = 2;    // '<'
    start[62] = 4;    // '>'
    start[46] = 15;   // '.'
    start[61] = 7;    // '='
    start[124] = 8;   // '|'
    start[40] = 9;    // '('
    start[41] = 10;   // ')'
    start[91] = 11;   // '['
    start[93] = 12;   // ']'
    start[123] = 13;  // '{'
    start[125] = 14;  // '}'
    start[Buffer.EOF] = -1;

  }

  // Opens fileName for reading and scans it. The stream is owned by the
  // buffer (not a user stream). Throws FatalError if an IOException occurs.
  public Scanner (string fileName) {
    try {
      Stream stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read);
      buffer = new Buffer(stream, false);
      Init();
    } catch (IOException) {
      throw new FatalError("Cannot open file " + fileName);
    }
  }

  // Scans a stream supplied by the caller; the stream is treated as a
  // user stream, so the buffer does not close it.
  public Scanner (Stream s) {
    buffer = new Buffer(s, true);
    Init();
  }

  // Reads the first character, detects an optional UTF-8 byte order mark
  // (EF BB BF) and, if present, switches to a UTF8Buffer. Throws FatalError
  // if the input starts with EF but the next two bytes are not BB BF.
  void Init() {
    pos = -1; line = 1; col = 0; charPos = -1;
    oldEols = 0;
    NextCh();
    if (ch == 0xEF) { // check optional byte order mark for UTF-8
      NextCh(); int ch1 = ch;
      NextCh(); int ch2 = ch;
      if (ch1 != 0xBB || ch2 != 0xBF) {
        throw new FatalError(String.Format("illegal byte order mark: EF {0,2:X} {1,2:X}", ch1, ch2));
      }
      buffer = new UTF8Buffer(buffer); col = 0; charPos = -1;
      NextCh();
    }
    pt = tokens = new Token();  // first token is a dummy
  }

  // Advances ch to the next input character and keeps pos, col, charPos
  // and line up to date. While oldEols > 0 an EOL is delivered instead of
  // reading from the buffer.
  void NextCh() {
    if (oldEols > 0) { ch = EOL; oldEols--; }
    else {
      pos = buffer.Pos;
      // buffer reads unicode chars, if UTF8 has been detected
      ch = buffer.Read(); col++; charPos++;
      // replace isolated '\r' by '\n' in order to make
      // eol handling uniform across Windows, Unix and Mac
      if (ch == '\r' && buffer.Peek() != '\n') ch = EOL;
      if (ch == EOL) { line++; col = 0; }
    }

  }

  // Appends ch to the text of the current token (growing tval if needed)
  // and advances to the next character. Does nothing at the end of input.
  void AddCh() {
    if (tlen >= tval.Length) {
      char[] newBuf = new char[2 * tval.Length];
      Array.Copy(tval, 0, newBuf, 0, tval.Length);
      tval = newBuf;
    }
    if (ch != Buffer.EOF) {
      tval[tlen++] = (char) ch;
      NextCh();
    }
  }



  // Tries to skip a "//" line comment; must be called with ch == '/'.
  // Returns true if a comment was skipped. If the second character is not
  // '/', the scanner is put back onto the first '/' and false is returned.
  // Also returns false if the input ends before the end of the line.
  bool Comment0() {
    int pos0 = pos, line0 = line, col0 = col, charPos0 = charPos;
    NextCh();
    if (ch != '/') {
      // not a line comment: rewind to the character we started on
      buffer.Pos = pos0; NextCh(); line = line0; col = col0; charPos = charPos0;
      return false;
    }
    NextCh();
    for(;;) {
      // The comment ends at the end of the line. NextCh() only delivers
      // '\r' when a '\n' follows and turns an isolated '\r' into EOL, so
      // both forms must be accepted; checking for '\r' alone never ends
      // the comment in files that use '\n' line ends.
      if (ch == '\r' || ch == EOL) { oldEols = line - line0; NextCh(); return true; }
      if (ch == Buffer.EOF) return false;
      NextCh();
    }
  }

  // Tries to skip a "/* ... */" block comment, which may contain nested
  // block comments; must be called with ch == '/'. Returns true if a
  // comment was skipped. If the second character is not '*', the scanner is
  // put back onto the '/' and false is returned. Also returns false if the
  // input ends inside the comment.
  bool Comment1() {
    int level = 1, pos0 = pos, line0 = line, col0 = col, charPos0 = charPos;
    NextCh();
    if (ch == '*') {
      NextCh();
      for(;;) {
        if (ch == '*') {
          NextCh();
          if (ch == '/') {
            level--;
            if (level == 0) { oldEols = line - line0; NextCh(); return true; }
            NextCh();
          }
        } else if (ch == '/') {
          NextCh();
          if (ch == '*') {
            level++; NextCh();
          }
        } else if (ch == Buffer.EOF) return false;
        else NextCh();
      }
    } else {
      buffer.Pos = pos0; NextCh(); line = line0; col = col0; charPos = charPos0;
    }
    return false;
  }


  // Replaces the kind of the current token if its text is one of the
  // keywords below; otherwise the kind is left unchanged.
  void CheckLiteral() {
    switch (t.val) {
      case "PROBLEM": t.kind = 2; break;
      case "CODE": t.kind = 3; break;
      case "INIT": t.kind = 6; break;
      case "NONTERMINALS": t.kind = 7; break;
      case "TERMINALS": t.kind = 8; break;
      case "RULES": t.kind = 9; break;
      case "MAXIMIZE": t.kind = 10; break;
      case "MINIMIZE": t.kind = 11; break;
      case "END": t.kind = 12; break;
      case "LOCAL": t.kind = 14; break;
      case "SEM": t.kind = 15; break;
      case "CONSTRAINTS": t.kind = 16; break;
      case "IN": t.kind = 17; break;
      case "SET": t.kind = 18; break;
      case "RANGE": t.kind = 19; break;
      case "EPS": t.kind = 23; break;
      default: break;
    }
  }

  // Reads the next token from the input.
  // Token kinds assigned here: eofSym at the end of input, 1 for a letter
  // followed by letters/digits (possibly changed by CheckLiteral), 4 for
  // "<<", 5 for ">>", 13 for ".", 20 for "..", 21 for "=", 22 for "|",
  // 24 for "(", 25 for ")", 26 for "[", 27 for "]", 28 for "{", 29 for "}"
  // and noSym for anything else.
  Token NextToken() {
    // Skip whitespace and comments. This is a loop rather than a recursive
    // call so that the stack depth does not depend on the number of
    // consecutive comments in the input.
    do {
      while (ch == ' ' ||
        ch >= 9 && ch <= 10 || ch == 13
      ) NextCh();
    } while (ch == '/' && Comment0() ||ch == '/' && Comment1());
    int recKind = noSym;
    int recEnd = pos;
    t = new Token();
    t.pos = pos; t.col = col; t.line = line; t.charPos = charPos;
    // single lookup: the Hashtable indexer yields null for unknown characters
    object startState = start[ch];
    int state = (startState != null) ? (int) startState : 0;
    tlen = 0; AddCh();

    switch (state) {
      case -1: { t.kind = eofSym; break; } // NextCh already done
      case 0: {
        if (recKind != noSym) {
          // fall back to the last recognized token end
          tlen = recEnd - t.pos;
          SetScannerBehindT();
        }
        t.kind = recKind; break;
      } // NextCh already done
      case 1:
        recEnd = pos; recKind = 1;
        if (ch >= '0' && ch <= '9' || ch >= 'A' && ch <= 'Z' || ch >= 'a' && ch <= 'z') {AddCh(); goto case 1;}
        else {t.kind = 1; t.val = new String(tval, 0, tlen); CheckLiteral(); return t;}
      case 2:
        if (ch == '<') {AddCh(); goto case 3;}
        else {goto case 0;}
      case 3:
        {t.kind = 4; break;}
      case 4:
        if (ch == '>') {AddCh(); goto case 5;}
        else {goto case 0;}
      case 5:
        {t.kind = 5; break;}
      case 6:
        {t.kind = 20; break;}
      case 7:
        {t.kind = 21; break;}
      case 8:
        {t.kind = 22; break;}
      case 9:
        {t.kind = 24; break;}
      case 10:
        {t.kind = 25; break;}
      case 11:
        {t.kind = 26; break;}
      case 12:
        {t.kind = 27; break;}
      case 13:
        {t.kind = 28; break;}
      case 14:
        {t.kind = 29; break;}
      case 15:
        recEnd = pos; recKind = 13;
        if (ch == '.') {AddCh(); goto case 6;}
        else {t.kind = 13; break;}

    }
    t.val = new String(tval, 0, tlen);
    return t;
  }

  // Repositions the scanner directly behind the first tlen characters of
  // the current token t.
  private void SetScannerBehindT() {
    buffer.Pos = t.pos;
    NextCh();
    line = t.line; col = t.col; charPos = t.charPos;
    for (int i = 0; i < tlen; i++) NextCh();
  }

  // get the next token (possibly a token already seen during peeking)
  public Token Scan () {
    if (tokens.next == null) {
      return NextToken();
    } else {
      pt = tokens = tokens.next;
      return tokens;
    }
  }

  // peek for the next token, ignore pragmas
  public Token Peek () {
    do {
      if (pt.next == null) {
        pt.next = NextToken();
      }
      pt = pt.next;
    } while (pt.kind > maxT); // skip pragmas

    return pt;
  }

  // make sure that peeking starts at the current scan position
  public void ResetPeek () { pt = tokens; }

} // end Scanner
472}
Note: See TracBrowser for help on using the repository browser.