Context Navigation

source: branches/HeuristicLab.Problems.GPDL/SyntaxAnalyzer/Scanner.cs @ 10159

Visit:

Last change on this file since 10159 was 10067, checked in by gkronber, 11 years ago
#2026 worked on brute force solver for GPDL problems.
File size: 12.8 KB

Rev	Line
[9724]	1
	2	using System;
	3	using System.IO;
	4	using System.Collections;
	5
	6	namespace SyntaxAnalyzer {
	7
	8	public class Token {
	9	public int kind; // token kind
	10	public int pos; // token position in bytes in the source text (starting at 0)
	11	public int charPos; // token position in characters in the source text (starting at 0)
	12	public int col; // token column (starting at 1)
	13	public int line; // token line (starting at 1)
	14	public string val; // token value
	15	public Token next; // ML 2005-03-11 Tokens are kept in linked list
	16	}
	17
	18	//-----------------------------------------------------------------------------------
	19	// Buffer
	20	//-----------------------------------------------------------------------------------
	21	public class Buffer {
	22	// This Buffer supports the following cases:
	23	// 1) seekable stream (file)
	24	// a) whole stream in buffer
	25	// b) part of stream in buffer
	26	// 2) non seekable stream (network, console)
	27
	28	public const int EOF = char.MaxValue + 1;
	29	const int MIN_BUFFER_LENGTH = 1024; // 1KB
	30	const int MAX_BUFFER_LENGTH = MIN_BUFFER_LENGTH * 64; // 64KB
	31	byte[] buf; // input buffer
	32	int bufStart; // position of first byte in buffer relative to input stream
	33	int bufLen; // length of buffer
	34	int fileLen; // length of input stream (may change if the stream is no file)
	35	int bufPos; // current position in buffer
	36	Stream stream; // input stream (seekable)
	37	bool isUserStream; // was the stream opened by the user?
	38
	39	public Buffer (Stream s, bool isUserStream) {
	40	stream = s; this.isUserStream = isUserStream;
	41
	42	if (stream.CanSeek) {
	43	fileLen = (int) stream.Length;
	44	bufLen = Math.Min(fileLen, MAX_BUFFER_LENGTH);
	45	bufStart = Int32.MaxValue; // nothing in the buffer so far
	46	} else {
	47	fileLen = bufLen = bufStart = 0;
	48	}
	49
	50	buf = new byte[(bufLen>0) ? bufLen : MIN_BUFFER_LENGTH];
	51	if (fileLen > 0) Pos = 0; // setup buffer to position 0 (start)
	52	else bufPos = 0; // index 0 is already after the file, thus Pos = 0 is invalid
	53	if (bufLen == fileLen && stream.CanSeek) Close();
	54	}
	55
	56	protected Buffer(Buffer b) { // called in UTF8Buffer constructor
	57	buf = b.buf;
	58	bufStart = b.bufStart;
	59	bufLen = b.bufLen;
	60	fileLen = b.fileLen;
	61	bufPos = b.bufPos;
	62	stream = b.stream;
	63	// keep destructor from closing the stream
	64	b.stream = null;
	65	isUserStream = b.isUserStream;
	66	}
	67
	68	~Buffer() { Close(); }
	69
	70	protected void Close() {
	71	if (!isUserStream && stream != null) {
	72	stream.Close();
	73	stream = null;
	74	}
	75	}
	76
	77	public virtual int Read () {
	78	if (bufPos < bufLen) {
	79	return buf[bufPos++];
	80	} else if (Pos < fileLen) {
	81	Pos = Pos; // shift buffer start to Pos
	82	return buf[bufPos++];
	83	} else if (stream != null && !stream.CanSeek && ReadNextStreamChunk() > 0) {
	84	return buf[bufPos++];
	85	} else {
	86	return EOF;
	87	}
	88	}
	89
	90	public int Peek () {
	91	int curPos = Pos;
	92	int ch = Read();
	93	Pos = curPos;
	94	return ch;
	95	}
	96
	97	// beg .. begin, zero-based, inclusive, in byte
	98	// end .. end, zero-based, exclusive, in byte
	99	public string GetString (int beg, int end) {
	100	int len = 0;
	101	char[] buf = new char[end - beg];
	102	int oldPos = Pos;
	103	Pos = beg;
	104	while (Pos < end) buf[len++] = (char) Read();
	105	Pos = oldPos;
	106	return new String(buf, 0, len);
	107	}
	108
	109	public int Pos {
	110	get { return bufPos + bufStart; }
	111	set {
	112	if (value >= fileLen && stream != null && !stream.CanSeek) {
	113	// Wanted position is after buffer and the stream
	114	// is not seek-able e.g. network or console,
	115	// thus we have to read the stream manually till
	116	// the wanted position is in sight.
	117	while (value >= fileLen && ReadNextStreamChunk() > 0);
	118	}
	119
	120	if (value < 0 \|\| value > fileLen) {
	121	throw new FatalError("buffer out of bounds access, position: " + value);
	122	}
	123
	124	if (value >= bufStart && value < bufStart + bufLen) { // already in buffer
	125	bufPos = value - bufStart;
	126	} else if (stream != null) { // must be swapped in
	127	stream.Seek(value, SeekOrigin.Begin);
	128	bufLen = stream.Read(buf, 0, buf.Length);
	129	bufStart = value; bufPos = 0;
	130	} else {
	131	// set the position to the end of the file, Pos will return fileLen.
	132	bufPos = fileLen - bufStart;
	133	}
	134	}
	135	}
	136
	137	// Read the next chunk of bytes from the stream, increases the buffer
	138	// if needed and updates the fields fileLen and bufLen.
	139	// Returns the number of bytes read.
	140	private int ReadNextStreamChunk() {
	141	int free = buf.Length - bufLen;
	142	if (free == 0) {
	143	// in the case of a growing input stream
	144	// we can neither seek in the stream, nor can we
	145	// foresee the maximum length, thus we must adapt
	146	// the buffer size on demand.
	147	byte[] newBuf = new byte[bufLen * 2];
	148	Array.Copy(buf, newBuf, bufLen);
	149	buf = newBuf;
	150	free = bufLen;
	151	}
	152	int read = stream.Read(buf, bufLen, free);
	153	if (read > 0) {
	154	fileLen = bufLen = (bufLen + read);
	155	return read;
	156	}
	157	// end of stream reached
	158	return 0;
	159	}
	160	}
	161
	162	//-----------------------------------------------------------------------------------
	163	// UTF8Buffer
	164	//-----------------------------------------------------------------------------------
	165	public class UTF8Buffer: Buffer {
	166	public UTF8Buffer(Buffer b): base(b) {}
	167
	168	public override int Read() {
	169	int ch;
	170	do {
	171	ch = base.Read();
	172	// until we find a utf8 start (0xxxxxxx or 11xxxxxx)
	173	} while ((ch >= 128) && ((ch & 0xC0) != 0xC0) && (ch != EOF));
	174	if (ch < 128 \|\| ch == EOF) {
	175	// nothing to do, first 127 chars are the same in ascii and utf8
	176	// 0xxxxxxx or end of file character
	177	} else if ((ch & 0xF0) == 0xF0) {
	178	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	179	int c1 = ch & 0x07; ch = base.Read();
	180	int c2 = ch & 0x3F; ch = base.Read();
	181	int c3 = ch & 0x3F; ch = base.Read();
	182	int c4 = ch & 0x3F;
	183	ch = (((((c1 << 6) \| c2) << 6) \| c3) << 6) \| c4;
	184	} else if ((ch & 0xE0) == 0xE0) {
	185	// 1110xxxx 10xxxxxx 10xxxxxx
	186	int c1 = ch & 0x0F; ch = base.Read();
	187	int c2 = ch & 0x3F; ch = base.Read();
	188	int c3 = ch & 0x3F;
	189	ch = (((c1 << 6) \| c2) << 6) \| c3;
	190	} else if ((ch & 0xC0) == 0xC0) {
	191	// 110xxxxx 10xxxxxx
	192	int c1 = ch & 0x1F; ch = base.Read();
	193	int c2 = ch & 0x3F;
	194	ch = (c1 << 6) \| c2;
	195	}
	196	return ch;
	197	}
	198	}
	199
	200	//-----------------------------------------------------------------------------------
	201	// Scanner
	202	//-----------------------------------------------------------------------------------
	203	public class Scanner {
	204	const char EOL = '\n';
	205	const int eofSym = 0; /* pdt */
[10067]	206	const int maxT = 30;
	207	const int noSym = 30;
[9724]	208
	209
	210	public Buffer buffer; // scanner buffer
	211
	212	Token t; // current token
	213	int ch; // current input character
	214	int pos; // byte position of current character
	215	int charPos; // position by unicode characters starting with 0
	216	int col; // column number of current character
	217	int line; // line number of current character
	218	int oldEols; // EOLs that appeared in a comment;
	219	static readonly Hashtable start; // maps first token character to start state
	220
	221	Token tokens; // list of tokens already peeked (first token is a dummy)
	222	Token pt; // current peek token
	223
	224	char[] tval = new char[128]; // text of current token
	225	int tlen; // length of current token
	226
	227	static Scanner() {
	228	start = new Hashtable(128);
[10067]	229	for (int i = 65; i <= 90; ++i) start[i] = 1;
	230	for (int i = 97; i <= 122; ++i) start[i] = 1;
	231	start[60] = 2;
	232	start[62] = 4;
	233	start[46] = 15;
	234	start[61] = 7;
	235	start[124] = 8;
	236	start[40] = 9;
	237	start[41] = 10;
	238	start[91] = 11;
	239	start[93] = 12;
	240	start[123] = 13;
	241	start[125] = 14;
	242	start[Buffer.EOF] = -1;
[9724]	243
	244	}
	245
	246	public Scanner (string fileName) {
	247	try {
	248	Stream stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read);
	249	buffer = new Buffer(stream, false);
	250	Init();
	251	} catch (IOException) {
	252	throw new FatalError("Cannot open file " + fileName);
	253	}
	254	}
	255
	256	public Scanner (Stream s) {
	257	buffer = new Buffer(s, true);
	258	Init();
	259	}
	260
	261	void Init() {
	262	pos = -1; line = 1; col = 0; charPos = -1;
	263	oldEols = 0;
	264	NextCh();
	265	if (ch == 0xEF) { // check optional byte order mark for UTF-8
	266	NextCh(); int ch1 = ch;
	267	NextCh(); int ch2 = ch;
	268	if (ch1 != 0xBB \|\| ch2 != 0xBF) {
	269	throw new FatalError(String.Format("illegal byte order mark: EF {0,2:X} {1,2:X}", ch1, ch2));
	270	}
	271	buffer = new UTF8Buffer(buffer); col = 0; charPos = -1;
	272	NextCh();
	273	}
	274	pt = tokens = new Token(); // first token is a dummy
	275	}
	276
	277	void NextCh() {
	278	if (oldEols > 0) { ch = EOL; oldEols--; }
	279	else {
	280	pos = buffer.Pos;
	281	// buffer reads unicode chars, if UTF8 has been detected
	282	ch = buffer.Read(); col++; charPos++;
	283	// replace isolated '\r' by '\n' in order to make
	284	// eol handling uniform across Windows, Unix and Mac
	285	if (ch == '\r' && buffer.Peek() != '\n') ch = EOL;
	286	if (ch == EOL) { line++; col = 0; }
	287	}
	288
	289	}
	290
	291	void AddCh() {
	292	if (tlen >= tval.Length) {
	293	char[] newBuf = new char[2 * tval.Length];
	294	Array.Copy(tval, 0, newBuf, 0, tval.Length);
	295	tval = newBuf;
	296	}
	297	if (ch != Buffer.EOF) {
	298	tval[tlen++] = (char) ch;
	299	NextCh();
	300	}
	301	}
	302
	303
	304
[10067]	305	bool Comment0() {
	306	int level = 1, pos0 = pos, line0 = line, col0 = col, charPos0 = charPos;
	307	NextCh();
	308	if (ch == '*') {
	309	NextCh();
	310	for(;;) {
	311	if (ch == '*') {
	312	NextCh();
	313	if (ch == '/') {
	314	level--;
	315	if (level == 0) { oldEols = line - line0; NextCh(); return true; }
	316	NextCh();
	317	}
	318	} else if (ch == '/') {
	319	NextCh();
	320	if (ch == '*') {
	321	level++; NextCh();
	322	}
	323	} else if (ch == Buffer.EOF) return false;
	324	else NextCh();
	325	}
	326	} else {
	327	buffer.Pos = pos0; NextCh(); line = line0; col = col0; charPos = charPos0;
	328	}
	329	return false;
	330	}
[9724]	331
[10067]	332
[9724]	333	void CheckLiteral() {
[10067]	334	switch (t.val) {
	335	case "PROBLEM": t.kind = 2; break;
	336	case "CODE": t.kind = 3; break;
	337	case "INIT": t.kind = 6; break;
	338	case "NONTERMINALS": t.kind = 7; break;
	339	case "TERMINALS": t.kind = 8; break;
	340	case "RULES": t.kind = 9; break;
	341	case "MAXIMIZE": t.kind = 10; break;
	342	case "MINIMIZE": t.kind = 11; break;
	343	case "END": t.kind = 12; break;
	344	case "LOCAL": t.kind = 14; break;
	345	case "SEM": t.kind = 15; break;
	346	case "CONSTRAINTS": t.kind = 16; break;
	347	case "IN": t.kind = 17; break;
	348	case "SET": t.kind = 18; break;
	349	case "RANGE": t.kind = 19; break;
	350	case "EPS": t.kind = 23; break;
	351	default: break;
[9724]	352	}
	353	}
	354
	355	Token NextToken() {
	356	while (ch == ' ' \|\|
	357	ch >= 9 && ch <= 10 \|\| ch == 13
	358	) NextCh();
	359	if (ch == '/' && Comment0()) return NextToken();
	360	int recKind = noSym;
	361	int recEnd = pos;
	362	t = new Token();
	363	t.pos = pos; t.col = col; t.line = line; t.charPos = charPos;
	364	int state;
	365	if (start.ContainsKey(ch)) { state = (int) start[ch]; }
	366	else { state = 0; }
	367	tlen = 0; AddCh();
	368
	369	switch (state) {
	370	case -1: { t.kind = eofSym; break; } // NextCh already done
	371	case 0: {
	372	if (recKind != noSym) {
	373	tlen = recEnd - t.pos;
	374	SetScannerBehindT();
	375	}
	376	t.kind = recKind; break;
	377	} // NextCh already done
[10067]	378	case 1:
	379	recEnd = pos; recKind = 1;
	380	if (ch >= '0' && ch <= '9' \|\| ch >= 'A' && ch <= 'Z' \|\| ch >= 'a' && ch <= 'z') {AddCh(); goto case 1;}
	381	else {t.kind = 1; t.val = new String(tval, 0, tlen); CheckLiteral(); return t;}
	382	case 2:
	383	if (ch == '<') {AddCh(); goto case 3;}
	384	else {goto case 0;}
	385	case 3:
	386	{t.kind = 4; break;}
	387	case 4:
	388	if (ch == '>') {AddCh(); goto case 5;}
	389	else {goto case 0;}
	390	case 5:
	391	{t.kind = 5; break;}
	392	case 6:
	393	{t.kind = 20; break;}
	394	case 7:
	395	{t.kind = 21; break;}
	396	case 8:
	397	{t.kind = 22; break;}
	398	case 9:
	399	{t.kind = 24; break;}
	400	case 10:
	401	{t.kind = 25; break;}
	402	case 11:
	403	{t.kind = 26; break;}
	404	case 12:
	405	{t.kind = 27; break;}
	406	case 13:
	407	{t.kind = 28; break;}
	408	case 14:
	409	{t.kind = 29; break;}
	410	case 15:
	411	recEnd = pos; recKind = 13;
	412	if (ch == '.') {AddCh(); goto case 6;}
	413	else {t.kind = 13; break;}
[9724]	414
	415	}
	416	t.val = new String(tval, 0, tlen);
	417	return t;
	418	}
	419
	420	private void SetScannerBehindT() {
	421	buffer.Pos = t.pos;
	422	NextCh();
	423	line = t.line; col = t.col; charPos = t.charPos;
	424	for (int i = 0; i < tlen; i++) NextCh();
	425	}
	426
	427	// get the next token (possibly a token already seen during peeking)
	428	public Token Scan () {
	429	if (tokens.next == null) {
	430	return NextToken();
	431	} else {
	432	pt = tokens = tokens.next;
	433	return tokens;
	434	}
	435	}
	436
	437	// peek for the next token, ignore pragmas
	438	public Token Peek () {
	439	do {
	440	if (pt.next == null) {
	441	pt.next = NextToken();
	442	}
	443	pt = pt.next;
	444	} while (pt.kind > maxT); // skip pragmas
	445
	446	return pt;
	447	}
	448
	449	// make sure that peeking starts at the current scan position
	450	public void ResetPeek () { pt = tokens; }
	451
	452	} // end Scanner
	453	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences