Context Navigation

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 202

Visit:

Last change on this file since 202 was 173, checked in by gkronber, 17 years ago
Fixed a bug caused by a non-matching string constant
File size: 10.9 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using HeuristicLab.Data;
27
namespace HeuristicLab.DataAnalysis {
  /// <summary>
  /// Reads a text dataset consisting of optional metadata lines followed by rows of numeric samples.
  /// </summary>
  /// <remarks>
  /// Values on a line are separated by spaces, semicolons or tabs. A metadata line has the form
  /// <c>@ NAME = value value ...</c>; the "@" and "=" have to be separate tokens. Every remaining
  /// line is one row of samples. The parsed samples are exposed as a flat, row-major array.
  /// </remarks>
  public class DatasetParser {
    private Tokenizer tokenizer;
    private Dictionary<string, List<Token>> metadata;
    private List<List<double>> samplesList;

    private int rows;
    /// <summary>Number of sample rows found by the last call to <see cref="Import"/>.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>Number of values per row; defined by the first sample row of the imported file.</summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[] samples;
    /// <summary>
    /// The imported samples in row-major order: the element of row i and column j is stored at
    /// index <c>i * Columns + j</c>.
    /// </summary>
    public double[] Samples {
      get { return samples; }
    }

    /// <summary>String value of the first token of the PROBLEMNAME metadata entry.</summary>
    /// <exception cref="KeyNotFoundException">The entry is missing in the imported file.</exception>
    public string ProblemName {
      get { return metadata["PROBLEMNAME"][0].stringValue; }
    }

    /// <summary>String values of all tokens of the VARIABLENAMES metadata entry.</summary>
    /// <exception cref="KeyNotFoundException">The entry is missing in the imported file.</exception>
    public string[] VariableNames {
      get {
        List<Token> nameList = metadata["VARIABLENAMES"];
        string[] names = new string[nameList.Count];
        for (int i = 0; i < names.Length; i++) {
          names[i] = nameList[i].stringValue;
        }
        return names;
      }
    }

    /// <summary>Integer value of the first token of the TARGETVARIABLE metadata entry.</summary>
    /// <exception cref="KeyNotFoundException">The entry is missing in the imported file.</exception>
    public int TargetVariable {
      get { return GetRequiredInt("TARGETVARIABLE"); }
    }

    /// <summary>Integer value of the first token of the MAXIMUMTREEHEIGHT metadata entry.</summary>
    /// <exception cref="KeyNotFoundException">The entry is missing in the imported file.</exception>
    public int MaxTreeHeight {
      get { return GetRequiredInt("MAXIMUMTREEHEIGHT"); }
    }

    /// <summary>Integer value of the first token of the MAXIMUMTREESIZE metadata entry.</summary>
    /// <exception cref="KeyNotFoundException">The entry is missing in the imported file.</exception>
    public int MaxTreeSize {
      get { return GetRequiredInt("MAXIMUMTREESIZE"); }
    }

    /// <summary>
    /// Integer value of the TRAININGSAMPLESSTART metadata entry, or 0 when the entry is missing.
    /// </summary>
    public int TrainingSamplesStart {
      get {
        List<Token> values;
        if (metadata.TryGetValue("TRAININGSAMPLESSTART", out values)) return values[0].intValue;
        return 0;
      }
    }

    /// <summary>
    /// Integer value of the TRAININGSAMPLESEND metadata entry, or <see cref="Rows"/> when the
    /// entry is missing.
    /// </summary>
    public int TrainingSamplesEnd {
      get {
        List<Token> values;
        if (metadata.TryGetValue("TRAININGSAMPLESEND", out values)) return values[0].intValue;
        return rows;
      }
    }

    /// <summary>Creates a parser without any imported data.</summary>
    public DatasetParser() {
      this.metadata = new Dictionary<string, List<Token>>();
      samplesList = new List<List<double>>();
    }

    /// <summary>
    /// Reads the given file and replaces the metadata and samples held by this parser.
    /// </summary>
    /// <param name="importFileName">Path of the file to read.</param>
    /// <param name="strict">
    /// When true, a row whose length differs from the first row or a value that is not a number
    /// raises an error. When false, short rows are padded with NaN, surplus values at the end of
    /// long rows are dropped and unreadable values become NaN.
    /// </param>
    /// <exception cref="InvalidDataException">The file content violates the expected format.</exception>
    public void Import(string importFileName, bool strict) {
      // start from a clean state so that importing twice does not mix the contents of two files
      metadata.Clear();
      samplesList.Clear();

      // the using block guarantees that the file handle is released even when parsing fails
      using (StreamReader reader = new StreamReader(importFileName)) {
        this.tokenizer = new Tokenizer(reader);
        tokenizer.Separators = new string[] { " ", ";", "\t" };

        // parse the file
        Parse(strict);
      }

      // translate the list of rows into one flat row-major array
      rows = samplesList.Count;
      columns = rows > 0 ? samplesList[0].Count : 0; // a file without sample rows yields an empty dataset
      samples = new double[rows * columns];

      int i = 0;
      foreach (List<double> row in samplesList) {
        int j = 0;
        foreach (double element in row) {
          samples[i * columns + j] = element;
          j++;
        }
        i++;
      }
    }

    // Returns the integer value of the first token of a mandatory metadata entry.
    private int GetRequiredInt(string key) {
      return metadata[key][0].intValue;
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int
    }

    /// <summary>
    /// One element of an input line. <c>stringValue</c> always holds the original text; depending
    /// on <c>type</c> either <c>intValue</c> or <c>doubleValue</c> holds the parsed number.
    /// </summary>
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public int intValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }


    /// <summary>
    /// Splits the lines of a reader into tokens. After the tokens of each line a
    /// <see cref="NewlineToken"/> is delivered.
    /// </summary>
    class Tokenizer {
      private StreamReader reader;
      private Queue<Token> tokens;
      private string[] separators;

      // number and text of the line that the most recently delivered token belongs to
      public int CurrentLineNumber = 0;
      public string CurrentLine;

      public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

      public string[] Separators {
        get { return separators; }
        set { separators = value; }
      }


      public Tokenizer(StreamReader reader) {
        this.reader = reader;
        tokens = new Queue<Token>();
        // Nothing is read here: lines are tokenized on demand so that Separators, which is
        // assigned after construction, is also applied to the first line.
      }

      // Tokenizes the next line when all tokens of the previous line have been consumed.
      private void ReadNextTokens() {
        if (tokens.Count > 0 || reader.EndOfStream) return;

        CurrentLine = reader.ReadLine();
        foreach (string part in CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
          tokens.Enqueue(MakeToken(part));
        }
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }

      // Tries to read text as int and then as double using the given number format.
      // On success the value and the type of the token are set.
      private static bool TryParseNumber(Token token, string text, NumberFormatInfo format) {
        if (int.TryParse(text, NumberStyles.Integer, format, out token.intValue)) {
          token.type = TokenTypeEnum.Int;
          return true;
        }
        if (double.TryParse(text, NumberStyles.Float, format, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
          return true;
        }
        return false;
      }

      private Token MakeToken(string strToken) {
        if (strToken == "@") return AtToken;
        if (strToken == "=") return AssignmentToken;

        Token token = new Token(TokenTypeEnum.String, strToken);
        // Try the invariant culture first, then german, then the current culture. The || chain
        // stops at the first format that works; when nothing works the token stays a String.
        bool isNumber =
          TryParseNumber(token, strToken, CultureInfo.InvariantCulture.NumberFormat) ||
          TryParseNumber(token, strToken, CultureInfo.GetCultureInfo("de-DE").NumberFormat) ||
          TryParseNumber(token, strToken, CultureInfo.CurrentCulture.NumberFormat);
        return token;
      }

      /// <summary>Returns the next token without consuming it.</summary>
      /// <exception cref="InvalidOperationException">There are no more tokens.</exception>
      public Token Peek() {
        ReadNextTokens();
        if (tokens.Count == 0) throw new InvalidOperationException("There are no more tokens to read.");
        return tokens.Peek();
      }

      /// <summary>Consumes and returns the next token.</summary>
      /// <exception cref="InvalidOperationException">There are no more tokens.</exception>
      public Token Next() {
        ReadNextTokens();
        if (tokens.Count == 0) throw new InvalidOperationException("There are no more tokens to read.");
        return tokens.Dequeue();
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }

    // Reads all remaining tokens as rows of samples; every newline token closes the current row.
    private void ParseSampleData(bool strict) {
      List<double> row = new List<double>();
      while (tokenizer.HasNext()) {
        Token current = tokenizer.Next();
        if (current.type == TokenTypeEnum.Double) {
          // just take the value
          row.Add(current.doubleValue);
        } else if (current.type == TokenTypeEnum.Int) {
          // translate the int value to double
          row.Add((double)current.intValue);
        } else if (current == Tokenizer.NewlineToken) {
          // the first row defines how many values each row has to have
          if (samplesList.Count > 0) {
            int expectedCount = samplesList[0].Count;
            if (strict) {
              // when parsing strictly all rows have to have the same number of values
              if (expectedCount != row.Count) {
                Error("The first row of the dataset has " + expectedCount + " columns." +
                  "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.");
              }
            } else if (row.Count < expectedCount) {
              // when we are not strict fill short rows with NAN
              while (row.Count < expectedCount) {
                row.Add(double.NaN);
              }
            } else if (row.Count > expectedCount) {
              // drop the surplus values; the first surplus value is at index expectedCount
              row.RemoveRange(expectedCount, row.Count - expectedCount);
            }
          }

          // add the current row to the collection of rows and start a new row
          samplesList.Add(row);
          row = new List<double>();
        } else {
          // found an unexpected token => error when parsing strictly
          // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
          if (strict) {
            Error("Unkown value " + current + " in line " + tokenizer.CurrentLineNumber +
              "\n" + tokenizer.CurrentLine);
          } else {
            row.Add(double.NaN);
          }
        }
      }
    }

    // Reads the leading lines of the form "@ NAME = value value ..." into the metadata dictionary.
    // The strict flag is currently not used here.
    private void ParseMetaData(bool strict) {
      // HasNext guards against empty files and files that contain nothing but metadata
      while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
        Expect(Tokenizer.AtToken);

        Token nameToken = tokenizer.Next();
        if (nameToken.type != TokenTypeEnum.String) {
          Error("Expected a variable name; got " + nameToken +
            "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
        }

        Expect(Tokenizer.AssignmentToken);

        // all tokens up to the end of the line are the values of the entry
        List<Token> tokens = new List<Token>();
        Token valueToken = tokenizer.Next();
        while (valueToken != Tokenizer.NewlineToken) {
          tokens.Add(valueToken);
          valueToken = tokenizer.Next();
        }

        metadata[nameToken.stringValue] = tokens;
      }
    }

    // Consumes the next token and raises an error when it is not the expected one.
    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if (actualToken != expectedToken) {
        Error("Expected: " + expectedToken + " got: " + actualToken +
          "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
      }
    }

    // Single exit point for format errors. InvalidDataException derives from Exception, so
    // callers that catch Exception keep working.
    private void Error(string message) {
      throw new InvalidDataException("Error while parsing.\n" + message);
    }
    #endregion
  }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences