Context Navigation

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 393

Visit:

Last change on this file since 393 was 363, checked in by gkronber, 16 years ago

implemented operator to store the best-of-run solution (with regard to a specific fitness variable).
adapted struct-id infrastructure to allow evaluation of models on validation data.

ticket #194

File size: 12.5 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using HeuristicLab.Data;
27
28	namespace HeuristicLab.DataAnalysis {
29	public class DatasetParser {
30	private const string PROBLEMNAME = "PROBLEMNAME";
31	private const string VARIABLENAMES = "VARIABLENAMES";
32	private const string TARGETVARIABLE = "TARGETVARIABLE";
33	private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
34	private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
35	private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
36	private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
37	private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
38	private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
39	private Tokenizer tokenizer;
40	private Dictionary<string, List<Token>> metadata;
41	private List<List<double>> samplesList;
42
// backing field for Rows; Import assigns it the number of parsed sample rows
private int rows;
/// <summary>
/// Number of sample rows. Import overwrites this value with the number of
/// rows it read from the file.
/// </summary>
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
48
// backing field for Columns; Import assigns it the length of the first parsed row
private int columns;
/// <summary>
/// Number of values per sample row. Import overwrites this value with the
/// length of the first row it read from the file.
/// </summary>
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}
54
// flat sample storage filled by Import; element (row i, column j) lives at index i * columns + j
private double[] samples;
/// <summary>
/// The imported sample values as one flat array in row-major order.
/// The value is null until Import has been called.
/// </summary>
public double[] Samples {
  get { return this.samples; }
}
61
/// <summary>
/// Name of the problem as declared by the PROBLEMNAME metadata entry.
/// Returns "-" when the entry is missing or was declared without a value.
/// </summary>
public string ProblemName {
  get {
    List<Token> values;
    // an entry such as "@ PROBLEMNAME =" is stored with an empty token list,
    // so the list must be checked before its first element is read
    if(metadata.TryGetValue(PROBLEMNAME, out values) && values.Count > 0) {
      return values[0].stringValue;
    }
    return "-";
  }
}
69
/// <summary>
/// Names of the dataset variables. When the VARIABLENAMES metadata entry is
/// present its tokens are returned in declaration order; otherwise one
/// generated name per column is returned ("X000", "X001", ...).
/// </summary>
public string[] VariableNames {
  get {
    List<Token> declared;
    if(!metadata.TryGetValue(VARIABLENAMES, out declared)) {
      // no names declared => synthesize one name for each column
      string[] generated = new string[columns];
      for(int column = 0; column < generated.Length; column++) {
        generated[column] = "X" + column.ToString("000");
      }
      return generated;
    }

    string[] result = new string[declared.Count];
    int position = 0;
    foreach(Token nameToken in declared) {
      result[position] = nameToken.stringValue;
      position++;
    }
    return result;
  }
}
88
/// <summary>
/// Index of the target variable as declared by the TARGETVARIABLE metadata
/// entry. Returns 0 (the first column) when the entry is missing or was
/// declared without a value.
/// </summary>
public int TargetVariable {
  get {
    List<Token> values;
    // an entry declared without a value is stored as an empty list; guard the [0] access
    if(metadata.TryGetValue(TARGETVARIABLE, out values) && values.Count > 0) {
      return values[0].intValue;
    }
    return 0; // default is the first column
  }
}
96
/// <summary>
/// Value of the MAXIMUMTREEHEIGHT metadata entry. Returns 0 when the entry
/// is missing or was declared without a value.
/// </summary>
public int MaxTreeHeight {
  get {
    List<Token> values;
    // an entry declared without a value is stored as an empty list; guard the [0] access
    if(metadata.TryGetValue(MAXIMUMTREEHEIGHT, out values) && values.Count > 0) {
      return values[0].intValue;
    }
    return 0;
  }
}
104
/// <summary>
/// Value of the MAXIMUMTREESIZE metadata entry. Returns 0 when the entry
/// is missing or was declared without a value.
/// </summary>
public int MaxTreeSize {
  get {
    List<Token> values;
    // an entry declared without a value is stored as an empty list; guard the [0] access
    if(metadata.TryGetValue(MAXIMUMTREESIZE, out values) && values.Count > 0) {
      return values[0].intValue;
    }
    return 0;
  }
}
112
/// <summary>
/// Value of the TRAININGSAMPLESSTART metadata entry. Returns 0 when the
/// entry is missing or was declared without a value.
/// </summary>
public int TrainingSamplesStart {
  get {
    List<Token> values;
    // an entry declared without a value is stored as an empty list; guard the [0] access
    if(metadata.TryGetValue(TRAININGSAMPLESSTART, out values) && values.Count > 0) {
      return values[0].intValue;
    }
    return 0;
  }
}
120
/// <summary>
/// Value of the TRAININGSAMPLESEND metadata entry. Returns the current row
/// count when the entry is missing or was declared without a value.
/// </summary>
public int TrainingSamplesEnd {
  get {
    List<Token> values;
    // an entry declared without a value is stored as an empty list; guard the [0] access
    if(metadata.TryGetValue(TRAININGSAMPLESEND, out values) && values.Count > 0) {
      return values[0].intValue;
    }
    return rows;
  }
}
/// <summary>
/// Value of the VALIDATIONSAMPLESSTART metadata entry. Returns 0 when the
/// entry is missing or was declared without a value.
/// </summary>
public int ValidationSamplesStart {
  get {
    List<Token> values;
    // an entry declared without a value is stored as an empty list; guard the [0] access
    if(metadata.TryGetValue(VALIDATIONSAMPLESSTART, out values) && values.Count > 0) {
      return values[0].intValue;
    }
    return 0;
  }
}
135
/// <summary>
/// Value of the VALIDATIONSAMPLESEND metadata entry. Returns the current
/// row count when the entry is missing or was declared without a value.
/// </summary>
public int ValidationSamplesEnd {
  get {
    List<Token> values;
    // an entry declared without a value is stored as an empty list; guard the [0] access
    if(metadata.TryGetValue(VALIDATIONSAMPLESEND, out values) && values.Count > 0) {
      return values[0].intValue;
    }
    return rows;
  }
}
143
/// <summary>
/// Creates a parser with an empty sample list and an empty metadata table.
/// </summary>
public DatasetParser() {
  this.samplesList = new List<List<double>>();
  this.metadata = new Dictionary<string, List<Token>>();
}
148
/// <summary>
/// Reads the given file, parses its metadata and sample rows, and stores the
/// samples as a flat row-major array (see Samples, Rows and Columns).
/// </summary>
/// <param name="importFileName">Path of the file to read.</param>
/// <param name="strict">
/// Passed on to the parser: when true, rows of differing length and
/// unreadable values are reported as errors; when false they are padded,
/// truncated or replaced by NaN.
/// </param>
/// <exception cref="DataFormatException">
/// Thrown when the parser reports an error or when the file contains no sample rows.
/// </exception>
public void Import(string importFileName, bool strict) {
  // start from a clean state so that a second Import does not mix in
  // metadata or rows of a previously imported file
  metadata.Clear();
  samplesList.Clear();

  // 'using' releases the file handle even when constructing the tokenizer
  // or parsing throws
  using(StreamReader reader = new StreamReader(importFileName)) {
    this.tokenizer = new Tokenizer(reader);
    tokenizer.Separators = new string[] { " ", ";", "\t" };

    // parse the file
    Parse(strict);
  }

  // without at least one row the column count is undefined
  if(samplesList.Count == 0) {
    Error("The file does not contain any sample data.", "", tokenizer.CurrentLineNumber);
  }

  // translate the list of samples into one flat array, row after row
  rows = samplesList.Count;
  columns = samplesList[0].Count;
  samples = new double[rows * columns];

  int rowIndex = 0;
  foreach(List<double> row in samplesList) {
    row.CopyTo(samples, rowIndex * columns);
    rowIndex++;
  }
}
177
178	#region tokenizer
/// <summary>
/// Kinds of tokens produced by the tokenizer.
/// </summary>
internal enum TokenTypeEnum {
  At, Assign, NewLine, String, Double, Int
}

/// <summary>
/// A single token of the input. stringValue always holds the raw text;
/// intValue / doubleValue hold the parsed number for Int / Double tokens.
/// </summary>
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue;
  public int intValue;

  /// <summary>
  /// Creates a token of the given type with the given raw text and zeroed numeric values.
  /// </summary>
  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    stringValue = value;
    doubleValue = 0.0;
    intValue = 0;
  }

  /// <summary>Returns the raw text of the token.</summary>
  public override string ToString() {
    return stringValue;
  }
}


/// <summary>
/// Splits the lines of a stream into tokens. Lines are read on demand, i.e.
/// only when a token is requested and none is buffered. Because nothing is
/// read in the constructor, Separators assigned after construction already
/// apply to the first line, and CurrentLine / CurrentLineNumber always
/// describe the line of the most recently delivered token.
/// </summary>
class Tokenizer {
  private StreamReader reader;
  // tokens of the current line that have not been handed out yet
  private Queue<Token> tokens;
  private string[] separators;

  // number of lines read so far (1-based number of the current line; 0 before the first read)
  public int CurrentLineNumber = 0;
  // text of the line that was read last; null before the first read
  public string CurrentLine;

  // shared instances: callers compare tokens against these by reference
  public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
  public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
  public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

  /// <summary>
  /// Strings that separate tokens within a line. When null, String.Split
  /// falls back to splitting at white-space characters.
  /// </summary>
  public string[] Separators {
    get { return separators; }
    set { separators = value; }
  }


  /// <summary>
  /// Creates a tokenizer for the given reader. No input is consumed here so
  /// that Separators can still be configured before the first line is split.
  /// </summary>
  public Tokenizer(StreamReader reader) {
    this.reader = reader;
    tokens = new Queue<Token>();
  }

  // Reads the next line (if any), appends its tokens followed by NewlineToken
  // and advances CurrentLineNumber. Does nothing at the end of the stream.
  private void ReadNextTokens() {
    if(!reader.EndOfStream) {
      CurrentLine = reader.ReadLine();
      foreach(string part in CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
        tokens.Enqueue(MakeToken(part));
      }
      tokens.Enqueue(NewlineToken);
      CurrentLineNumber++;
    }
  }

  // Makes sure a token is buffered; throws when the input is exhausted.
  private void EnsureTokenAvailable() {
    if(tokens.Count == 0) {
      ReadNextTokens();
    }
    if(tokens.Count == 0) {
      throw new InvalidOperationException("The tokenizer has no more tokens.");
    }
  }

  // Tries to read strToken as int and then as double using the given number
  // format. On success the token type and value are set and true is returned.
  private static bool TryParseNumber(string strToken, NumberFormatInfo format, Token token) {
    if(int.TryParse(strToken, NumberStyles.Integer, format, out token.intValue)) {
      token.type = TokenTypeEnum.Int;
      return true;
    }
    if(double.TryParse(strToken, NumberStyles.Float, format, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
      return true;
    }
    return false;
  }

  // Classifies one piece of text: "@" and "=" map to the shared tokens,
  // numbers become Int / Double tokens, everything else stays a String token.
  private Token MakeToken(string strToken) {
    if(strToken == "@") {
      return AtToken;
    }
    if(strToken == "=") {
      return AssignmentToken;
    }

    Token token = new Token(TokenTypeEnum.String, strToken);

    // try invariant culture
    if(TryParseNumber(strToken, CultureInfo.InvariantCulture.NumberFormat, token)) {
      return token;
    }
    // try german culture
    if(TryParseNumber(strToken, CultureInfo.GetCultureInfo("de-DE").NumberFormat, token)) {
      return token;
    }
    // try current culture; when this fails too the token remains a String token
    TryParseNumber(strToken, CultureInfo.CurrentCulture.NumberFormat, token);
    return token;
  }

  /// <summary>
  /// Returns the next token without consuming it.
  /// </summary>
  /// <exception cref="InvalidOperationException">Thrown when no token is left.</exception>
  public Token Peek() {
    EnsureTokenAvailable();
    return tokens.Peek();
  }

  /// <summary>
  /// Consumes and returns the next token.
  /// </summary>
  /// <exception cref="InvalidOperationException">Thrown when no token is left.</exception>
  public Token Next() {
    EnsureTokenAvailable();
    return tokens.Dequeue();
  }

  /// <summary>
  /// Returns true when a token is buffered or the stream still has unread lines.
  /// </summary>
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
298	#endregion
299
300	#region parsing
// Parses the complete input: first the leading metadata entries,
// then all remaining lines as sample rows.
private void Parse(bool strict) {
  this.ParseMetaData(strict);
  this.ParseSampleData(strict);
}
305
// Reads all remaining tokens as sample values and appends one list per input
// line to samplesList. The first row defines the expected number of columns.
//   strict == true : a row of different length or a non-numeric token is
//                    reported through Error (which throws).
//   strict == false: short rows are padded with NaN, surplus values at the
//                    end of long rows are dropped, non-numeric tokens are
//                    stored as NaN.
private void ParseSampleData(bool strict) {
  List<double> row = new List<double>();
  while(tokenizer.HasNext()) {
    Token current = tokenizer.Next();
    if(current.type == TokenTypeEnum.Double) {
      // just take the value
      row.Add(current.doubleValue);
    } else if(current.type == TokenTypeEnum.Int) {
      // translate the int value to double
      row.Add((double)current.intValue);
    } else if(current == Tokenizer.NewlineToken) {
      if(samplesList.Count > 0) {
        // the first row defines how many values every row must have
        int expectedCount = samplesList[0].Count;
        if(strict) {
          // when parsing strictly all rows have to have the same number of values
          if(expectedCount != row.Count) {
            Error("The first row of the dataset has " + expectedCount + " columns." +
              "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
          }
        } else if(row.Count < expectedCount) {
          // when we are not strict then fill with NAN
          for(int i = row.Count; i < expectedCount; i++) {
            row.Add(double.NaN);
          }
        } else if(row.Count > expectedCount) {
          // drop the last k elements where k = n - length of first row;
          // removal starts at index expectedCount so that the first
          // expectedCount values are kept unchanged
          row.RemoveRange(expectedCount, row.Count - expectedCount);
        }
      }

      // add the current row to the collection of rows and start a new row
      samplesList.Add(row);
      row = new List<double>();
    } else {
      // found an unexpected token => error when parsing strictly
      // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
      if(strict) {
        Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      } else {
        row.Add(double.NaN);
      }
    }
  }
}
351
// Parses the leading metadata entries of the form
//   @ NAME = value value ...
// and stores the value tokens of each entry in the metadata table under NAME
// (a later entry with the same name replaces an earlier one). Parsing stops
// at the first line that does not start with "@" or at the end of the input.
// The strict flag is currently not used by this method.
private void ParseMetaData(bool strict) {
  // HasNext guards Peek: an empty file or a file that consists of metadata
  // only has no token left to look at
  while(tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
    Expect(Tokenizer.AtToken);

    Token nameToken = tokenizer.Next();
    if(nameToken.type != TokenTypeEnum.String) {
      Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);
    }

    Expect(Tokenizer.AssignmentToken);

    // collect everything up to the end of the line as the value of the entry
    List<Token> tokens = new List<Token>();
    Token valueToken = tokenizer.Next();
    while(valueToken != Tokenizer.NewlineToken) {
      tokens.Add(valueToken);
      valueToken = tokenizer.Next();
    }

    metadata[nameToken.stringValue] = tokens;
  }
}
372
// Consumes the next token and reports a parse error (via Error, which throws)
// unless it is the very same token instance as expectedToken.
private void Expect(Token expectedToken) {
  Token consumed = tokenizer.Next();
  if(consumed == expectedToken) {
    return;
  }
  Error("Expected: " + expectedToken, consumed.stringValue, tokenizer.CurrentLineNumber);
}
379
// Always throws a DataFormatException whose message is the given text
// prefixed with "Error while parsing.", passing on the offending token
// and the line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
383	#endregion
384	}
385	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences