Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 397

Last change on this file since 397 was 397, checked in by gkronber, 16 years ago

fixed #206

File size: 13.0 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.IO;
26using HeuristicLab.Data;
27
28namespace HeuristicLab.DataAnalysis {
29  public class DatasetParser {
    // Names of the recognized metadata entries in the dataset header
    // (header lines have the form "@NAME = value ...", see ParseMetaData).
    private const string PROBLEMNAME = "PROBLEMNAME";
    private const string VARIABLENAMES = "VARIABLENAMES";
    private const string TARGETVARIABLE = "TARGETVARIABLE";
    private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
    private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
    private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
    private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
    private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
    private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
    private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";
    private const string TESTSAMPLESEND = "TESTSAMPLESEND";
    private Tokenizer tokenizer;                        // token stream over the input file
    private Dictionary<string, List<Token>> metadata;   // header entry name -> its value tokens
    private List<List<double>> samplesList;             // parsed rows, one inner list per input line
44
    // number of parsed sample rows (set by Import)
    private int rows;
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }
50
    // number of values per row, taken from the first parsed row (set by Import)
    private int columns;
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }
56
    // all parsed values in row-major order: samples[row * Columns + column]
    private double[] samples;
    public double[] Samples {
      get {
        return samples;
      }
    }
63
64    public string ProblemName {
65      get {
66        if(metadata.ContainsKey(PROBLEMNAME)) {
67          return metadata[PROBLEMNAME][0].stringValue;
68        } else return "-";
69      }
70    }
71
72    public string[] VariableNames {
73      get {
74        if(metadata.ContainsKey(VARIABLENAMES)) {
75          List<Token> nameList = metadata[VARIABLENAMES];
76          string[] names = new string[nameList.Count];
77          for(int i = 0; i < names.Length; i++) {
78            names[i] = nameList[i].stringValue;
79          }
80          return names;
81        } else {
82          string[] names = new string[columns];
83          for(int i = 0; i < names.Length; i++) {
84            names[i] = "X" + i.ToString("000");
85          }
86          return names;
87        }
88      }
89    }
90
91    public int TargetVariable {
92      get {
93        if(metadata.ContainsKey(TARGETVARIABLE)) {
94          return metadata[TARGETVARIABLE][0].intValue;
95        } else return 0; // default is the first column
96      }
97    }
98
99    public int MaxTreeHeight {
100      get {
101        if(metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {
102          return metadata[MAXIMUMTREEHEIGHT][0].intValue;
103        } else return 0;
104      }
105    }
106
107    public int MaxTreeSize {
108      get {
109        if(metadata.ContainsKey(MAXIMUMTREESIZE)) {
110          return metadata[MAXIMUMTREESIZE][0].intValue;
111        } else return 0;
112      }
113    }
114
115    public int TrainingSamplesStart {
116      get {
117        if(metadata.ContainsKey(TRAININGSAMPLESSTART)) {
118          return metadata[TRAININGSAMPLESSTART][0].intValue;
119        } else return 0;
120      }
121    }
122
123    public int TrainingSamplesEnd {
124      get {
125        if(metadata.ContainsKey(TRAININGSAMPLESEND)) {
126          return metadata[TRAININGSAMPLESEND][0].intValue;
127        } else return rows;
128      }
129    }
130    public int ValidationSamplesStart {
131      get {
132        if(metadata.ContainsKey(VALIDATIONSAMPLESSTART)) {
133          return metadata[VALIDATIONSAMPLESSTART][0].intValue;
134        } else return 0;
135      }
136    }
137
138    public int ValidationSamplesEnd {
139      get {
140        if(metadata.ContainsKey(VALIDATIONSAMPLESEND)) {
141          return metadata[VALIDATIONSAMPLESEND][0].intValue;
142        } else return rows;
143      }
144    }
145    public int TestSamplesStart {
146      get {
147        if(metadata.ContainsKey(TESTSAMPLESSTART)) {
148          return metadata[TESTSAMPLESSTART][0].intValue;
149        } else return 0;
150      }
151    }
152
153    public int TestSamplesEnd {
154      get {
155        if(metadata.ContainsKey(TESTSAMPLESEND)) {
156          return metadata[TESTSAMPLESEND][0].intValue;
157        } else return rows;
158      }
159    }
160
161    public DatasetParser() {
162      this.metadata = new Dictionary<string, List<Token>>();
163      samplesList = new List<List<double>>();
164    }
165
166    public void Import(string importFileName, bool strict) {
167      StreamReader reader = new StreamReader(importFileName);
168      this.tokenizer = new Tokenizer(reader);
169      tokenizer.Separators = new string[] { " ", ";", "\t" };
170
171      try {
172        // parse the file
173        Parse(strict);
174      } finally {
175        reader.Close();
176      }
177
178      // translate the list of samples into a DoubleMatrixData item
179      samples = new double[samplesList.Count * samplesList[0].Count];
180      rows = samplesList.Count;
181      columns = samplesList[0].Count;
182
183      int i = 0;
184      int j = 0;
185      foreach(List<double> row in samplesList) {
186        j = 0;
187        foreach(double element in row) {
188          samples[i * columns + j] = element;
189          j++;
190        }
191        i++;
192      }
193    }
194
195    #region tokenizer
196    internal enum TokenTypeEnum {
197      At, Assign, NewLine, String, Double, Int
198    }
199
200    internal class Token {
201      public TokenTypeEnum type;
202      public string stringValue;
203      public double doubleValue;
204      public int intValue;
205
206      public Token(TokenTypeEnum type, string value) {
207        this.type = type;
208        stringValue = value;
209        doubleValue = 0.0;
210        intValue = 0;
211      }
212
213      public override string ToString() {
214        return stringValue;
215      }
216    }
217
218
219    class Tokenizer {
220      private StreamReader reader;
221      private List<Token> tokens;
222      private string[] separators;
223
224      public int CurrentLineNumber = 0;
225      public string CurrentLine;
226
227      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
228      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
229      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
230
231      public string[] Separators {
232        get { return separators; }
233        set { separators = value; }
234      }
235
236
237      public Tokenizer(StreamReader reader) {
238        this.reader = reader;
239        tokens = new List<Token>();
240        ReadNextTokens();
241      }
242
243      private void ReadNextTokens() {
244        if(!reader.EndOfStream) {
245          CurrentLine = reader.ReadLine();
246          Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
247            return MakeToken(str);
248          });
249
250          tokens.AddRange(newTokens);
251          tokens.Add(NewlineToken);
252          CurrentLineNumber++;
253        }
254      }
255
256      private Token MakeToken(string strToken) {
257        if(strToken == "@")
258          return AtToken;
259        else if(strToken == "=")
260          return AssignmentToken;
261        else {
262          Token token = new Token(TokenTypeEnum.String, strToken);
263
264          // try invariant culture
265          NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat;
266          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
267            token.type = TokenTypeEnum.Int;
268            return token;
269          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
270            token.type = TokenTypeEnum.Double;
271            return token;
272          }
273          // try german culture
274          currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat;
275          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
276            token.type = TokenTypeEnum.Int;
277            return token;
278          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
279            token.type = TokenTypeEnum.Double;
280            return token;
281          }
282
283          // try current culture
284          currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat;
285          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
286            token.type = TokenTypeEnum.Int;
287            return token;
288          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
289            token.type = TokenTypeEnum.Double;
290            return token;
291          }
292
293          // nothing worked
294          return token;
295        }
296      }
297
298      public Token Peek() {
299        return tokens[0];
300      }
301
302      public Token Next() {
303        Token next = tokens[0];
304        tokens.RemoveAt(0);
305        if(tokens.Count == 0) {
306          ReadNextTokens();
307        }
308        return next;
309      }
310
311      public bool HasNext() {
312        return tokens.Count > 0 || !reader.EndOfStream;
313      }
314    }
315    #endregion
316
317    #region parsing
    // A dataset file consists of a metadata header ("@NAME = ..." lines)
    // followed by the numeric sample rows; parse them in that order.
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }
322
323    private void ParseSampleData(bool strict) {
324      List<double> row = new List<double>();
325      while(tokenizer.HasNext()) {
326        Token current = tokenizer.Next();
327        if(current.type == TokenTypeEnum.Double) {
328          // just take the value
329          row.Add(current.doubleValue);
330        } else if(current.type == TokenTypeEnum.Int) {
331          // translate the int value to double
332          row.Add((double)current.intValue);
333        } else if(current == Tokenizer.NewlineToken) {
334          // when parsing strictly all rows have to have the same number of values           
335          if(strict) {
336            // the first row defines how many samples are needed
337            if(samplesList.Count > 0 && samplesList[0].Count != row.Count) {
338              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
339                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
340            }
341          } else if(samplesList.Count > 0) {
342            // when we are not strict then fill or drop elements as needed
343            if(samplesList[0].Count > row.Count) {
344              // fill with NAN
345              for(int i = row.Count; i < samplesList[0].Count; i++) {
346                row.Add(double.NaN);
347              }
348            } else if(samplesList[0].Count < row.Count) {
349              // drop last k elements where k = n - length of first row
350              row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count);
351            }
352          }
353
354          // add the current row to the collection of rows and start a new row
355          samplesList.Add(row);
356          row = new List<double>();
357        } else {
358          // found an unexpected token => return false when parsing strictly
359          // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
360          if(strict) {
361            Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
362          } else {
363            row.Add(double.NaN);
364          }
365        }
366      }
367    }
368
369    private void ParseMetaData(bool strict) {
370      while(tokenizer.Peek() == Tokenizer.AtToken) {
371        Expect(Tokenizer.AtToken);
372
373        Token nameToken = tokenizer.Next();
374        if(nameToken.type != TokenTypeEnum.String)
375          Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);
376
377        Expect(Tokenizer.AssignmentToken);
378
379        List<Token> tokens = new List<Token>();
380        Token valueToken = tokenizer.Next();
381        while(valueToken != Tokenizer.NewlineToken) {
382          tokens.Add(valueToken);
383          valueToken = tokenizer.Next();
384        }
385
386        metadata[nameToken.stringValue] = tokens;
387      }
388    }
389
390    private void Expect(Token expectedToken) {
391      Token actualToken = tokenizer.Next();
392      if(actualToken != expectedToken) {
393        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
394      }
395    }
396
397    private void Error(string message, string token, int lineNumber) {
398      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
399    }
400    #endregion
401  }
402}
Note: See TracBrowser for help on using the repository browser.