Context Navigation

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 290

Visit:

Last change on this file since 290 was 273, checked in by gkronber, 16 years ago
fixed #160
File size: 11.9 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using HeuristicLab.Data;
27
28	namespace HeuristicLab.DataAnalysis {
29	public class DatasetParser {
// Names of the metadata entries that the properties of this class look up.
private const string PROBLEMNAME = "PROBLEMNAME";
private const string VARIABLENAMES = "VARIABLENAMES";
private const string TARGETVARIABLE = "TARGETVARIABLE";
private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
// Tokenizer over the file that is currently being imported; assigned in Import().
private Tokenizer tokenizer;
// Metadata entries collected by ParseMetaData(): entry name -> value tokens of that line.
// Only assigned in the constructor, hence readonly.
private readonly Dictionary<string, List<Token>> metadata;
// Sample rows collected by ParseSampleData(), one inner list per input line.
// Only assigned in the constructor, hence readonly.
private readonly List<List<double>> samplesList;
40
// Backing field of Rows; Import() sets it to the number of parsed sample rows.
private int rows;
/// <summary>
/// Gets or sets the number of rows. Import() overwrites the value with the
/// number of sample rows it has parsed.
/// </summary>
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
46
// Backing field of Columns; Import() sets it to the length of the first parsed row.
private int columns;
/// <summary>
/// Gets or sets the number of columns. Import() overwrites the value with the
/// number of values in the first sample row.
/// </summary>
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}
52
// Flat sample array filled by Import(): the value of row i, column j is stored
// at index i * columns + j.
private double[] samples;
/// <summary>
/// Gets the flat array of imported sample values (null until Import() has assigned it).
/// </summary>
public double[] Samples {
  get { return this.samples; }
}
59
/// <summary>
/// Gets the first value of the PROBLEMNAME metadata entry.
/// Returns "-" when the entry is missing or was given without a value.
/// </summary>
public string ProblemName {
  get {
    List<Token> values;
    // An entry without values (e.g. "@ PROBLEMNAME =") is stored as an empty token
    // list, so the count has to be checked before the first element is read.
    if(metadata.TryGetValue(PROBLEMNAME, out values) && values.Count > 0) {
      return values[0].stringValue;
    }
    return "-";
  }
}
67
/// <summary>
/// Gets the variable names. When a VARIABLENAMES metadata entry exists its value
/// tokens are returned as strings; otherwise one name per column is generated
/// as "X" followed by the column index formatted with "000".
/// </summary>
public string[] VariableNames {
  get {
    string[] result;
    if(!metadata.ContainsKey(VARIABLENAMES)) {
      // no names declared in the file => generate one name per column
      result = new string[columns];
      for(int col = 0; col < result.Length; col++) {
        result[col] = "X" + col.ToString("000");
      }
      return result;
    }

    // names declared in the file => copy the text of every value token
    List<Token> declared = metadata[VARIABLENAMES];
    result = new string[declared.Count];
    int next = 0;
    foreach(Token nameToken in declared) {
      result[next] = nameToken.stringValue;
      next++;
    }
    return result;
  }
}
86
/// <summary>
/// Gets the integer value of the TARGETVARIABLE metadata entry,
/// or 0 (the first column) when the entry is missing or has no value.
/// </summary>
public int TargetVariable {
  get { return GetIntMetadata(TARGETVARIABLE, 0); }
}

/// <summary>
/// Gets the integer value of the MAXIMUMTREEHEIGHT metadata entry,
/// or 0 when the entry is missing or has no value.
/// </summary>
public int MaxTreeHeight {
  get { return GetIntMetadata(MAXIMUMTREEHEIGHT, 0); }
}

/// <summary>
/// Gets the integer value of the MAXIMUMTREESIZE metadata entry,
/// or 0 when the entry is missing or has no value.
/// </summary>
public int MaxTreeSize {
  get { return GetIntMetadata(MAXIMUMTREESIZE, 0); }
}

/// <summary>
/// Gets the integer value of the TRAININGSAMPLESSTART metadata entry,
/// or 0 when the entry is missing or has no value.
/// </summary>
public int TrainingSamplesStart {
  get { return GetIntMetadata(TRAININGSAMPLESSTART, 0); }
}

/// <summary>
/// Gets the integer value of the TRAININGSAMPLESEND metadata entry,
/// or the current number of rows when the entry is missing or has no value.
/// </summary>
public int TrainingSamplesEnd {
  get { return GetIntMetadata(TRAININGSAMPLESEND, rows); }
}

/// <summary>
/// Returns the intValue of the first value token of the metadata entry
/// <paramref name="key"/>, or <paramref name="defaultValue"/> when the entry does
/// not exist or was declared without any value (empty token list).
/// </summary>
private int GetIntMetadata(string key, int defaultValue) {
  List<Token> values;
  // the count check prevents an out-of-range access for entries like "@ KEY ="
  if(metadata.TryGetValue(key, out values) && values.Count > 0) {
    return values[0].intValue;
  }
  return defaultValue;
}
126
/// <summary>
/// Creates a parser with empty metadata and an empty list of sample rows.
/// </summary>
public DatasetParser() {
  samplesList = new List<List<double>>();
  metadata = new Dictionary<string, List<Token>>();
}
131
/// <summary>
/// Reads the file <paramref name="importFileName"/>, parses its metadata entries
/// and sample rows and stores the samples as a flat array in Samples
/// (row i, column j at index i * Columns + j). Rows and Columns are updated accordingly.
/// </summary>
/// <param name="importFileName">Path of the file to read.</param>
/// <param name="strict">
/// Passed on to the parser: when true, unexpected tokens and rows whose length differs
/// from the first row are reported as errors; when false they are repaired with NaN
/// values / by dropping surplus values.
/// </param>
public void Import(string importFileName, bool strict) {
  // 'using' also releases the file when the Tokenizer constructor itself throws
  using(StreamReader reader = new StreamReader(importFileName)) {
    this.tokenizer = new Tokenizer(reader);
    tokenizer.Separators = new string[] { " ", ";", "\t" };

    // parse the file
    Parse(strict);
  }

  // translate the list of sample rows into one flat array;
  // a file without any sample row yields an empty array instead of an out-of-range access
  rows = samplesList.Count;
  columns = rows > 0 ? samplesList[0].Count : 0;
  samples = new double[rows * columns];

  for(int i = 0; i < rows; i++) {
    List<double> row = samplesList[i];
    for(int j = 0; j < row.Count; j++) {
      samples[i * columns + j] = row[j];
    }
  }
}
160
161	#region tokenizer
/// <summary>
/// Kinds of tokens produced by the tokenizer.
/// </summary>
internal enum TokenTypeEnum {
  At,       // the "@" marker
  Assign,   // the "=" sign
  NewLine,  // end of an input line
  String,   // any text that could not be read as a number
  Double,   // a floating point value
  Int       // an integer value
}
165
/// <summary>
/// A single token of the input: its kind, its original text and the numeric
/// values that the tokenizer fills in for Int and Double tokens.
/// </summary>
internal class Token {
  // These stay public fields (not properties): the tokenizer passes intValue and
  // doubleValue as 'out' arguments to TryParse.
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue;
  public int intValue;

  /// <summary>
  /// Creates a token of the given kind with the given text; both numeric values start at zero.
  /// </summary>
  public Token(TokenTypeEnum type, string value) {
    this.intValue = 0;
    this.doubleValue = 0.0;
    this.stringValue = value;
    this.type = type;
  }

  /// <summary>Returns the text of the token.</summary>
  public override string ToString() {
    return this.stringValue;
  }
}
183
184
/// <summary>
/// Splits the lines of a text stream into tokens. Every input line contributes the
/// tokens found between the separators followed by one NewlineToken.
/// Lines are read on demand (on the first Peek()/Next() that needs them), so that
/// Separators assigned after construction already apply to the first line.
/// </summary>
class Tokenizer {
  private StreamReader reader;
  // tokens of the line that was read most recently
  private List<Token> tokens;
  // index of the next unread element of 'tokens'
  private int position;
  private string[] separators;

  // number of lines read so far, i.e. the 1-based number of the line that the
  // most recently returned token belongs to
  public int CurrentLineNumber = 0;
  // text of the line that was read most recently
  public string CurrentLine;

  // Shared instances: the parser recognizes these tokens by reference comparison.
  public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
  public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
  public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

  /// <summary>
  /// Strings that separate tokens within a line. When this is null or empty,
  /// String.Split falls back to splitting on white-space characters.
  /// </summary>
  public string[] Separators {
    get { return separators; }
    set { separators = value; }
  }

  /// <summary>
  /// Creates a tokenizer for <paramref name="reader"/>. Nothing is read yet; the first
  /// line is read by the first call of Peek() or Next().
  /// </summary>
  public Tokenizer(StreamReader reader) {
    this.reader = reader;
    tokens = new List<Token>();
    position = 0;
  }

  // Replaces the (fully consumed) token buffer with the tokens of the next line.
  // Does nothing when the end of the stream has been reached.
  private void ReadNextTokens() {
    if(reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    string[] parts = CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    foreach(string part in parts) {
      tokens.Add(MakeToken(part));
    }
    tokens.Add(NewlineToken);
    CurrentLineNumber++;
  }

  // Makes sure that the buffer holds an unread token if the stream has any left.
  private void EnsureTokens() {
    if(position >= tokens.Count) {
      tokens.Clear();
      position = 0;
      ReadNextTokens();
    }
  }

  // Translates one piece of text into a token: "@" and "=" map to the shared marker
  // tokens, text that can be read as a number becomes an Int or Double token and
  // everything else stays a String token.
  private Token MakeToken(string strToken) {
    if(strToken == "@") return AtToken;
    if(strToken == "=") return AssignmentToken;

    Token token = new Token(TokenTypeEnum.String, strToken);
    // number formats are tried in this order: invariant, German, current culture
    if(!TryParseNumber(token, CultureInfo.InvariantCulture.NumberFormat) &&
       !TryParseNumber(token, CultureInfo.GetCultureInfo("de-DE").NumberFormat)) {
      TryParseNumber(token, CultureInfo.CurrentCulture.NumberFormat);
    }
    // when nothing worked the token is still a String token
    return token;
  }

  // Tries to read the text of 'token' as an integer and then as a floating point
  // number using 'format'. On success the matching value field and the type of the
  // token are set and true is returned.
  private static bool TryParseNumber(Token token, NumberFormatInfo format) {
    if(int.TryParse(token.stringValue, NumberStyles.Integer, format, out token.intValue)) {
      token.type = TokenTypeEnum.Int;
      return true;
    }
    if(double.TryParse(token.stringValue, NumberStyles.Float, format, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
      return true;
    }
    return false;
  }

  /// <summary>
  /// Returns the next token without consuming it.
  /// Throws ArgumentOutOfRangeException when no token is left (see HasNext()).
  /// </summary>
  public Token Peek() {
    EnsureTokens();
    return tokens[position];
  }

  /// <summary>
  /// Returns the next token and consumes it.
  /// Throws ArgumentOutOfRangeException when no token is left (see HasNext()).
  /// </summary>
  public Token Next() {
    EnsureTokens();
    Token next = tokens[position];
    // advancing an index instead of removing the first list element keeps this O(1)
    position++;
    return next;
  }

  /// <summary>
  /// Returns true when an unread token is buffered or the stream has more lines.
  /// </summary>
  public bool HasNext() {
    return position < tokens.Count || !reader.EndOfStream;
  }
}
281	#endregion
282
283	#region parsing
/// <summary>
/// Parses the whole input: first the leading metadata entries, then the sample rows.
/// </summary>
/// <param name="strict">Handed on unchanged to both parsing steps.</param>
private void Parse(bool strict) {
  // the metadata block precedes the samples, so the order of the calls matters
  this.ParseMetaData(strict);
  this.ParseSampleData(strict);
}
288
/// <summary>
/// Reads all remaining tokens as sample values and appends one list of values per
/// input line to samplesList. The first row defines the expected number of values.
/// </summary>
/// <param name="strict">
/// When true, a row with a different number of values than the first row or a token
/// that is not a number is reported through Error(). When false, short rows are filled
/// up with NaN, surplus values at the end of long rows are dropped and unreadable
/// tokens are replaced by NaN.
/// </param>
private void ParseSampleData(bool strict) {
  List<double> row = new List<double>();
  while(tokenizer.HasNext()) {
    Token current = tokenizer.Next();
    if(current.type == TokenTypeEnum.Double) {
      // just take the value
      row.Add(current.doubleValue);
    } else if(current.type == TokenTypeEnum.Int) {
      // translate the int value to double
      row.Add((double)current.intValue);
    } else if(current == Tokenizer.NewlineToken) {
      // the first row defines how many values every following row has to have
      if(samplesList.Count > 0) {
        int expected = samplesList[0].Count;
        if(strict) {
          // when parsing strictly all rows have to have the same number of values
          if(expected != row.Count) {
            Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
              "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
          }
        } else if(expected > row.Count) {
          // fill with NAN
          while(row.Count < expected) {
            row.Add(double.NaN);
          }
        } else if(expected < row.Count) {
          // drop the last k elements where k = n - length of first row;
          // the removal has to start at index 'expected' (not expected - 1), otherwise
          // a value that belongs to the row is dropped and the trailing one is kept
          row.RemoveRange(expected, row.Count - expected);
        }
      }

      // add the current row to the collection of rows and start a new row
      samplesList.Add(row);
      row = new List<double>();
    } else {
      // found an unexpected token => error when parsing strictly
      // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
      if(strict) {
        Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      } else {
        row.Add(double.NaN);
      }
    }
  }
}
334
/// <summary>
/// Reads the leading metadata entries of the input. Every entry has the form
/// "@ name = value value ..." on one line; the value tokens are stored in
/// metadata under the name. Parsing stops at the first line that does not
/// start with the "@" token or at the end of the input.
/// </summary>
/// <param name="strict">Not used by this method.</param>
private void ParseMetaData(bool strict) {
  // HasNext() has to be checked first: Peek() throws on an empty input and after
  // the last line of an input that consists of metadata only
  while(tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
    Expect(Tokenizer.AtToken);

    Token nameToken = tokenizer.Next();
    if(nameToken.type != TokenTypeEnum.String)
      Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);

    Expect(Tokenizer.AssignmentToken);

    // collect everything up to the end of the line as the values of the entry;
    // the tokenizer terminates every line with a NewlineToken, so this loop ends
    List<Token> tokens = new List<Token>();
    Token valueToken = tokenizer.Next();
    while(valueToken != Tokenizer.NewlineToken) {
      tokens.Add(valueToken);
      valueToken = tokenizer.Next();
    }

    // a repeated entry replaces the values of the earlier one
    metadata[nameToken.stringValue] = tokens;
  }
}
355
/// <summary>
/// Consumes the next token and reports an error through Error() unless it is
/// the very same instance as <paramref name="expectedToken"/>.
/// </summary>
private void Expect(Token expectedToken) {
  Token found = tokenizer.Next();
  // marker tokens are shared instances, so a reference comparison is sufficient
  if(found == expectedToken) {
    return;
  }
  Error("Expected: " + expectedToken, found.stringValue, tokenizer.CurrentLineNumber);
}
362
/// <summary>
/// Aborts parsing by throwing a DataFormatException. The given message is prefixed
/// with "Error while parsing." and passed on together with the offending token
/// text and the line number.
/// </summary>
private void Error(string message, string token, int lineNumber) {
  string text = "Error while parsing.\n" + message;
  throw new DataFormatException(text, token, lineNumber);
}
366	#endregion
367	}
368	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences