
source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 272

Last change on this file since 272 was 272, checked in by gkronber, 16 years ago

fixed #158

File size: 10.9 KB
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using HeuristicLab.Data;

namespace HeuristicLab.DataAnalysis {
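  // Parser for HeuristicLab dataset files: a block of metadata lines of the form
  // "@NAME = value ..." followed by rows of numbers separated by spaces, semicolons
  // or tabs. A minimal sketch of such a file (the keys match the properties below,
  // the values are made-up examples):
  //
  //   @PROBLEMNAME = ExampleProblem
  //   @VARIABLENAMES = x1 x2 y
  //   @TARGETVARIABLE = 2
  //   1.0 2.0 3.0
  //   4.0 5.0 6.0
  //
  // Typical use: new DatasetParser().Import("data.txt", true), then read Samples,
  // Rows and Columns ("data.txt" is just a placeholder file name).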
  public class DatasetParser {
    private Tokenizer tokenizer;
    private Dictionary<string, List<Token>> metadata;
    private List<List<double>> samplesList;

    private int rows;
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[] samples;
    public double[] Samples {
      get {
        return samples;
      }
    }

    public string ProblemName {
      get {
        return metadata["PROBLEMNAME"][0].stringValue;
      }
    }

    public string[] VariableNames {
      get {
        List<Token> nameList = metadata["VARIABLENAMES"];
        string[] names = new string[nameList.Count];
        for(int i = 0; i < names.Length; i++) {
          names[i] = nameList[i].stringValue;
        }

        return names;
      }
    }

    public int TargetVariable {
      get {
        return metadata["TARGETVARIABLE"][0].intValue;
      }
    }

    public int MaxTreeHeight {
      get {
        return metadata["MAXIMUMTREEHEIGHT"][0].intValue;
      }
    }

    public int MaxTreeSize {
      get {
        return metadata["MAXIMUMTREESIZE"][0].intValue;
      }
    }

    public int TrainingSamplesStart {
      get {
        if(!metadata.ContainsKey("TRAININGSAMPLESSTART")) return 0;
        else return metadata["TRAININGSAMPLESSTART"][0].intValue;
      }
    }

    public int TrainingSamplesEnd {
      get {
        if(!metadata.ContainsKey("TRAININGSAMPLESEND")) return rows;
        else return metadata["TRAININGSAMPLESEND"][0].intValue;
      }
    }

    public DatasetParser() {
      this.metadata = new Dictionary<string, List<Token>>();
      samplesList = new List<List<double>>();
    }

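    // Parses the given file and flattens the sample rows into the row-major samples
    // array (Rows x Columns). With strict == true any malformed row or unreadable value
    // aborts the import with an exception; otherwise rows are padded, truncated or
    // filled with NaN as needed (see ParseSampleData).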
    public void Import(string importFileName, bool strict) {
      StreamReader reader = new StreamReader(importFileName);
      this.tokenizer = new Tokenizer(reader);
      tokenizer.Separators = new string[] { " ", ";", "\t" };

      try {
        // parse the file
        Parse(strict);
      } finally {
        reader.Close();
      }

      // translate the list of samples into a DoubleMatrixData item
      samples = new double[samplesList.Count * samplesList[0].Count];
      rows = samplesList.Count;
      columns = samplesList[0].Count;

      int i = 0;
      int j = 0;
      foreach(List<double> row in samplesList) {
        j = 0;
        foreach(double element in row) {
          samples[i * columns + j] = element;
          j++;
        }
        i++;
      }
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      At, Assign, NewLine, String, Double, Int
    }

    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;
      public int intValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
        intValue = 0;
      }

      public override string ToString() {
        return stringValue;
      }
    }

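    // Simple line-oriented tokenizer: reads the input one line at a time, splits it on
    // the configured separators, classifies each piece via MakeToken and appends an
    // explicit newline token at the end of every line.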
    class Tokenizer {
      private StreamReader reader;
      private List<Token> tokens;
      private string[] separators;

      public int CurrentLineNumber = 0;
      public string CurrentLine;

      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

      public string[] Separators {
        get { return separators; }
        set { separators = value; }
      }

      public Tokenizer(StreamReader reader) {
        this.reader = reader;
        tokens = new List<Token>();
        ReadNextTokens();
      }

      private void ReadNextTokens() {
        if(!reader.EndOfStream) {
          CurrentLine = reader.ReadLine();
          Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
            return MakeToken(str);
          });

          tokens.AddRange(newTokens);
          tokens.Add(NewlineToken);
          CurrentLineNumber++;
        }
      }

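      // Turns one separator-delimited piece of text into a Token: "@" and "=" map to the
      // shared At/Assign tokens; otherwise the string is tried as an int and then as a
      // double using the invariant, German ("de-DE") and current cultures in that order,
      // and falls back to a plain string token if nothing parses.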
      private Token MakeToken(string strToken) {
        if(strToken == "@")
          return AtToken;
        else if(strToken == "=")
          return AssignmentToken;
        else {
          Token token = new Token(TokenTypeEnum.String, strToken);

          // try invariant culture
          NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat;
          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }
          // try german culture
          currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat;
          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }

          // try current culture
          currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat;
          if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
            token.type = TokenTypeEnum.Int;
            return token;
          } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
            token.type = TokenTypeEnum.Double;
            return token;
          }

          // nothing worked
          return token;
        }
      }

      public Token Peek() {
        return tokens[0];
      }

      public Token Next() {
        Token next = tokens[0];
        tokens.RemoveAt(0);
        if(tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    private void Parse(bool strict) {
      ParseMetaData(strict);
      ParseSampleData(strict);
    }

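    // Reads the numeric sample rows up to the end of the file. In strict mode every row
    // must have exactly as many values as the first row; in non-strict mode short rows
    // are padded with NaN, long rows are truncated and unparsable values become NaN.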
    private void ParseSampleData(bool strict) {
      List<double> row = new List<double>();
      while(tokenizer.HasNext()) {
        Token current = tokenizer.Next();
        if(current.type == TokenTypeEnum.Double) {
          // just take the value
          row.Add(current.doubleValue);
        } else if(current.type == TokenTypeEnum.Int) {
          // translate the int value to double
          row.Add((double)current.intValue);
        } else if(current == Tokenizer.NewlineToken) {
          // when parsing strictly all rows have to have the same number of values
          if(strict) {
            // the first row defines how many samples are needed
            if(samplesList.Count > 0 && samplesList[0].Count != row.Count) {
              Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
                "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.");
            }
          } else if(samplesList.Count > 0) {
            // when we are not strict then fill or drop elements as needed
            if(samplesList[0].Count > row.Count) {
              // fill with NaN
              for(int i = row.Count; i < samplesList[0].Count; i++) {
                row.Add(double.NaN);
              }
            } else if(samplesList[0].Count < row.Count) {
              // drop the last k elements where k = row length - length of the first row
              row.RemoveRange(samplesList[0].Count, row.Count - samplesList[0].Count);
            }
          }

          // add the current row to the collection of rows and start a new row
          samplesList.Add(row);
          row = new List<double>();
        } else {
          // found an unexpected token => raise an error when parsing strictly;
          // when parsing non-strictly unreadable values are replaced by NaN instead
          if(strict) {
            Error("Unknown value " + current + " in line " + tokenizer.CurrentLineNumber +
              "\n" + tokenizer.CurrentLine);
          } else {
            row.Add(double.NaN);
          }
        }
      }
    }

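    // Consumes the leading "@NAME = value ..." lines and stores the value tokens of each
    // line in the metadata dictionary under NAME.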
    private void ParseMetaData(bool strict) {
      while(tokenizer.Peek() == Tokenizer.AtToken) {
        Expect(Tokenizer.AtToken);

        Token nameToken = tokenizer.Next();
        if(nameToken.type != TokenTypeEnum.String)
          throw new Exception("Expected a variable name; got " + nameToken +
            "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);

        Expect(Tokenizer.AssignmentToken);

        List<Token> tokens = new List<Token>();
        Token valueToken = tokenizer.Next();
        while(valueToken != Tokenizer.NewlineToken) {
          tokens.Add(valueToken);
          valueToken = tokenizer.Next();
        }

        metadata[nameToken.stringValue] = tokens;
      }
    }

    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if(actualToken != expectedToken) {
        Error("Expected: " + expectedToken + " got: " + actualToken +
          "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
      }
    }

    private void Error(string message) {
      throw new Exception("Error while parsing.\n" + message);
    }
    #endregion
  }
}