Context Navigation

source: branches/3.0/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 11322

Visit:

Last change on this file since 11322 was 2, checked in by swagner, 17 years ago
Added HeuristicLab 3.0 sources from former SVN repository at revision 52
File size: 10.7 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using HeuristicLab.Data;
27
28	namespace HeuristicLab.DataAnalysis {
/// <summary>
/// Reads a dataset from a text file. The file may start with metadata lines of the form
/// "@ NAME = value value ..." and continues with rows of numeric samples.
/// Tokens are separated by blanks, semicolons or tabs.
/// </summary>
public class DatasetParser {
  private Tokenizer tokenizer;
  private Dictionary<string, List<Token>> metadata;
  private List<List<double>> samplesList;

  private int rows;
  /// <summary>Number of sample rows produced by the last call to Import.</summary>
  public int Rows {
    get { return rows; }
    set { rows = value; }
  }

  private int columns;
  /// <summary>Number of values per sample row; defined by the first sample row of the file.</summary>
  public int Columns {
    get { return columns; }
    set { columns = value; }
  }

  private double[] samples;
  /// <summary>
  /// All samples in one flat array; the value of row r and column c is stored at index r * Columns + c.
  /// </summary>
  public double[] Samples {
    get { return samples; }
  }

  /// <summary>String value of the first token of the PROBLEMNAME metadata entry.</summary>
  public string ProblemName {
    get { return FirstMetadataToken("PROBLEMNAME").stringValue; }
  }

  /// <summary>String values of all tokens of the VARIABLENAMES metadata entry.</summary>
  public string[] VariableNames {
    get {
      List<Token> nameList = metadata["VARIABLENAMES"];
      string[] names = new string[nameList.Count];
      for (int i = 0; i < names.Length; i++) {
        names[i] = nameList[i].stringValue;
      }
      return names;
    }
  }

  /// <summary>Integer value of the first token of the TARGETVARIABLE metadata entry.</summary>
  public int TargetVariable {
    get { return FirstMetadataToken("TARGETVARIABLE").intValue; }
  }

  /// <summary>Integer value of the first token of the MAXIMUMTREEHEIGHT metadata entry.</summary>
  public int MaxTreeHeight {
    get { return FirstMetadataToken("MAXIMUMTREEHEIGHT").intValue; }
  }

  /// <summary>Integer value of the first token of the MAXIMUMTREESIZE metadata entry.</summary>
  public int MaxTreeSize {
    get { return FirstMetadataToken("MAXIMUMTREESIZE").intValue; }
  }

  /// <summary>Integer value of the first token of the TRAININGSAMPLESSTART metadata entry.</summary>
  public int TrainingSamplesStart {
    get { return FirstMetadataToken("TRAININGSAMPLESSTART").intValue; }
  }

  /// <summary>Integer value of the first token of the TRAININGSAMPLESEND metadata entry.</summary>
  public int TrainingSamplesEnd {
    get { return FirstMetadataToken("TRAININGSAMPLESEND").intValue; }
  }

  public DatasetParser() {
    this.metadata = new Dictionary<string, List<Token>>();
    samplesList = new List<List<double>>();
  }

  // Returns the first value token of a metadata entry. The dictionary and list indexers throw
  // (KeyNotFoundException / ArgumentOutOfRangeException) when the entry is missing or has no value.
  private Token FirstMetadataToken(string key) {
    return metadata[key][0];
  }

  /// <summary>
  /// Parses the given file and fills the metadata properties, Rows, Columns and Samples.
  /// </summary>
  /// <param name="importFileName">Path of the file to read.</param>
  /// <param name="strict">
  /// When true, a row whose length differs from the first row or a value that is not a number
  /// raises an exception. When false, short rows are padded with NaN, surplus values are dropped
  /// and unreadable values are replaced by NaN.
  /// </param>
  /// <exception cref="Exception">Thrown when the file content violates the expected format.</exception>
  public void Import(string importFileName, bool strict) {
    // forget the result of a previous import so that a parser instance can be reused
    metadata.Clear();
    samplesList.Clear();

    // the using block guarantees that the file handle is released even when parsing fails
    using (StreamReader reader = new StreamReader(importFileName)) {
      this.tokenizer = new Tokenizer(reader);
      tokenizer.Separators = new string[] { " ", ";", "\t" };

      // parse the file
      Parse(strict);
    }

    // flatten the list of rows into one array; a file without samples yields an empty array
    rows = samplesList.Count;
    columns = rows > 0 ? samplesList[0].Count : 0;
    samples = new double[rows * columns];
    for (int i = 0; i < rows; i++) {
      samplesList[i].CopyTo(samples, i * columns);
    }
  }

  #region tokenizer
  internal enum TokenTypeEnum {
    At, Assign, NewLine, String, Double, Int
  }

  /// <summary>
  /// One lexical unit of the input. stringValue always holds the raw text; intValue or
  /// doubleValue holds the parsed number when type is Int or Double respectively.
  /// </summary>
  internal class Token {
    public TokenTypeEnum type;
    public string stringValue;
    public double doubleValue;
    public int intValue;

    public Token(TokenTypeEnum type, string value) {
      this.type = type;
      stringValue = value;
      doubleValue = 0.0;
      intValue = 0;
    }

    public override string ToString() {
      return stringValue;
    }
  }


  /// <summary>
  /// Splits the input line by line into tokens. Every line is terminated by NewlineToken.
  /// Lines are read on demand, so CurrentLine and CurrentLineNumber always describe the line
  /// of the token returned last, and Separators assigned after construction apply to the first line too.
  /// </summary>
  class Tokenizer {
    private StreamReader reader;
    // tokens of the current line; entries before 'position' have already been consumed
    private List<Token> tokens;
    private int position;
    private string[] separators;

    public int CurrentLineNumber = 0;
    public string CurrentLine;

    public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
    public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
    public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

    public string[] Separators {
      get { return separators; }
      set { separators = value; }
    }


    public Tokenizer(StreamReader reader) {
      this.reader = reader;
      tokens = new List<Token>();
      position = 0;
      // nothing is read here: the separators are not known yet
    }

    // Loads the next line when all tokens of the current line have been consumed.
    private void EnsureTokens() {
      if (position >= tokens.Count && !reader.EndOfStream) {
        ReadNextTokens();
      }
    }

    private void ReadNextTokens() {
      CurrentLine = reader.ReadLine();
      tokens.Clear();
      position = 0;
      foreach (string part in CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
        tokens.Add(MakeToken(part));
      }
      tokens.Add(NewlineToken);
      CurrentLineNumber++;
    }

    // Tries to read the text as int and then as double using the given number format.
    // On success the token's type and numeric value are set.
    private static bool TryParseNumber(string text, NumberFormatInfo format, Token token) {
      if (int.TryParse(text, NumberStyles.Integer, format, out token.intValue)) {
        token.type = TokenTypeEnum.Int;
        return true;
      }
      if (double.TryParse(text, NumberStyles.Float, format, out token.doubleValue)) {
        token.type = TokenTypeEnum.Double;
        return true;
      }
      return false;
    }

    private Token MakeToken(string strToken) {
      if (strToken == "@")
        return AtToken;
      if (strToken == "=")
        return AssignmentToken;

      Token token = new Token(TokenTypeEnum.String, strToken);
      // try the invariant, then the german, then the current culture; the first match wins.
      // When nothing matches the token stays a String token.
      if (TryParseNumber(strToken, CultureInfo.InvariantCulture.NumberFormat, token) ||
          TryParseNumber(strToken, CultureInfo.GetCultureInfo("de-DE").NumberFormat, token) ||
          TryParseNumber(strToken, CultureInfo.CurrentCulture.NumberFormat, token)) {
        return token;
      }
      return token;
    }

    /// <summary>Returns the next token without consuming it.</summary>
    public Token Peek() {
      EnsureTokens();
      return tokens[position];
    }

    /// <summary>Returns the next token and consumes it.</summary>
    public Token Next() {
      EnsureTokens();
      Token next = tokens[position];
      position++;
      return next;
    }

    /// <summary>True while unconsumed tokens or unread lines remain.</summary>
    public bool HasNext() {
      return position < tokens.Count || !reader.EndOfStream;
    }
  }
  #endregion

  #region parsing
  private void Parse(bool strict) {
    ParseMetaData(strict);
    ParseSampleData(strict);
  }

  // Reads all remaining tokens as sample rows; each NewlineToken closes one row.
  private void ParseSampleData(bool strict) {
    List<double> row = new List<double>();
    while (tokenizer.HasNext()) {
      Token current = tokenizer.Next();
      if (current.type == TokenTypeEnum.Double) {
        // just take the value
        row.Add(current.doubleValue);
      } else if (current.type == TokenTypeEnum.Int) {
        // translate the int value to double
        row.Add((double)current.intValue);
      } else if (current == Tokenizer.NewlineToken) {
        // blank lines in front of the first data row carry no data; accepting one as the
        // first row would define a dataset with zero columns
        if (samplesList.Count == 0 && row.Count == 0) continue;

        AdjustRowLength(row, strict);

        // add the current row to the collection of rows and start a new row
        samplesList.Add(row);
        row = new List<double>();
      } else {
        // found an unexpected token => fail when parsing strictly
        // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
        if (strict) {
          Error("Unkown value " + current + " in line " + tokenizer.CurrentLineNumber +
            "\n" + tokenizer.CurrentLine);
        } else {
          row.Add(double.NaN);
        }
      }
    }
  }

  // Makes the row as long as the first row of the dataset. When parsing strictly a
  // different length is an error; otherwise the row is padded with NaN or truncated.
  private void AdjustRowLength(List<double> row, bool strict) {
    // the first row defines how many values are needed
    if (samplesList.Count == 0) return;
    int expected = samplesList[0].Count;
    if (row.Count == expected) return;

    if (strict) {
      Error("The first row of the dataset has " + expected + " columns." +
        "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.");
    } else if (row.Count < expected) {
      // fill with NAN
      while (row.Count < expected) {
        row.Add(double.NaN);
      }
    } else {
      // drop the surplus values at the end, i.e. everything from index 'expected' onwards
      row.RemoveRange(expected, row.Count - expected);
    }
  }

  // Reads the leading lines of the form "@ NAME = value ..." into the metadata dictionary.
  // Stops at the first line that does not start with "@" or at the end of the input.
  private void ParseMetaData(bool strict) {
    while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
      Expect(Tokenizer.AtToken);

      Token nameToken = tokenizer.Next();
      if (nameToken.type != TokenTypeEnum.String)
        Error("Expected a variable name; got " + nameToken +
          "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);

      Expect(Tokenizer.AssignmentToken);

      // every line ends with NewlineToken, so this loop cannot run past the end of the input
      List<Token> tokens = new List<Token>();
      Token valueToken = tokenizer.Next();
      while (valueToken != Tokenizer.NewlineToken) {
        tokens.Add(valueToken);
        valueToken = tokenizer.Next();
      }

      metadata[nameToken.stringValue] = tokens;
    }
  }

  // Consumes the next token and fails when it is not the expected one.
  private void Expect(Token expectedToken) {
    Token actualToken = tokenizer.Next();
    if (actualToken != expectedToken) {
      Error("Expected: " + expectedToken + " got: " + actualToken +
        "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
    }
  }

  // Single exit point for all format violations.
  private void Error(string message) {
    throw new Exception("Error while parsing.\n" + message);
  }
  #endregion
}
341	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences