Context Navigation

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs @ 4068

Visit:

Last change on this file since 4068 was 4068, checked in by swagner, 14 years ago
Sorted usings and removed unused usings in entire solution (#1094)
File size: 10.5 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using System.Linq;
27	using System.Text;
28
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Parses a separator-delimited text file of numeric values into a rectangular
  /// <c>double[,]</c> matrix, with an optional first line of variable names.
  /// </summary>
  /// <remarks>
  /// The file is tokenized with ';' as separator and numbers are read with the invariant
  /// culture ('.' as decimal separator). Empty fields and fields that are not numbers are
  /// stored as <see cref="double.NaN"/>. All data rows must have the same number of fields.
  /// </remarks>
  public class CsvFileParser {
    private const string VARIABLENAMES = "VARIABLENAMES";
    private Tokenizer tokenizer;
    private List<string> variableNames;
    private List<List<double>> rowValues;

    private int rows;
    /// <summary>Number of data rows; overwritten by every call to <see cref="Parse(string)"/>.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>Number of values per row; overwritten by every call to <see cref="Parse(string)"/>.</summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[,] values;
    /// <summary>The parsed matrix indexed as [row, column]; <c>null</c> until a file has been parsed.</summary>
    public double[,] Values {
      get { return values; }
    }

    /// <summary>
    /// The names read from the header line, or generated names "X000", "X001", ...
    /// (one per column) when the file had no header line.
    /// </summary>
    public IEnumerable<string> VariableNames {
      get {
        if (variableNames.Count > 0) return variableNames;

        string[] names = new string[columns];
        for (int i = 0; i < names.Length; i++) {
          names[i] = "X" + i.ToString("000");
        }
        return names;
      }
    }

    public CsvFileParser() {
      rowValues = new List<List<double>>();
      variableNames = new List<string>();
    }

    /// <summary>Discards everything collected by a previous (possibly failed) parsing attempt.</summary>
    private void Reset() {
      // allocate new lists instead of clearing the old ones: the name list may have been
      // handed out through VariableNames and must not change under the caller
      variableNames = new List<string>();
      rowValues = new List<List<double>>();
    }

    /// <summary>
    /// Reads the given file and publishes its contents through <see cref="Values"/>,
    /// <see cref="Rows"/>, <see cref="Columns"/> and <see cref="VariableNames"/>.
    /// Results of earlier calls on the same instance are replaced.
    /// </summary>
    /// <param name="fileName">Path of the file to read.</param>
    /// <exception cref="DataFormatException">The file content could not be parsed.</exception>
    public void Parse(string fileName) {
      TryParse(fileName);

      // copy the list of rows into a rectangular array
      rows = rowValues.Count;
      columns = rowValues[0].Count;
      values = new double[rows, columns];
      for (int r = 0; r < rows; r++) {
        List<double> row = rowValues[r];
        for (int c = 0; c < row.Count; c++) {
          values[r, c] = row[c];
        }
      }
    }

    /// <summary>
    /// Tries to parse the file with each supported number format and returns on the first success.
    /// </summary>
    private void TryParse(string fileName) {
      Exception lastEx = null;
      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.InvariantCulture.NumberFormat };
      foreach (NumberFormatInfo numberFormat in possibleFormats) {
        // every attempt starts from a clean slate, otherwise rows of a previous file
        // (or of a failed attempt) would be mixed into the new result
        Reset();
        using (StreamReader reader = new StreamReader(fileName)) {
          tokenizer = new Tokenizer(reader, numberFormat);
          try {
            Parse();
            return; // parsed without errors
          }
          catch (DataFormatException ex) {
            lastEx = ex;
          }
        }
      }
      // all number formats threw an exception -> rethrow the last exception
      throw lastEx;
    }

    #region tokenizer
    internal enum TokenTypeEnum {
      NewLine, Separator, String, Double
    }

    /// <summary>A single lexical element of the input: a field value, a separator or a line end.</summary>
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
      }

      public override string ToString() {
        return stringValue;
      }
    }


    /// <summary>
    /// Splits the input line by line into tokens. Only the tokens of one line are buffered;
    /// the next line is read as soon as the buffer runs empty.
    /// </summary>
    internal class Tokenizer {
      private readonly StreamReader reader;
      private readonly Queue<Token> tokens;
      private readonly NumberFormatInfo numberFormatInfo;
      private readonly char separator;

      /// <summary>1-based number of the line whose tokens are currently buffered (0 if nothing was read).</summary>
      public int CurrentLineNumber { get; private set; }

      /// <summary>Text of the line that was read last.</summary>
      public string CurrentLine { get; private set; }

      /// <summary>The shared token instance that marks every line end; compare by reference.</summary>
      public Token NewlineToken { get; private set; }

      /// <summary>The shared token instance that marks every separator; compare by reference.</summary>
      public Token SeparatorToken { get; private set; }

      /// <summary>Creates a tokenizer and immediately buffers the first line of <paramref name="reader"/>.</summary>
      /// <param name="reader">Source of the text; not disposed by the tokenizer.</param>
      /// <param name="numberFormatInfo">Format used to recognize numeric fields.</param>
      /// <param name="separator">Character that delimits the fields of a line.</param>
      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        this.separator = separator;
        SeparatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
        NewlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
        tokens = new Queue<Token>();
        ReadNextTokens();
      }

      /// <summary>Creates a tokenizer that uses ';' as field separator.</summary>
      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
        : this(reader, numberFormatInfo, ';') {
      }

      /// <summary>
      /// Reads one more line (if any) and appends its tokens followed by the newline token.
      /// Fields that are empty after trimming produce no token, so an empty field shows up
      /// as two adjacent separators (or a separator next to the line end).
      /// </summary>
      private void ReadNextTokens() {
        if (reader.EndOfStream) return;

        CurrentLine = reader.ReadLine();
        string[] fields = CurrentLine.Split(separator);
        for (int i = 0; i < fields.Length; i++) {
          // the separator token is emitted directly (not via trimming/MakeToken) so that
          // whitespace separators such as tab are not lost
          if (i > 0) tokens.Enqueue(SeparatorToken);
          string trimmed = fields[i].Trim();
          if (trimmed.Length > 0) tokens.Enqueue(MakeToken(trimmed));
        }
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }

      /// <summary>Turns the text of one field into a Double token if it is a number, otherwise a String token.</summary>
      private Token MakeToken(string strToken) {
        Token token = new Token(TokenTypeEnum.String, strToken);
        if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
        }
        return token;
      }

      /// <summary>Returns the next token without consuming it. Call only when <see cref="HasNext"/> is true.</summary>
      /// <exception cref="InvalidOperationException">No token is available.</exception>
      public Token Peek() {
        return tokens.Peek();
      }

      /// <summary>
      /// Consumes and returns the next token. Consuming the last token of a line already
      /// buffers the following line and therefore advances <see cref="CurrentLineNumber"/>.
      /// </summary>
      /// <exception cref="InvalidOperationException">No token is available.</exception>
      public Token Next() {
        Token next = tokens.Dequeue();
        if (tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    /// <summary>Parses the optional header line and all data rows from the current tokenizer.</summary>
    private void Parse() {
      ParseVariableNames();
      if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
      ParseValues();
      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    }

    /// <summary>
    /// Reads all remaining lines as data rows. The first row defines the number of columns;
    /// any row of different length is reported as an error.
    /// </summary>
    private void ParseValues() {
      while (tokenizer.HasNext()) {
        // remember the line number now: consuming the newline token below already
        // loads the next line and would make the error message point one line too far
        int lineNumber = tokenizer.CurrentLineNumber;

        List<double> row = new List<double>();
        row.Add(NextValue(tokenizer));
        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
          Expect(tokenizer.SeparatorToken);
          row.Add(NextValue(tokenizer));
        }
        Expect(tokenizer.NewlineToken);

        // all rows have to have the same number of values
        // the first row defines how many samples are needed
        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
                "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
        }
        rowValues.Add(row);
      }
    }

    /// <summary>
    /// Reads the value of one field. An empty field (next token is a separator or line end)
    /// yields NaN without consuming anything; a non-numeric field is consumed and yields NaN.
    /// </summary>
    private double NextValue(Tokenizer tokenizer) {
      Token upcoming = tokenizer.Peek();
      if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return double.NaN;

      Token current = tokenizer.Next();
      switch (current.type) {
        case TokenTypeEnum.Double:
          return current.doubleValue;
        case TokenTypeEnum.Separator:
        case TokenTypeEnum.String:
          return double.NaN;
      }
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
    }

    /// <summary>
    /// Reads the first line as variable names if it does not start with a number;
    /// otherwise leaves the input untouched.
    /// </summary>
    private void ParseVariableNames() {
      if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

      List<Token> nameTokens = new List<Token>();
      Token valueToken = tokenizer.Next();
      nameTokens.Add(valueToken);
      // stop as soon as the newline was consumed (header ends with a separator); otherwise
      // a following data line that starts with an empty field would be read as names too
      while (valueToken != tokenizer.NewlineToken
             && tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        valueToken = tokenizer.Next();
        if (valueToken != tokenizer.NewlineToken) {
          nameTokens.Add(valueToken);
        }
      }
      if (valueToken != tokenizer.NewlineToken) {
        Expect(tokenizer.NewlineToken);
      }
      variableNames = nameTokens.Select(x => x.stringValue.Trim()).ToList();
    }

    /// <summary>Consumes the next token and reports an error if it is not <paramref name="expectedToken"/>.</summary>
    private void Expect(Token expectedToken) {
      // Next() may already advance to the following line, so take the line number first
      int lineNumber = tokenizer.CurrentLineNumber;
      Token actualToken = tokenizer.Next();
      if (actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, lineNumber);
      }
    }

    /// <summary>Always throws a <c>DataFormatException</c> describing the problem.</summary>
    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion
  }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences