Context Navigation

source: branches/DataAnalysis SolutionEnsembles/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 5815

Visit:

Last change on this file since 5815 was 5809, checked in by mkommend, 14 years ago
#1418: Reintegrated branch into trunk.
File size: 14.1 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using System.Linq;
27	using System.Runtime.Serialization;
28	using System.Text;
29
30	namespace HeuristicLab.Problems.DataAnalysis {
31	public class TableFileParser {
// number of characters sampled from the file when guessing its format
private const int BUFFER_SIZE = 1024;

// characters that are considered as column separators (a blank is used as fallback)
private readonly char[] POSSIBLE_SEPARATORS = new[] { ',', ';', '\t' };

// tokenizer for the file that is currently being parsed
private Tokenizer tokenizer;

// parsed values, one inner list per data row
private List<List<double>> rowValues;
36
// backing field of Rows; Parse(string) assigns the number of parsed rows
private int rows;

/// <summary>
/// Gets or sets the number of rows of the parsed table.
/// </summary>
public int Rows {
  get {
    return rows;
  }
  set {
    rows = value;
  }
}
42
// backing field of Columns; Parse(string) assigns the length of the first parsed row
private int columns;

/// <summary>
/// Gets or sets the number of columns of the parsed table.
/// </summary>
public int Columns {
  get {
    return columns;
  }
  set {
    columns = value;
  }
}
48
// backing field of Values; allocated and filled at the end of Parse(string)
private double[,] values;

/// <summary>
/// Gets the parsed values as a [row, column] matrix.
/// The matrix is null until Parse(string) has completed.
/// </summary>
public double[,] Values {
  get { return values; }
}
55
// names taken from the header line of the file; empty if the file has no header
private List<string> variableNames;

/// <summary>
/// Gets the variable names of the parsed table. If no names have been parsed,
/// generic names "X000", "X001", ... are generated, one per column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count == 0) {
      // no header was found => make up one name per column
      string[] generated = new string[columns];
      int index = 0;
      while (index < generated.Length) {
        generated[index] = "X" + index.ToString("000");
        index++;
      }
      return generated;
    }
    return variableNames;
  }
}
69
/// <summary>
/// Creates a parser that holds no rows and no variable names yet.
/// </summary>
public TableFileParser() {
  variableNames = new List<string>();
  rowValues = new List<List<double>>();
}
74
/// <summary>
/// Parses the given table file. The number format and the column separator are
/// detected automatically. Afterwards the parsed data is available via
/// Rows, Columns, Values and VariableNames.
/// </summary>
/// <param name="fileName">Path of the file to parse.</param>
public void Parse(string fileName) {
  // start from a clean state so that a parser instance can be reused for several files;
  // otherwise rows (and header names) of a previously parsed file would leak into the result
  rowValues = new List<List<double>>();
  variableNames = new List<string>();

  NumberFormatInfo numberFormat;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out separator);
  using (StreamReader reader = new StreamReader(fileName)) {
    tokenizer = new Tokenizer(reader, numberFormat, separator);
    // parse the file
    Parse();
  }

  // translate the list of rows into a matrix;
  // Parse() has thrown if no row was found, and ParseValues() has checked that
  // all rows have as many values as the first row
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new double[rows, columns];
  for (int rowIndex = 0; rowIndex < rows; rowIndex++) {
    List<double> row = rowValues[rowIndex];
    for (int columnIndex = 0; columnIndex < columns; columnIndex++) {
      values[rowIndex, columnIndex] = row[columnIndex];
    }
  }
}
100
// a character has to occur more often than this in the sampled block to be
// considered a decimal point or a separator
private const int FREQUENT_CHAR_THRESHOLD = 10;

/// <summary>
/// Guesses the number format and the column separator of a file by counting
/// characters in a block of up to BUFFER_SIZE characters read after the first line.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <param name="numberFormat">Receives the detected number format (invariant or de-DE).</param>
/// <param name="separator">Receives the detected column separator.</param>
private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
  using (StreamReader reader = new StreamReader(fileName)) {
    // skip first line
    reader.ReadLine();
    // read a block
    char[] buffer = new char[BUFFER_SIZE];
    int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
    // count frequency of special characters
    Dictionary<char, int> charCounts = buffer.Take(charsRead)
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());

    // depending on the characters occurring in the block
    // we distinguish a number of different cases based on the following rules:
    // many points => it must be English number format, the other frequently occurring char is the separator
    // no points but many commas => this is the problematic case. Either German format (real numbers)
    //                              or English format (only integer numbers) with ',' as separator
    //   => check the line in more detail:
    //            English: 0, 0, 0, 0
    //            German:  0,0 0,0 0,0 ...
    //            => if commas are followed by a non-digit (e.g. a space) => English format
    // no points no commas => English format (only integer numbers), use the other frequently occurring char as separator
    // in all cases only treat ' ' as separator if no other separator is possible
    // (spaces can also occur additionally to separators)
    if (OccurrencesOf(charCounts, '.') > FREQUENT_CHAR_THRESHOLD) {
      numberFormat = NumberFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    } else if (OccurrencesOf(charCounts, ',') > FREQUENT_CHAR_THRESHOLD) {
      // no points and many commas
      int countCommaNonDigitPairs = 0;
      for (int i = 0; i < charsRead - 1; i++) {
        if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
          countCommaNonDigitPairs++;
        }
      }
      if (countCommaNonDigitPairs > FREQUENT_CHAR_THRESHOLD) {
        // English format (only integer values) with ',' as separator
        numberFormat = NumberFormatInfo.InvariantInfo;
        separator = ',';
      } else {
        // German format (real values); ',' is the decimal mark and cannot be the separator
        char[] disallowedSeparators = new char[] { ',' };
        numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(disallowedSeparators));
      }
    } else {
      // no points and no commas => English format
      numberFormat = NumberFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    }
  }
}

// Returns the candidate that occurs most often (and more often than the threshold)
// in the sampled block, or ' ' if no candidate is frequent enough.
// The sort is stable, so ties are resolved in favor of the earlier candidate.
private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > FREQUENT_CHAR_THRESHOLD)
    .OrderByDescending(c => OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
164
/// <summary>
/// Returns how often the character <paramref name="c"/> has been counted,
/// or 0 if it does not occur in <paramref name="charCounts"/>.
/// </summary>
private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  // single lookup instead of ContainsKey followed by the indexer
  int count;
  return charCounts.TryGetValue(c, out count) ? count : 0;
}
168
169	#region tokenizer
/// <summary>
/// Kinds of tokens produced by the tokenizer.
/// </summary>
internal enum TokenTypeEnum {
  // end of a line of the file
  NewLine,
  // column separator
  Separator,
  // text that could not be parsed as a number
  String,
  // text that has been parsed as a double value
  Double
}
173
/// <summary>
/// A single token of the file: its type, its text and, for tokens of type Double,
/// the parsed value.
/// </summary>
internal class Token {
  // the members are plain fields because the tokenizer passes doubleValue as an out argument
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue;

  /// <summary>
  /// Creates a token with the given type and text; doubleValue starts at 0.0.
  /// </summary>
  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    this.stringValue = value;
    this.doubleValue = 0.0;
  }

  /// <summary>
  /// Returns the text of the token.
  /// </summary>
  public override string ToString() {
    return this.stringValue;
  }
}
189
190
/// <summary>
/// Splits the lines of a reader into tokens. The tokens of one line are buffered;
/// the next line is read as soon as the buffer runs empty.
/// </summary>
internal class Tokenizer {
  // text of the shared separator token; every separator character of the file is mapped to it
  private const string INTERNAL_SEPARATOR = "#";

  private StreamReader reader;
  // buffered tokens of the current line; a queue gives O(1) removal at the front
  private Queue<Token> tokens;
  private NumberFormatInfo numberFormatInfo;
  private char separator;

  // number of lines read so far, i.e. the 1-based number of the buffered line
  private int currentLineNumber = 0;
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }

  // text of the line that has been read last
  private string currentLine;
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  // the single token instance that marks the end of every line (compared by reference)
  private Token newlineToken;
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }

  // the single token instance that stands for every separator (compared by reference)
  private Token separatorToken;
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  /// <summary>
  /// Creates a tokenizer and immediately buffers the tokens of the first line.
  /// </summary>
  /// <param name="reader">Reader that provides the lines.</param>
  /// <param name="numberFormatInfo">Number format used to parse double values.</param>
  /// <param name="separator">Character that separates the columns.</param>
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
    ReadNextTokens();
  }

  // Reads the next line (if there is one) and appends its tokens followed by the newline token.
  private void ReadNextTokens() {
    if (!reader.EndOfStream) {
      CurrentLine = reader.ReadLine();
      foreach (string str in Split(CurrentLine)) {
        string trimmedStr = str.Trim();
        // empty parts (e.g. between two separators) do not produce a token
        if (!string.IsNullOrEmpty(trimmedStr)) {
          tokens.Enqueue(MakeToken(trimmedStr));
        }
      }
      tokens.Enqueue(NewlineToken);
      CurrentLineNumber++;
    }
  }

  // Cuts the line at every separator character; each separator is reported as INTERNAL_SEPARATOR.
  private IEnumerable<string> Split(string line) {
    StringBuilder subStr = new StringBuilder();
    foreach (char c in line) {
      if (c == separator) {
        yield return subStr.ToString();
        subStr = new StringBuilder();
        // all separator characters are transformed to the internally used separator character
        yield return INTERNAL_SEPARATOR;
      } else {
        subStr.Append(c);
      }
    }
    yield return subStr.ToString();
  }

  // Maps a trimmed, non-empty string to the separator token, a double token or a string token.
  private Token MakeToken(string strToken) {
    if (strToken.Equals(INTERNAL_SEPARATOR)) {
      return SeparatorToken;
    }

    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    }
    // if the text is not a number the token stays a string token
    return token;
  }

  /// <summary>
  /// Returns the next token without consuming it.
  /// Must only be called if HasNext() is true; an empty buffer raises an InvalidOperationException.
  /// </summary>
  public Token Peek() {
    return tokens.Peek();
  }

  /// <summary>
  /// Consumes and returns the next token. If this empties the buffer the next line is read.
  /// Must only be called if HasNext() is true; an empty buffer raises an InvalidOperationException.
  /// </summary>
  public Token Next() {
    Token next = tokens.Dequeue();
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  /// <summary>
  /// Returns true if there are buffered tokens or the reader has more data.
  /// </summary>
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
289	#endregion
290
291	#region parsing
// Parses the optional header line and then all data rows.
// Throws (via Error) if there is nothing left after the header or if no row could be parsed.
private void Parse() {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  if (!tokenizer.HasNext()) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
298
// Parses all remaining lines as data rows and appends them to rowValues.
// Throws (via Error) if a row has a different number of values than the first row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    // remember the line of this row now: consuming the newline token below makes the
    // tokenizer read the following line, which already advances CurrentLineNumber
    int rowLineNumber = tokenizer.CurrentLineNumber;

    List<double> row = new List<double>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);
    // all rows have to have the same number of values
    // the first row defines how many samples are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "", rowLineNumber);
    }
    // add the current row to the collection of rows
    rowValues.Add(row);
  }
}
319
// Returns the value of the next cell.
// A missing cell (the next token is a separator or the end of the line) yields NaN and
// consumes nothing; a cell that is not a number is consumed and yields NaN as well.
private double NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) {
    return double.NaN;
  }

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Double:
      // just take the value
      return current.doubleValue;
    case TokenTypeEnum.Separator:
    case TokenTypeEnum.String:
      return double.NaN;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
334
// Reads the header line, if there is one, and stores its entries in variableNames.
private void ParseVariableNames() {
  // if the first line starts with a double value then we assume that
  // the file has no header line and leave the variable names untouched
  if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) {
    return;
  }

  List<string> names = new List<string>();
  Token lastToken = tokenizer.Next();
  names.Add(lastToken.stringValue.Trim());

  // every separator is followed by either another name or the end of the line
  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    lastToken = tokenizer.Next();
    if (lastToken != tokenizer.NewlineToken) {
      names.Add(lastToken.stringValue.Trim());
    }
  }

  // the end of the line has not been consumed yet unless the line ended with a separator
  if (lastToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = names;
}
357
// Consumes the next token and throws (via Error) if it is not the given token instance.
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) {
    return;
  }
  Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
364
// Always throws a DataFormatException that carries the message, the offending token and the line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
368	#endregion
369
/// <summary>
/// Exception that is thrown for all parse errors. The message contains the
/// offending token and the line number.
/// </summary>
[Serializable]
private class DataFormatException : Exception {
  private string token;
  private int line;

  /// <summary>
  /// Creates the exception; token and line are appended to the message.
  /// </summary>
  public DataFormatException(string message, string token, int line)
    : base(message + "\nToken: " + token + " (line: " + line + ")") {
    this.line = line;
    this.token = token;
  }

  /// <summary>
  /// Deserialization constructor. Only the state of the base class is restored;
  /// Token and Line keep their default values.
  /// </summary>
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
  }

  /// <summary>
  /// Gets the text of the token that caused the error.
  /// </summary>
  public string Token {
    get { return token; }
  }

  /// <summary>
  /// Gets the line number that was reported with the error.
  /// </summary>
  public int Line {
    get { return line; }
  }
}
388	}
389	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences