Context Navigation

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 6742

Visit:

Last change on this file since 6742 was 6742, checked in by gkronber, 13 years ago
#1597 improved handling of empty rows.
File size: 16.2 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections;
24	using System.Collections.Generic;
25	using System.Globalization;
26	using System.IO;
27	using System.Linq;
28	using System.Runtime.Serialization;
29	using System.Text;
30
31	namespace HeuristicLab.Problems.DataAnalysis {
32	public class TableFileParser {
// Number of characters sampled (after the first line) when guessing the file format.
private const int BUFFER_SIZE = 1024;

// Column separator candidates; a blank is only used when none of these is frequent enough.
private readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t' };

// Token source of the file that is currently being parsed.
private Tokenizer tokenizer;

// Parsed data rows; every row holds one boxed cell value (double, DateTime or string) per column.
private List<List<object>> rowValues;
37
// Backing field of Rows.
private int rows;

/// <summary>
/// Number of data rows. Parse(string) sets it to the number of rows that were read.
/// </summary>
public int Rows {
  get { return this.rows; }
  set { this.rows = value; }
}
43
// Backing field of Columns.
private int columns;

/// <summary>
/// Number of columns. Parse(string) sets it to the number of values in the first data row.
/// </summary>
public int Columns {
  get { return this.columns; }
  set { this.columns = value; }
}
49
// Backing field of Values; stays null until Parse(string) has built the column lists.
private List<IList> values;

/// <summary>
/// The parsed data in column-major form: one list per column.
/// </summary>
public List<IList> Values {
  get { return this.values; }
}
56
// Names read from the header line; empty when the file has no header.
private List<string> variableNames;

/// <summary>
/// The column names. When no names were read from the file, generic names
/// "X000", "X001", ... are generated, one per column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) return variableNames;

    // no header available => fall back to generated names
    string[] generated = new string[columns];
    for (int index = 0; index < generated.Length; index++)
      generated[index] = "X" + index.ToString("000");
    return generated;
  }
}
70
/// <summary>
/// Creates a parser that holds no rows and no variable names yet.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<object>>();
}
75
/// <summary>
/// Reads the table stored in <paramref name="fileName"/>. The number format, date format and
/// column separator are guessed from the file content. Afterwards Rows, Columns, Values and
/// VariableNames describe the parsed data.
/// </summary>
/// <param name="fileName">Path of the text file to read.</param>
/// <remarks>
/// Every column becomes a list of double, DateTime or string. The element type is the most
/// frequent type among the first ten non-empty cells of the column. Cells that are empty or do
/// not match the column type are stored as double.NaN, DateTime.MinValue or (for string columns)
/// their invariant-culture text.
/// </remarks>
public void Parse(string fileName) {
  // start from a clean state so that rows and names of a previously parsed file do not leak in
  rowValues.Clear();
  variableNames = new List<string>();

  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  using (StreamReader reader = new StreamReader(fileName)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // parse the file
    Parse();
  }

  // translate the list of rows into a list of typed columns
  // (Parse() has thrown already if no row was read, so rowValues[0] exists)
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new List<IList>();

  // create columns
  for (int col = 0; col < columns; col++) {
    // materialize immediately: the query is inspected more than once and captures the loop variable
    List<Type> types = rowValues
      .Select(r => r[col])
      .Where(v => v != null && v as string != string.Empty)
      .Take(10)
      .Select(v => v.GetType())
      .ToList();
    if (types.Count == 0) {
      // the column holds no usable value at all
      values.Add(new List<string>());
      continue;
    }

    // majority vote; ordering by group size (the groups themselves are not comparable)
    Type columnType = types.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
    if (columnType == typeof(double)) values.Add(new List<double>());
    else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
    else if (columnType == typeof(string)) values.Add(new List<string>());
    else throw new InvalidOperationException();
  }

  // fill with values
  foreach (List<object> row in rowValues) {
    for (int columnIndex = 0; columnIndex < row.Count; columnIndex++) {
      object element = row[columnIndex];
      IList column = values[columnIndex];

      // missing values (empty strings) and values of a foreign type are replaced by the
      // default of the column type; adding them unchanged would make the typed list throw
      if (column is List<double>) {
        if (element is double) column.Add(element);
        else column.Add(double.NaN);
      } else if (column is List<DateTime>) {
        if (element is DateTime) column.Add(element);
        else column.Add(DateTime.MinValue);
      } else if (column is List<string>) {
        if (element == null || element is string) column.Add(element);
        else column.Add(Convert.ToString(element, CultureInfo.InvariantCulture));
      } else {
        throw new InvalidOperationException();
      }
    }
  }
}
124
/// <summary>
/// Guesses number format, date format and column separator of a file by counting characters
/// in a block of at most BUFFER_SIZE characters that follows the first line.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <param name="numberFormat">Receives the invariant or the de-DE number format.</param>
/// <param name="dateTimeFormatInfo">Receives the date format that belongs to the number format.</param>
/// <param name="separator">Receives the column separator; ' ' if no other candidate qualifies.</param>
private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  char[] sample = new char[BUFFER_SIZE];
  int sampleLength;
  using (StreamReader reader = new StreamReader(fileName)) {
    // skip first line
    reader.ReadLine();
    // read a block
    sampleLength = reader.ReadBlock(sample, 0, BUFFER_SIZE);
  }

  // count frequency of all characters in the block
  Dictionary<char, int> charCounts = sample.Take(sampleLength)
    .GroupBy(c => c)
    .ToDictionary(g => g.Key, g => g.Count());

  // picks the most frequent candidate that occurs more than ten times;
  // ' ' is only used as separator if no other separator is possible
  // (spaces can also occur additionally to separators)
  Func<IEnumerable<char>, char> mostFrequentOf = candidates => candidates
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderBy(c => -OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();

  // Depending on the characters occurring in the block we distinguish the following cases:
  //   many points                => English number format, the other frequent char is the separator
  //   no points, no commas       => English number format (only integer numbers)
  //   no points, but many commas => the problematic case: either German format (real numbers)
  //                                 or English format (only integer numbers) with ',' as separator
  //                                   English: 0, 0, 0, 0
  //                                   German:  0,0 0,0 0,0 ...
  //                                 => if commas are not followed by a digit => English format
  bool manyPoints = OccurrencesOf(charCounts, '.') > 10;
  bool manyCommas = OccurrencesOf(charCounts, ',') > 10;

  if (manyPoints || !manyCommas) {
    numberFormat = NumberFormatInfo.InvariantInfo;
    dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    separator = mostFrequentOf(POSSIBLE_SEPARATORS);
    return;
  }

  // no points and many commas: count the commas that are not directly followed by a digit
  int commaNonDigitPairs = Enumerable.Range(0, Math.Max(0, sampleLength - 1))
    .Count(i => sample[i] == ',' && !Char.IsDigit(sample[i + 1]));

  if (commaNonDigitPairs > 10) {
    // English format (only integer values) with ',' as separator
    numberFormat = NumberFormatInfo.InvariantInfo;
    dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    separator = ',';
    return;
  }

  // German format (real values); ',' is the decimal mark and therefore no separator candidate
  char[] disallowedSeparators = { ',' };
  numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
  dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
  separator = mostFrequentOf(POSSIBLE_SEPARATORS.Except(disallowedSeparators));
}
192
/// <summary>
/// Looks up how often <paramref name="c"/> was counted.
/// </summary>
/// <param name="charCounts">Character frequencies.</param>
/// <param name="c">Character to look up.</param>
/// <returns>The stored count, or 0 if the character has no entry.</returns>
private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  int count;
  if (charCounts.TryGetValue(c, out count)) return count;
  return 0;
}
196
197	#region tokenizer
/// <summary>
/// Kinds of tokens produced by the tokenizer.
/// </summary>
internal enum TokenTypeEnum {
  // end of an input line
  NewLine,
  // column separator
  Separator,
  // cell text that is neither a number nor a date
  String,
  // cell text that was parsed as a double
  Double,
  // cell text that was parsed as a DateTime
  DateTime
}
201
/// <summary>
/// A single token: its kind, its text, and the parsed value for numeric and date tokens.
/// The members are plain fields so that they can be filled through TryParse out arguments.
/// </summary>
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue;
  public DateTime dateTimeValue;

  /// <summary>
  /// Creates a token of the given kind and text; the numeric and date values start out
  /// as 0.0 and DateTime.MinValue.
  /// </summary>
  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    this.stringValue = value;
    this.doubleValue = 0.0;
    this.dateTimeValue = DateTime.MinValue;
  }

  /// <summary>Returns the text of the token.</summary>
  public override string ToString() {
    return this.stringValue;
  }
}
219
220
/// <summary>
/// Splits a text stream line by line into value, separator and newline tokens.
/// One line is buffered at a time; the next line is read as soon as the buffer runs empty.
/// </summary>
internal class Tokenizer {
  // text of the shared separator token (only visible in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private readonly StreamReader reader;
  // buffered tokens in reading order; a queue makes taking the first token O(1)
  private readonly Queue<Token> tokens;
  private readonly NumberFormatInfo numberFormatInfo;
  private readonly DateTimeFormatInfo dateTimeFormatInfo;
  private readonly char separator;

  private int currentLineNumber = 0;
  /// <summary>Number of lines read from the stream so far.</summary>
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  /// <summary>Text of the line that was read most recently.</summary>
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  /// <summary>The single token instance that marks every end of line (compare by reference).</summary>
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  /// <summary>The single token instance that marks every column separator (compare by reference).</summary>
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  /// <summary>
  /// Creates a tokenizer and immediately buffers the tokens of the first line.
  /// </summary>
  /// <param name="reader">Source of the text; it is not disposed by the tokenizer.</param>
  /// <param name="numberFormatInfo">Format used to recognize numbers.</param>
  /// <param name="dateTimeFormatInfo">Format used to recognize dates.</param>
  /// <param name="separator">Character that separates the columns.</param>
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
    ReadNextTokens();
  }

  // Reads one more line (if there is one) and appends its tokens followed by the newline token.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    foreach (string part in Split(CurrentLine)) {
      if (part == null) {
        // null is the marker for a separator (see Split)
        tokens.Enqueue(SeparatorToken);
        continue;
      }
      string trimmed = part.Trim();
      // empty cells produce no token; the parser detects them by two adjacent structural tokens
      if (trimmed.Length > 0) tokens.Enqueue(MakeToken(trimmed));
    }
    tokens.Enqueue(NewlineToken);
    CurrentLineNumber++;
  }

  // Cuts a line at the separator character. Yields the text between separators and null for
  // every separator, so that no cell text can ever be mistaken for a separator.
  private IEnumerable<string> Split(string line) {
    StringBuilder subStr = new StringBuilder();
    foreach (char c in line) {
      if (c == separator) {
        yield return subStr.ToString();
        subStr = new StringBuilder();
        yield return null;
      } else {
        subStr.Append(c);
      }
    }
    yield return subStr.ToString();
  }

  // Classifies a non-empty cell text as double, date or plain string (tried in this order).
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
      // the detected date format is used instead of the culture of the current thread
      token.type = TokenTypeEnum.DateTime;
    }
    // neither a number nor a date => the token stays a string token
    return token;
  }

  /// <summary>Returns the next token without consuming it. Must only be called if HasNext() is true.</summary>
  public Token Peek() {
    return tokens.Peek();
  }

  /// <summary>
  /// Consumes and returns the next token. When the buffer runs empty the following line is read
  /// right away, which also advances CurrentLineNumber.
  /// </summary>
  public Token Next() {
    Token next = tokens.Dequeue();
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  /// <summary>True while tokens are buffered or the stream has more text.</summary>
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
324	#endregion
325
326	#region parsing
// Parses the optional header line and then all data rows from the current tokenizer.
// Raises a parse error if nothing follows the header or if no data row could be read.
private void Parse() {
  const string noDataMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  if (!tokenizer.HasNext()) {
    Error(noDataMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noDataMessage, "", tokenizer.CurrentLineNumber);
  }
}
333
// Reads all remaining lines as data rows and appends them to rowValues. Blank lines are
// skipped. Raises a parse error if a row has a different number of values than the first row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    if (tokenizer.Peek() == tokenizer.NewlineToken) {
      // blank line
      tokenizer.Next();
      continue;
    }

    // Remember the line of this row now: consuming its newline token makes the tokenizer
    // read ahead, so CurrentLineNumber would already point at the following line afterwards.
    int rowLineNumber = tokenizer.CurrentLineNumber;

    List<object> row = new List<object>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);

    // all rows have to have the same number of values
    // the first row defines how many values are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "",
            rowLineNumber);
    }
    rowValues.Add(row);
  }
}
358
// Returns the value of the next cell. An empty cell (the next token is a separator or a
// newline) yields string.Empty and consumes nothing; otherwise one token is consumed and
// its value is returned according to the token type.
private object NextValue(Tokenizer tokenizer) {
  bool cellIsEmpty = tokenizer.Peek() == tokenizer.SeparatorToken
                     || tokenizer.Peek() == tokenizer.NewlineToken;
  if (cellIsEmpty) return string.Empty;

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
      return double.NaN;
    case TokenTypeEnum.String:
      return current.stringValue;
    case TokenTypeEnum.Double:
      return current.doubleValue;
    case TokenTypeEnum.DateTime:
      return current.dateTimeValue;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
376
// Reads the header line into variableNames. Nothing is read if the input is empty or if the
// first token is a number (then the file has no header).
private void ParseVariableNames() {
  // an empty input has no token to look at; Parse() reports the missing data afterwards
  if (!tokenizer.HasNext()) return;

  // if first token is double no variable names are given
  if (tokenizer.Peek().type == TokenTypeEnum.Double) return;

  // the first line must contain variable names
  List<Token> tokens = new List<Token>();
  Token valueToken = tokenizer.Next();
  tokens.Add(valueToken);
  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    valueToken = tokenizer.Next();
    // a separator at the end of the line is directly followed by the newline token
    if (valueToken != tokenizer.NewlineToken) {
      tokens.Add(valueToken);
    }
  }
  // consume the end of the header line unless that has happened in the loop already
  if (valueToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
}
398
// Consumes the next token and raises a parse error unless it is the given token instance.
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) return;

  Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
405
// Aborts parsing: always throws a DataFormatException built from the message, the offending
// token text and the line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
409	#endregion
410
/// <summary>
/// Raised when the file content does not have the expected format. Carries the offending
/// token text and the line number in addition to the message.
/// </summary>
[Serializable]
private class DataFormatException : Exception {
  private readonly int line;
  /// <summary>Line number reported with the error.</summary>
  public int Line {
    get { return line; }
  }
  private readonly string token;
  /// <summary>Text of the token reported with the error.</summary>
  public string Token {
    get { return token; }
  }

  /// <summary>
  /// Creates the exception; token and line are appended to the message text.
  /// </summary>
  public DataFormatException(string message, string token, int line)
    : base(message + "\nToken: " + token + " (line: " + line + ")") {
    this.token = token;
    this.line = line;
  }

  /// <summary>
  /// Deserialization constructor; restores Line and Token as written by GetObjectData.
  /// </summary>
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
    line = info.GetInt32("Line");
    token = info.GetString("Token");
  }

  /// <summary>
  /// Stores Line and Token next to the data of the base class so that they survive serialization.
  /// </summary>
  public override void GetObjectData(SerializationInfo info, StreamingContext context) {
    base.GetObjectData(info, context);
    info.AddValue("Line", line);
    info.AddValue("Token", token);
  }
}
429	}
430	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences