Context Navigation

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 6776

Visit:

Last change on this file since 6776 was 6776, checked in by gkronber, 13 years ago
#1640 fixed a bug in parsing datetime values and improved code for filling dataset columns
File size: 16.2 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections;
24	using System.Collections.Generic;
25	using System.Globalization;
26	using System.IO;
27	using System.Linq;
28	using System.Runtime.Serialization;
29	using System.Text;
30
31	namespace HeuristicLab.Problems.DataAnalysis {
32	public class TableFileParser {
33	private const int BUFFER_SIZE = 1024;
34	private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
35	private Tokenizer tokenizer;
36	private List<List<object>> rowValues;
37
// backing field; assigned by Parse(string) after a file has been read
private int rows;
/// <summary>
/// Number of data rows of the most recently parsed file.
/// </summary>
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
43
// backing field; assigned by Parse(string) after a file has been read
private int columns;
/// <summary>
/// Number of columns of the most recently parsed file.
/// </summary>
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}
49
// one list per column; created and filled by Parse(string)
private List<IList> values;
/// <summary>
/// Column-wise values of the most recently parsed file.
/// Each entry is a List of double, DateTime or string.
/// </summary>
public List<IList> Values {
  get { return this.values; }
}
56
// names read from the header line; stays empty when the file has no header
private List<string> variableNames;
/// <summary>
/// Names of the variables (columns). When no names were parsed, generic names
/// "X000", "X001", ... are generated, one per column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) {
      return variableNames;
    }

    // no header available => make up one name per column
    string[] generated = new string[columns];
    for (int index = generated.Length - 1; index >= 0; index--) {
      generated[index] = "X" + index.ToString("000");
    }
    return generated;
  }
}
70
/// <summary>
/// Creates a parser that holds neither rows nor variable names yet.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<object>>();
}
75
/// <summary>
/// Parses the given table file. Afterwards Rows, Columns, Values and
/// VariableNames describe the contents of that file.
/// </summary>
/// <param name="fileName">Path of the file to parse.</param>
public void Parse(string fileName) {
  // start from a clean state so that a reused parser instance does not mix
  // rows or variable names of a previously parsed file into the result
  // (new lists instead of Clear() to leave previously returned lists untouched)
  rowValues = new List<List<object>>();
  variableNames = new List<string>();

  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  using (StreamReader reader = new StreamReader(fileName)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // parse the file (throws if no data row could be read)
    Parse();
  }

  // translate the row-wise parse result into typed columns
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new List<IList>(columns);

  // create columns
  for (int col = 0; col < columns; col++) {
    values.Add(CreateColumn(col));
  }

  // fill with values; elements that do not match the column type are
  // replaced by the placeholder of that type
  foreach (List<object> row in rowValues) {
    int columnIndex = 0;
    foreach (object element in row) {
      IList column = values[columnIndex];
      if (column is List<double> && !(element is double))
        column.Add(double.NaN);
      else if (column is List<DateTime> && !(element is DateTime))
        column.Add(DateTime.MinValue);
      else if (column is List<string> && !(element is string))
        column.Add(string.Empty);
      else
        column.Add(element);
      columnIndex++;
    }
  }
}

// Creates the empty, typed list for the given column. The type is the most frequent
// type among the first (up to 10) non-empty values of the column.
private IList CreateColumn(int col) {
  // materialize once; the query would otherwise be re-evaluated for every use
  List<Type> sampleTypes = rowValues
    .Select(r => r[col])
    .Where(v => v != null && v as string != string.Empty)
    .Take(10)
    .Select(v => v.GetType())
    .ToList();
  // columns without any usable value are treated as string columns
  if (sampleTypes.Count == 0) return new List<string>();

  Type columnType = sampleTypes.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
  if (columnType == typeof(double)) return new List<double>();
  else if (columnType == typeof(DateTime)) return new List<DateTime>();
  else if (columnType == typeof(string)) return new List<string>();
  else throw new InvalidOperationException();
}
125
/// <summary>
/// Guesses number format, date format and column separator of a file by counting
/// characters in a block of text that follows the first line.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <param name="numberFormat">Receives the detected number format (invariant or de-DE).</param>
/// <param name="dateTimeFormatInfo">Receives the date format matching the number format.</param>
/// <param name="separator">Receives the detected column separator.</param>
private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  char[] sample = new char[BUFFER_SIZE];
  int sampleLength;
  using (StreamReader reader = new StreamReader(fileName)) {
    // the first line is skipped, the sample is taken from what follows
    reader.ReadLine();
    sampleLength = reader.ReadBlock(sample, 0, BUFFER_SIZE);
  }

  // count how often each character occurs in the sample
  Dictionary<char, int> frequencies = new Dictionary<char, int>();
  for (int i = 0; i < sampleLength; i++) {
    int seenSoFar;
    frequencies.TryGetValue(sample[i], out seenSoFar);
    frequencies[sample[i]] = seenSoFar + 1;
  }

  // picks the candidate that occurs most often (but more than 10 times);
  // ' ' is only used when no other candidate qualifies, because spaces
  // can also occur in addition to the real separator
  Func<IEnumerable<char>, char> dominantSeparator = candidates => {
    char best = ' ';
    int bestCount = 10;
    foreach (char candidate in candidates) {
      int count = OccurrencesOf(frequencies, candidate);
      if (count > bestCount) {
        best = candidate;
        bestCount = count;
      }
    }
    return best;
  };

  bool manyPoints = OccurrencesOf(frequencies, '.') > 10;
  bool manyCommas = OccurrencesOf(frequencies, ',') > 10;

  // many points                => English number format
  // neither points nor commas  => English number format (only integer numbers)
  // in both cases the most frequent candidate is the separator
  if (manyPoints || !manyCommas) {
    numberFormat = NumberFormatInfo.InvariantInfo;
    dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    separator = dominantSeparator(POSSIBLE_SEPARATORS);
    return;
  }

  // no points but many commas is the problematic case:
  //   English (integers, ',' separates columns): 0, 0, 0, 0
  //   German  (',' is the decimal point):        0,0 0,0 0,0
  // => commas that are not directly followed by a digit indicate the English format
  int commasNotFollowedByDigit = 0;
  for (int i = 1; i < sampleLength; i++) {
    if (sample[i - 1] == ',' && !Char.IsDigit(sample[i])) {
      commasNotFollowedByDigit++;
    }
  }

  if (commasNotFollowedByDigit > 10) {
    numberFormat = NumberFormatInfo.InvariantInfo;
    dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    separator = ',';
  } else {
    // German format (real values); ',' cannot be the column separator here
    CultureInfo german = new CultureInfo("de-DE");
    numberFormat = NumberFormatInfo.GetInstance(german);
    dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
    separator = dominantSeparator(POSSIBLE_SEPARATORS.Where(c => c != ','));
  }
}
193
/// <summary>
/// Returns how often <paramref name="c"/> was counted, or 0 when it was never counted.
/// </summary>
private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  int occurrences;
  if (charCounts.TryGetValue(c, out occurrences)) {
    return occurrences;
  }
  return 0;
}
197
198	#region tokenizer
/// <summary>
/// Kinds of tokens produced by the tokenizer.
/// </summary>
internal enum TokenTypeEnum {
  // marks the end of a line
  NewLine,
  // marks the boundary between two cells of a line
  Separator,
  // cell content that is neither a number nor a date
  String,
  // cell content that was parsed as a floating point number
  Double,
  // cell content that was parsed as a date/time value
  DateTime
}
202
/// <summary>
/// A single token of the input together with its (possibly parsed) value.
/// The members are plain fields because the tokenizer passes them as out arguments.
/// </summary>
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue;
  public DateTime dateTimeValue;

  /// <summary>
  /// Creates a token of the given type whose text is <paramref name="value"/>;
  /// the numeric and date values start out as 0.0 and DateTime.MinValue.
  /// </summary>
  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    this.stringValue = value;
    this.doubleValue = 0.0;
    this.dateTimeValue = DateTime.MinValue;
  }

  /// <summary>
  /// Returns the text of the token.
  /// </summary>
  public override string ToString() {
    return this.stringValue;
  }
}
220
221
/// <summary>
/// Reads a text line by line and turns every line into a sequence of tokens:
/// one token per non-empty cell, the shared SeparatorToken for every separator
/// character and the shared NewlineToken at the end of the line.
/// Only the tokens of one line are buffered at a time.
/// </summary>
internal class Tokenizer {
  // text of the shared separator token (shows up in "Expected: ..." error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private StreamReader reader;
  // unconsumed tokens of the most recently read line;
  // a queue allows removing the front token in constant time
  private Queue<Token> tokens;
  private NumberFormatInfo numberFormatInfo;
  private DateTimeFormatInfo dateTimeFormatInfo;
  private char separator;

  private int currentLineNumber = 0;
  /// <summary>
  /// Number of lines read so far, i.e. the 1-based number of the line whose tokens are buffered.
  /// </summary>
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  /// <summary>
  /// Text of the most recently read line.
  /// </summary>
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  /// <summary>
  /// The single token instance that marks every end of line (compare by reference).
  /// </summary>
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  /// <summary>
  /// The single token instance that marks every separator (compare by reference).
  /// </summary>
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  /// <summary>
  /// Creates a tokenizer on top of <paramref name="reader"/> and buffers the tokens of the first line.
  /// </summary>
  /// <param name="reader">Source of the text; it is not disposed by the tokenizer.</param>
  /// <param name="numberFormatInfo">Format used to recognize numbers.</param>
  /// <param name="dateTimeFormatInfo">Format used to recognize date/time values.</param>
  /// <param name="separator">Character that separates the cells of a line.</param>
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
    ReadNextTokens();
  }

  // Reads the next line (if there is one) and appends its tokens to the buffer.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    foreach (string part in Split(CurrentLine)) {
      if (part == null) {
        // a separator character was found
        tokens.Enqueue(SeparatorToken);
        continue;
      }
      // cells that are empty after trimming do not produce a token
      string trimmed = part.Trim();
      if (trimmed.Length > 0) tokens.Enqueue(MakeToken(trimmed));
    }
    tokens.Enqueue(NewlineToken);
    CurrentLineNumber++;
  }

  // Splits a line at the separator character. Separators are reported as null entries
  // (and not as a special text) so that no cell content can be mistaken for a separator.
  private IEnumerable<string> Split(string line) {
    StringBuilder cell = new StringBuilder();
    foreach (char c in line) {
      if (c == separator) {
        yield return cell.ToString();
        cell.Length = 0;
        yield return null;
      } else {
        cell.Append(c);
      }
    }
    yield return cell.ToString();
  }

  // Creates the token for the text of a cell: a number if the text can be parsed as
  // a number, otherwise a date/time value if it can be parsed as such, otherwise a string.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
      token.type = TokenTypeEnum.DateTime;
    }
    return token;
  }

  /// <summary>
  /// Returns the next token without consuming it.
  /// Throws an InvalidOperationException when no token is left (see HasNext).
  /// </summary>
  public Token Peek() {
    return tokens.Peek();
  }

  /// <summary>
  /// Consumes and returns the next token. When the buffered line is used up
  /// the tokens of the following line are read immediately.
  /// Throws an InvalidOperationException when no token is left (see HasNext).
  /// </summary>
  public Token Next() {
    Token next = tokens.Dequeue();
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  /// <summary>
  /// Returns true while there are buffered tokens or unread text.
  /// </summary>
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
325	#endregion
326
327	#region parsing
// Parses the optional header line and all data rows from the current tokenizer.
// Throws a parsing error when the input is empty or does not contain any data row.
private void Parse() {
  // an empty input has no token at all; report that as a parsing error
  // instead of failing while peeking at the first token
  if (!tokenizer.HasNext()) Error("Couldn't parse data values. The file is empty.", "", tokenizer.CurrentLineNumber);
  ParseVariableNames();
  if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
  ParseValues();
  if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
}
334
// Reads all remaining lines as data rows and appends them to rowValues.
// Lines without any token are skipped. Throws a parsing error when a row
// has a different number of values than the first row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    if (tokenizer.Peek() == tokenizer.NewlineToken) {
      // empty line
      tokenizer.Next();
      continue;
    }

    // remember the line of this row now: consuming the newline token below makes
    // the tokenizer read the following line, which advances its line counter
    int rowLineNumber = tokenizer.CurrentLineNumber;

    List<object> row = new List<object>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);

    // all rows have to have the same number of values
    // the first row defines how many values are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "",
            rowLineNumber);
    }
    rowValues.Add(row);
  }
}
359
// Returns the value of the next cell. An empty cell (the next token is a separator
// or the end of the line) yields string.Empty and does not consume any token.
private object NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  bool cellIsEmpty = upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken;
  if (cellIsEmpty) return string.Empty;

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
      return double.NaN;
    case TokenTypeEnum.String:
      return current.stringValue;
    case TokenTypeEnum.Double:
      return current.doubleValue;
    case TokenTypeEnum.DateTime:
      return current.dateTimeValue;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // not reached because Error() always throws
      throw new InvalidOperationException();
  }
}
377
// Reads the variable names from the first line. Nothing is consumed when the
// first token is a number, because then the file has no header line.
private void ParseVariableNames() {
  // if first token is double no variable names are given
  if (tokenizer.Peek().type == TokenTypeEnum.Double) return;

  // the first line must contain variable names
  List<Token> tokens = new List<Token>();
  Token valueToken = tokenizer.Next();
  tokens.Add(valueToken);
  // stop as soon as the end of the header line has been consumed (trailing separator);
  // otherwise a following line that starts with a separator would be read as names
  while (valueToken != tokenizer.NewlineToken
         && tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    valueToken = tokenizer.Next();
    if (valueToken != tokenizer.NewlineToken) {
      tokens.Add(valueToken);
    }
  }
  // the header line has to end after the last name
  if (valueToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
}
399
// Consumes the next token and raises a parsing error when it is not the expected one.
private void Expect(Token expectedToken) {
  Token found = tokenizer.Next();
  if (found == expectedToken) return;

  Error("Expected: " + expectedToken, found.stringValue, tokenizer.CurrentLineNumber);
}
406
// Always throws a DataFormatException describing the problem, the offending token and the line.
private void Error(string message, string token, int lineNumber) {
  string description = "Error while parsing.\n" + message;
  throw new DataFormatException(description, token, lineNumber);
}
410	#endregion
411
/// <summary>
/// Raised when the contents of a file cannot be parsed. The message contains
/// the offending token and the line number in addition to the description.
/// </summary>
[Serializable]
private class DataFormatException : Exception {
  private int line;
  private string token;

  /// <summary>
  /// Line number that was passed to the constructor.
  /// </summary>
  public int Line {
    get { return this.line; }
  }

  /// <summary>
  /// Token text that was passed to the constructor.
  /// </summary>
  public string Token {
    get { return this.token; }
  }

  public DataFormatException(string message, string token, int line)
    : base(string.Format("{0}\nToken: {1} (line: {2})", message, token, line)) {
    this.line = line;
    this.token = token;
  }

  // constructor used by the runtime serialization infrastructure
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
  }
}
430	}
431	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences