Context Navigation

source: branches/Breadcrumbs/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 11069

Visit:

Last change on this file since 11069 was 9652, checked in by sforsten, 11 years ago
#2047: `TableFileParser` can now handle white spaces (currently the character '\0' symbolizes white spaces in the `TableFileParser`)
File size: 21.0 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22
23	using System;
24	using System.Collections;
25	using System.Collections.Generic;
26	using System.Globalization;
27	using System.IO;
28	using System.Linq;
29	using System.Runtime.Serialization;
30
31	namespace HeuristicLab.Problems.Instances.DataAnalysis {
32	public class TableFileParser {
// number of characters that are sampled from the input to guess the file format
private const int BUFFER_SIZE = 65536;
// placeholder separator that stands for "split at whitespace" (missing values cannot be handled with whitespace separation)
private const char WHITESPACECHAR = '\0';
// separator candidates that are considered when the file format is determined
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t', WHITESPACECHAR };
// tokenizer created for the stream that is processed by the parser
private Tokenizer tokenizer;
// parsed cells, one inner list per data row
private List<List<object>> rowValues;
39
private int rows;
/// <summary>
/// Number of data rows; assigned by Parse after the input was read.
/// </summary>
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
45
private int columns;
/// <summary>
/// Number of columns; assigned by Parse from the length of the first data row.
/// </summary>
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}
51
private List<IList> values;
/// <summary>
/// Parsed data, one typed list (double, DateTime or string) per column.
/// The value is only assigned by Parse.
/// </summary>
public List<IList> Values {
  get { return this.values; }
}
58
private List<string> variableNames;
/// <summary>
/// Names of the columns. If no names are available, generated names
/// ("X000", "X001", ...) are returned, one for each column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) return variableNames;

    // no names known => synthesize one name per column
    string[] generated = new string[columns];
    int index = 0;
    while (index < generated.Length) {
      generated[index] = "X" + index.ToString("000");
      index++;
    }
    return generated;
  }
}
72
/// <summary>
/// Creates a parser that holds neither rows nor variable names.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<object>>();
}
77
/// <summary>
/// Determines the format of the file and checks whether its first token is something other than a number.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <returns>true if the first token is not a number, which is taken as a sign for column names</returns>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);

  using (FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    bool namesInFirstLine = AreColumnNamesInFirstLine(input, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
    return namesInFirstLine;
  }
}
87
/// <summary>
/// Checks the first token of the stream using the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <returns>true if the first token is not a number</returns>
public bool AreColumnNamesInFirstLine(Stream stream) {
  return AreColumnNamesInFirstLine(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',');
}
94
/// <summary>
/// Opens the file for reading and checks its first token with the given formats.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token is not a number</returns>
public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    bool namesInFirstLine = AreColumnNamesInFirstLine(input, numberFormat, dateTimeFormatInfo, separator);
    return namesInFirstLine;
  }
}
101
/// <summary>
/// Tokenizes the beginning of the stream with the given formats and checks whether the
/// first token is something other than a number. The stream is wrapped in a StreamReader
/// that is disposed before the method returns.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token is not a number; false for an empty stream</returns>
public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (StreamReader reader = new StreamReader(stream)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // an empty stream produces no token at all, Peek() would fail on the empty token list
    if (!tokenizer.HasNext()) return false;
    return tokenizer.Peek().type != TokenTypeEnum.Double;
  }
}
109
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line contains the variable names</param>
public void Parse(string fileName, bool columnNamesInFirstLine) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  // dispose the file stream on every path, also when parsing fails with an exception
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
  }
}
122
/// <summary>
/// Opens the given file for reading and parses it with the given formats.
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line contains the variable names</param>
public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
  using (FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(input, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
  }
}
136
/// <summary>
/// Parses a stream with the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line contains the variable names</param>
public void Parse(Stream stream, bool columnNamesInFirstLine) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine);
}
148
/// <summary>
/// Parses a stream with the given formats. Afterwards Rows, Columns and Values describe
/// the parsed data. The stream is wrapped in a StreamReader that is disposed when reading is finished.
/// </summary>
/// <param name="stream">Stream which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line contains the variable names</param>
public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
  // start from a clean state, otherwise a reused parser would append to the rows
  // (and keep the variable names) of an earlier Parse call
  rowValues = new List<List<object>>();
  variableNames = new List<string>();

  using (StreamReader reader = new StreamReader(stream)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // parse the file (throws a DataFormatException if no data row can be read)
    Parse(columnNamesInFirstLine);
  }

  // translate the list of rows into one typed list per column
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new List<IList>();

  for (int col = 0; col < columns; col++) {
    int columnIndex = col;
    // the types of the first ten non-empty cells decide the type of the column
    List<Type> sampleTypes = rowValues
      .Select(r => r[columnIndex])
      .Where(v => v != null && v as string != string.Empty)
      .Take(10)
      .Select(v => v.GetType())
      .ToList();

    IList column = CreateColumn(sampleTypes);
    foreach (List<object> row in rowValues) {
      column.Add(CoerceToColumnType(column, row[columnIndex]));
    }
    values.Add(column);
  }
}

// Creates the empty list for a column; the most frequent sample type decides the element type,
// a column without any non-empty sample becomes a string column.
private static IList CreateColumn(List<Type> sampleTypes) {
  if (sampleTypes.Count == 0) return new List<string>();

  Type columnType = sampleTypes.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
  if (columnType == typeof(double)) return new List<double>();
  if (columnType == typeof(DateTime)) return new List<DateTime>();
  if (columnType == typeof(string)) return new List<string>();
  throw new InvalidOperationException();
}

// Returns the element itself if it matches the element type of the column, otherwise the
// placeholder of that type (double.NaN, DateTime.MinValue or string.Empty).
private static object CoerceToColumnType(IList column, object element) {
  if (column is List<double>) return element is double ? element : double.NaN;
  if (column is List<DateTime>) return element is DateTime ? element : DateTime.MinValue;
  if (column is List<string>) return element is string ? element : string.Empty;
  return element;
}
202
/// <summary>
/// Opens the file for reading and determines number format, datetime format and separator from its content.
/// </summary>
/// <param name="path">file which is inspected</param>
/// <param name="numberFormat">detected format of numbers</param>
/// <param name="dateTimeFormatInfo">detected format of datetime</param>
/// <param name="separator">detected separator</param>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // dispose the file stream on every path, in line with the other file based methods of this class
  using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    DetermineFileFormat(stream, out numberFormat, out dateTimeFormatInfo, out separator);
  }
}
206
/// <summary>
/// Guesses number format, datetime format and separator from a sample of the stream.
/// The first line is skipped and up to BUFFER_SIZE characters after it are analyzed.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">detected format of numbers</param>
/// <param name="dateTimeFormatInfo">detected format of datetime</param>
/// <param name="separator">detected separator</param>
public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  using (StreamReader reader = new StreamReader(stream)) {
    // the first line is not part of the sample
    reader.ReadLine();
    char[] sample = new char[BUFFER_SIZE];
    int sampleLength = reader.ReadBlock(sample, 0, BUFFER_SIZE);
    // frequency of every character in the sample
    Dictionary<char, int> frequencies = sample.Take(sampleLength)
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());

    // Depending on the characters occurring in the sample we distinguish the following cases:
    // many points => it must be English number format, the other frequently occurring char is the separator
    // no points but many commas => this is the problematic case. Either German format (real numbers)
    //   or English format (only integer numbers) with ',' as separator
    //   => check the sample in more detail:
    //      English: 0, 0, 0, 0
    //      German:  0,0 0,0 0,0 ...
    // no points and no commas => English format (only integer numbers), use the other frequently occurring char as separator
    // In all cases ' ' is only used as separator if no other separator is possible
    // (spaces can also occur in addition to separators).
    bool manyPoints = OccurrencesOf(frequencies, '.') > 10;
    if (!manyPoints && OccurrencesOf(frequencies, ',') > 10) {
      // count the tokens (chains of only digits and commas) that contain more than two commas
      int tokensWithMultipleCommas = 0;
      int pos = 0;
      while (pos < sampleLength) {
        int commas = 0;
        while (pos < sampleLength && (sample[pos] == ',' || Char.IsDigit(sample[pos]))) {
          if (sample[pos] == ',') commas++;
          pos++;
        }
        if (commas > 2) tokensWithMultipleCommas++;
        pos++;
      }

      if (tokensWithMultipleCommas > 1) {
        // English format (only integer values) with ',' as separator
        numberFormat = NumberFormatInfo.InvariantInfo;
        dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
        separator = ',';
        return;
      }

      // German format (real values), the comma is the decimal mark and cannot be the separator
      CultureInfo german = new CultureInfo("de-DE");
      numberFormat = NumberFormatInfo.GetInstance(german);
      dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
      separator = MostFrequentSeparator(frequencies, ',');
      return;
    }

    // many points, or neither points nor commas => English format
    numberFormat = NumberFormatInfo.InvariantInfo;
    dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    separator = MostFrequentSeparator(frequencies);
  }
}

// Returns the possible separator that occurs most often (and more than ten times) in the sample,
// ignoring the excluded characters; ' ' is returned if no candidate is frequent enough.
private static char MostFrequentSeparator(Dictionary<char, int> charCounts, params char[] excluded) {
  return POSSIBLE_SEPARATORS
    .Except(excluded)
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderBy(c => -OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
278
// Returns how often the character c was counted, 0 if it does not occur in charCounts.
private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  int count;
  // single lookup instead of ContainsKey followed by the indexer
  return charCounts.TryGetValue(c, out count) ? count : 0;
}
282
283	#region tokenizer
// Kinds of tokens that are produced by the tokenizer.
internal enum TokenTypeEnum {
  // end of a line
  NewLine,
  // boundary between two cells of a line
  Separator,
  // cell that is neither a number nor a datetime
  String,
  // cell that was parsed as a double
  Double,
  // cell that was parsed as a DateTime
  DateTime
}
287
// A single token of the input. The value members are public fields
// because the tokenizer fills them directly as out arguments of TryParse.
internal class Token {
  public TokenTypeEnum type;
  // text of the token as it was read
  public string stringValue;
  public double doubleValue;
  public DateTime dateTimeValue;

  // Creates a token of the given type; the numeric value starts at 0.0 and the datetime value at DateTime.MinValue.
  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    this.stringValue = value;
    this.doubleValue = 0.0;
    this.dateTimeValue = DateTime.MinValue;
  }

  // The text of the token is its string representation.
  public override string ToString() {
    return this.stringValue;
  }
}
305
306
// Splits the lines of a reader into tokens. The tokens of exactly one line are buffered;
// the next line is read as soon as the last buffered token was consumed.
internal class Tokenizer {
  private StreamReader reader;
  // tokens of the line that was read most recently
  private List<Token> tokens;
  // index of the next unconsumed token in tokens (avoids removing from the front of the list)
  private int position;
  private NumberFormatInfo numberFormatInfo;
  private DateTimeFormatInfo dateTimeFormatInfo;
  private char separator;
  // text of the separator token (only used for display, e.g. in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private int currentLineNumber = 0;
  // number of lines that have been read from the reader so far
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  // text of the line that was read most recently
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  // the one token instance that marks the end of every line (compared by reference)
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  // the one token instance that marks every cell boundary (compared by reference)
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  // Creates the tokenizer and immediately reads the first line of the reader (if there is one).
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new List<Token>();
    position = 0;
    ReadNextTokens();
  }

  // Reads the next line and buffers its tokens followed by the newline token.
  // Does nothing if the end of the stream is reached.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    string[] cells = Split(CurrentLine);
    for (int i = 0; i < cells.Length; i++) {
      string trimmed = cells[i].Trim();
      // empty cells produce no value token, only the surrounding separators remain
      if (!string.IsNullOrEmpty(trimmed)) tokens.Add(MakeToken(trimmed));
      // The separator token is inserted directly between two cells. It is not derived from
      // the cell text, so a cell that contains just "#" stays a regular string value.
      if (i < cells.Length - 1) tokens.Add(SeparatorToken);
    }
    tokens.Add(NewlineToken);
    CurrentLineNumber++;
  }

  // Splits a line into its cells; with WHITESPACECHAR as separator the line is split
  // at whitespace and empty entries are dropped.
  private string[] Split(string line) {
    if (separator == WHITESPACECHAR) {
      return line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
    }
    return line.Split(separator);
  }

  // Creates a Double token if the text is a number, a DateTime token if it is a datetime,
  // and a String token otherwise.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
      token.type = TokenTypeEnum.DateTime;
    }
    return token;
  }

  // Returns the next token without consuming it; check HasNext() first,
  // accessing the empty buffer throws an ArgumentOutOfRangeException.
  public Token Peek() {
    return tokens[position];
  }

  // Consumes and returns the next token; the following line is read
  // as soon as the buffered line is used up.
  public Token Next() {
    Token next = tokens[position];
    position++;
    if (position == tokens.Count) {
      tokens.Clear();
      position = 0;
      ReadNextTokens();
    }
    return next;
  }

  // True if a token is buffered or the reader still provides data.
  public bool HasNext() {
    return position < tokens.Count || !reader.EndOfStream;
  }
}
414	#endregion
415
416	#region parsing
// Parses the optional line of variable names and all data rows of the current tokenizer.
// Raises a DataFormatException if no data row is found.
private void Parse(bool columnNamesInFirstLine) {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  if (columnNamesInFirstLine) {
    ParseVariableNames();
    // nothing left after the names => there are no data rows
    if (!tokenizer.HasNext()) {
      Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
    }
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
428
// Reads all remaining lines of the tokenizer and appends one list of cell values per
// non-empty line to rowValues. Raises a DataFormatException if a row does not have
// as many cells as the first row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    if (tokenizer.Peek() == tokenizer.NewlineToken) {
      // skip empty lines
      tokenizer.Next();
      continue;
    }

    // Remember the line of this row before it is consumed: the tokenizer reads the following
    // line (and increments its line number) as soon as the newline token is taken.
    int lineNumber = tokenizer.CurrentLineNumber;

    List<object> row = new List<object>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);

    // all rows have to have the same number of values
    // the first row defines how many values are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + lineNumber + " has " + row.Count + " columns.", "",
            lineNumber);
    }
    rowValues.Add(row);
  }
}
453
// Returns the value of the next cell. If the next token is the separator or newline token,
// the cell is empty: nothing is consumed and string.Empty is returned.
private object NextValue(Tokenizer tokenizer) {
  bool cellIsEmpty = tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken;
  if (cellIsEmpty) return string.Empty;

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
      return double.NaN;
    case TokenTypeEnum.String:
      return current.stringValue;
    case TokenTypeEnum.Double:
      return current.doubleValue;
    case TokenTypeEnum.DateTime:
      return current.dateTimeValue;
  }

  // found an unexpected token => throw error
  Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
  // this line is never executed because Error() throws an exception
  throw new InvalidOperationException();
}
471
// Reads the first line as separator-delimited variable names and stores the trimmed names
// in variableNames. Raises a DataFormatException if the input is empty or if the line
// does not end where a newline is expected.
private void ParseVariableNames() {
  // without this check tokenizer.Next() would fail on the empty token buffer
  if (!tokenizer.HasNext()) {
    Error("Couldn't parse variable names because the input is empty.", "", tokenizer.CurrentLineNumber);
  }

  // the first line must contain variable names
  List<Token> tokens = new List<Token>();
  Token valueToken = tokenizer.Next();
  tokens.Add(valueToken);
  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    valueToken = tokenizer.Next();
    // a separator at the very end of the line is followed by the newline token, which is not a name
    if (valueToken != tokenizer.NewlineToken) {
      tokens.Add(valueToken);
    }
  }
  // the newline token may already have been consumed inside the loop
  if (valueToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
}
490
// Consumes the next token and raises a DataFormatException if it is not the expected token instance.
private void Expect(Token expectedToken) {
  Token consumed = tokenizer.Next();
  if (consumed == expectedToken) return;

  Error("Expected: " + expectedToken, consumed.stringValue, tokenizer.CurrentLineNumber);
}
497
// Always throws a DataFormatException that carries the message, the offending token and the line number.
private void Error(string message, string token, int lineNumber) {
  string text = "Error while parsing.\n" + message;
  throw new DataFormatException(text, token, lineNumber);
}
501	#endregion
502
// Exception that is thrown when the input cannot be parsed; it carries the token and line that caused the problem.
[Serializable]
public class DataFormatException : Exception {
  // line number that was passed to the constructor
  public int Line { get; private set; }
  // text of the token that was passed to the constructor
  public string Token { get; private set; }

  // The token and the line are appended to the message of the exception.
  public DataFormatException(string message, string token, int line)
    : base(message + "\nToken: " + token + " (line: " + line + ")") {
    Token = token;
    Line = line;
  }

  // Constructor used for deserialization; Token and Line keep their default values here.
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
  }
}
521	}
522	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences