Context Navigation

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 9611

Visit:

Last change on this file since 9611 was 9611, checked in by mkommend, 11 years ago
#2070: Removed flag NumberStyles.AllowTrailingSign in the TableFileParser.
File size: 20.7 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22
23	using System;
24	using System.Collections;
25	using System.Collections.Generic;
26	using System.Globalization;
27	using System.IO;
28	using System.Linq;
29	using System.Runtime.Serialization;
30	using System.Text;
31
32	namespace HeuristicLab.Problems.Instances.DataAnalysis {
33	public class TableFileParser {
// Number of characters sampled from a file when its format is guessed.
private const int BUFFER_SIZE = 65536;
// Candidate column separators considered when the file format is guessed.
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t' };
// Tokenizer created by the most recent parse or header check.
private Tokenizer tokenizer;
// Parsed cells, one inner list per data row, before they are turned into columns.
private List<List<object>> rowValues;
38
// Backing field of Rows; Parse sets it to the number of parsed data rows.
private int rows;
/// <summary>
/// Number of data rows.
/// </summary>
public int Rows {
  get { return this.rows; }
  set { this.rows = value; }
}
44
// Backing field of Columns; Parse sets it to the number of values in the first data row.
private int columns;
/// <summary>
/// Number of columns.
/// </summary>
public int Columns {
  get { return this.columns; }
  set { this.columns = value; }
}
50
// Column-wise result of the last Parse call; stays null until Parse has run.
private List<IList> values;
/// <summary>
/// The parsed data, one list per column. Parse creates each column as a
/// List&lt;double&gt;, List&lt;DateTime&gt; or List&lt;string&gt;.
/// </summary>
public List<IList> Values {
  get { return this.values; }
}
57
// Names read from the header line; empty when no header was parsed.
private List<string> variableNames;
/// <summary>
/// The column names read from the header line. When no names were read,
/// placeholder names "X000", "X001", ... are generated, one per column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) {
      return variableNames;
    }

    // no header names available => synthesize one name per column
    var generated = new string[columns];
    int index = 0;
    while (index < generated.Length) {
      generated[index] = "X" + index.ToString("000");
      index++;
    }
    return generated;
  }
}
71
/// <summary>
/// Creates a parser with no parsed rows and no variable names.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<object>>();
}
76
/// <summary>
/// Determines the format of the file and then checks whether its first token
/// is something other than a number.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <returns>true if the first token is not a number</returns>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);
  // the overload taking a file name opens the file exactly like this method used to do
  return AreColumnNamesInFirstLine(fileName, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
}
86
/// <summary>
/// Checks the first token of the stream using the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <returns>true if the first token is not a number</returns>
public bool AreColumnNamesInFirstLine(Stream stream) {
  return AreColumnNamesInFirstLine(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',');
}
93
/// <summary>
/// Opens the file for reading and checks whether its first token is something
/// other than a number, using the given formats.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true if the first token is not a number</returns>
public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  FileStream file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  try {
    return AreColumnNamesInFirstLine(file, numberFormat, dateTimeFormatInfo, separator);
  }
  finally {
    file.Dispose();
  }
}
100
/// <summary>
/// Checks whether the first token of the stream is something other than a
/// number, using the given formats. The stream is wrapped in a StreamReader
/// that is disposed before this method returns.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>
/// true if the first token is not a number; false if it is a number or if the
/// stream contains no line at all
/// </returns>
public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (StreamReader reader = new StreamReader(stream)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // an empty stream produces no tokens; Peek() would throw on the empty token list
    if (!tokenizer.HasNext()) {
      return false;
    }
    return tokenizer.Peek().type != TokenTypeEnum.Double;
  }
}
108
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line holds the column names</param>
public void Parse(string fileName, bool columnNamesInFirstLine) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  // own the file handle here (as the overload with explicit formats does) so it
  // is released on every path, not only once the callee has wrapped it
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
  }
}
121
/// <summary>
/// Opens the given file for reading and parses it with the given formats.
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line holds the column names</param>
public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
  FileStream file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  try {
    Parse(file, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine);
  }
  finally {
    file.Dispose();
  }
}
135
/// <summary>
/// Parses a stream with the default format: NumberFormatInfo.InvariantInfo,
/// DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line holds the column names</param>
public void Parse(Stream stream, bool columnNamesInFirstLine) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine);
}
147
/// <summary>
/// Parses a stream with the given formats and stores the result column-wise
/// in Values. The stream is wrapped in a StreamReader that is disposed once
/// all lines have been read.
/// </summary>
/// <param name="stream">Stream which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line holds the column names</param>
public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine) {
  // start from a clean state so that re-using the parser neither appends to the
  // rows of a previous parse nor keeps header names of a previous source;
  // new lists are assigned because VariableNames hands out the list itself
  rowValues = new List<List<object>>();
  variableNames = new List<string>();

  using (StreamReader reader = new StreamReader(stream)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // parse the file
    Parse(columnNamesInFirstLine);
  }

  // translate the list of rows into typed columns
  // (Parse(bool) throws when no row was read, so rowValues[0] exists here)
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new List<IList>(columns);

  //create columns
  for (int col = 0; col < columns; col++) {
    values.Add(CreateColumn(col));
  }

  //fill with values
  foreach (List<object> row in rowValues) {
    for (int col = 0; col < row.Count; col++) {
      IList column = values[col];
      object element = row[col];
      // cells that do not match the type of their column are replaced by the
      // "missing" value of that type
      if (column is List<double> && !(element is double))
        column.Add(double.NaN);
      else if (column is List<DateTime> && !(element is DateTime))
        column.Add(DateTime.MinValue);
      else if (column is List<string> && !(element is string))
        column.Add(string.Empty);
      else
        column.Add(element);
    }
  }
}

// Creates the empty list for one column. Its element type is the most frequent
// type among the first (at most 10) non-empty cells of that column; a column
// without any non-empty cell becomes a string column.
private IList CreateColumn(int columnIndex) {
  // materialized once because the sample is inspected twice
  List<Type> sampleTypes = rowValues
    .Select(r => r[columnIndex])
    .Where(v => v != null && v as string != string.Empty)
    .Take(10)
    .Select(v => v.GetType())
    .ToList();
  if (sampleTypes.Count == 0) {
    return new List<string>();
  }

  Type columnType = sampleTypes.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
  if (columnType == typeof(double)) return new List<double>();
  else if (columnType == typeof(DateTime)) return new List<DateTime>();
  else if (columnType == typeof(string)) return new List<string>();
  else throw new InvalidOperationException();
}
201
/// <summary>
/// Opens the file at the given path for reading and guesses its number
/// format, date/time format and column separator.
/// </summary>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // the Stream overload wraps the file in a StreamReader and disposes it
  Stream file = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  DetermineFileFormat(file, out numberFormat, out dateTimeFormatInfo, out separator);
}
205
/// <summary>
/// Guesses number format, date/time format and column separator from the
/// character frequencies in the first block of the stream (the first line is
/// skipped). The stream is wrapped in a StreamReader that is disposed before
/// this method returns.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">invariant format, or the de-DE format if decimal commas are assumed</param>
/// <param name="dateTimeFormatInfo">date/time format matching <paramref name="numberFormat"/></param>
/// <param name="separator">most frequent candidate separator, or ' ' if none occurs often enough</param>
public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  using (StreamReader reader = new StreamReader(stream)) {
    // skip first line
    reader.ReadLine();
    // read a block
    char[] buffer = new char[BUFFER_SIZE];
    int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
    // count frequency of special characters
    Dictionary<char, int> charCounts = buffer.Take(charsRead)
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());

    // depending on the characters occurring in the block
    // we distinguish a number of different cases based on the following rules:
    // many points => it must be English number format, the other frequently occurring char is the separator
    // no points but many commas => this is the problematic case. Either German format (real numbers)
    //   or English format (only integer numbers) with ',' as separator
    //   => more than one chain of digits and commas with more than two commas => English format
    // no points no commas => English format (only integer numbers), use the other frequently occurring char as separator
    // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
    if (OccurrencesOf(charCounts, '.') > 10) {
      numberFormat = NumberFormatInfo.InvariantInfo;
      dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts);
    } else if (OccurrencesOf(charCounts, ',') > 10) {
      // no points and many commas
      // count the number of tokens (chains of only digits and commas) that contain more than two comma characters
      int tokensWithMultipleCommas = 0;
      for (int i = 0; i < charsRead; i++) {
        int nCommas = 0;
        while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
          if (buffer[i] == ',') nCommas++;
          i++;
        }
        if (nCommas > 2) tokensWithMultipleCommas++;
      }
      if (tokensWithMultipleCommas > 1) {
        // English format (only integer values) with ',' as separator
        numberFormat = NumberFormatInfo.InvariantInfo;
        dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
        separator = ',';
      } else {
        // German format (real values); ',' is the decimal mark and cannot be the separator
        CultureInfo german = new CultureInfo("de-DE");
        numberFormat = NumberFormatInfo.GetInstance(german);
        dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
        separator = MostFrequentSeparator(charCounts, ',');
      }
    } else {
      // no points and no commas => English format
      numberFormat = NumberFormatInfo.InvariantInfo;
      dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts);
    }
  }
}

// Returns the candidate separator that occurs most often (and more than 10
// times) in the sampled block, ignoring the disallowed characters; falls back
// to ' ' when no candidate qualifies.
private static char MostFrequentSeparator(Dictionary<char, int> charCounts, params char[] disallowedSeparators) {
  return POSSIBLE_SEPARATORS
    .Except(disallowedSeparators)
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderBy(c => -OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
277
// Returns how often the character was counted, or 0 if it has no entry.
private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  int count;
  return charCounts.TryGetValue(c, out count) ? count : 0;
}
281
282	#region tokenizer
// Kinds of tokens produced by the tokenizer.
internal enum TokenTypeEnum {
  // end of an input line
  NewLine,
  // column separator
  Separator,
  // text that is neither a number nor a date/time
  String,
  // text that was parsed as a double
  Double,
  // text that was parsed as a DateTime
  DateTime
}
286
// A single token: its kind, its original text and the parsed value (if any).
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  public double doubleValue;
  public DateTime dateTimeValue;

  // Creates a token of the given kind with the given text; the numeric and
  // date/time values start out as 0.0 and DateTime.MinValue.
  public Token(TokenTypeEnum type, string value) {
    this.type = type;
    this.stringValue = value;
    this.doubleValue = 0.0;
    this.dateTimeValue = DateTime.MinValue;
  }

  // The textual form of a token is its original text.
  public override string ToString() {
    return this.stringValue;
  }
}
304
305
// Splits the lines of a reader into tokens. One line is tokenized at a time;
// the next line is read as soon as the last token of the current line has
// been taken.
internal class Tokenizer {
  private StreamReader reader;
  private List<Token> tokens;
  private NumberFormatInfo numberFormatInfo;
  private DateTimeFormatInfo dateTimeFormatInfo;
  private char separator;
  // text of the shared separator token (shown e.g. in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private int currentLineNumber = 0;
  // Number of lines read so far, i.e. the 1-based number of the line the
  // buffered tokens belong to.
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  // Text of the line that was read last.
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  // The single token instance that marks the end of every line.
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  // The single token instance that stands for every column separator.
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  // Creates the tokenizer and immediately tokenizes the first line (if any).
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new List<Token>();
    ReadNextTokens();
  }

  // Reads the next line (if there is one) and appends its tokens followed by
  // the newline token.
  private void ReadNextTokens() {
    if (!reader.EndOfStream) {
      CurrentLine = reader.ReadLine();
      string[] fields = CurrentLine.Split(separator);
      for (int i = 0; i < fields.Length; i++) {
        // separators are emitted directly as the shared separator token; they are
        // not round-tripped through a marker string, so a cell whose text equals
        // the marker ("#") stays an ordinary value
        if (i > 0) tokens.Add(SeparatorToken);
        string trimmedStr = fields[i].Trim();
        // empty cells produce no value token
        if (!string.IsNullOrEmpty(trimmedStr)) tokens.Add(MakeToken(trimmedStr));
      }
      tokens.Add(NewlineToken);
      CurrentLineNumber++;
    }
  }

  // Classifies a non-empty cell text as double, date/time or (fallback) string.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
      return token;
    } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
      token.type = TokenTypeEnum.DateTime;
      return token;
    }

    // couldn't parse the token as an int or float number or datetime value so return a string token
    return token;
  }

  // Returns the next token without consuming it.
  public Token Peek() {
    return tokens[0];
  }

  // Consumes and returns the next token; tokenizes the following line when
  // the current one is used up.
  public Token Next() {
    Token next = tokens[0];
    tokens.RemoveAt(0);
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  // True while tokens are buffered or the reader has more input.
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
409	#endregion
410
411	#region parsing
// Parses the optional header line and then all data rows from the current
// tokenizer; raises a parse error when no data row could be read.
private void Parse(bool columnNamesInFirstLine) {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  if (columnNamesInFirstLine) {
    ParseVariableNames();
    // nothing left after the header => there are no data rows
    if (!tokenizer.HasNext()) {
      Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
    }
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
423
// Reads all remaining lines as data rows and appends them to rowValues.
// Empty lines are skipped; every row must have as many values as the first one.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    if (tokenizer.Peek() == tokenizer.NewlineToken) {
      // empty line
      tokenizer.Next();
      continue;
    }

    // remember the line of this row now: consuming its trailing newline token
    // makes the tokenizer read the following line and advance CurrentLineNumber
    int lineNumber = tokenizer.CurrentLineNumber;

    List<object> row = new List<object>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);

    // all rows have to have the same number of values
    // the first row defines how many samples are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + lineNumber + " has " + row.Count + " columns.", "",
            lineNumber);
    }
    rowValues.Add(row);
  }
}
448
// Returns the value of the next cell. A cell without content (the next token
// is a separator or the end of the line) yields string.Empty and consumes
// nothing; otherwise the next token is consumed and its value returned.
private object NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) {
    return string.Empty;
  }

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
      return double.NaN;
    case TokenTypeEnum.String:
      return current.stringValue;
    case TokenTypeEnum.Double:
      return current.doubleValue;
    case TokenTypeEnum.DateTime:
      return current.dateTimeValue;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
466
// Reads the first line as a list of variable names: one token, followed by
// (separator, token) pairs, up to the end of the line.
private void ParseVariableNames() {
  var headerTokens = new List<Token>();
  Token latest = tokenizer.Next();
  headerTokens.Add(latest);

  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    latest = tokenizer.Next();
    // a separator directly before the end of the line adds no name
    if (latest == tokenizer.NewlineToken) continue;
    headerTokens.Add(latest);
  }

  // the end of the line has not been consumed yet => it must come next
  if (latest != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }

  var names = new List<string>(headerTokens.Count);
  foreach (Token headerToken in headerTokens) {
    names.Add(headerToken.stringValue.Trim());
  }
  variableNames = names;
}
485
// Consumes the next token and raises a parse error unless it is the expected
// token instance.
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) {
    return;
  }
  Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
492
// Always throws a DataFormatException carrying the message, the offending
// token text and the line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
496	#endregion
497
/// <summary>
/// Thrown when the input cannot be parsed; carries the offending token and
/// the line number that were passed to the constructor.
/// </summary>
[Serializable]
public class DataFormatException : Exception {
  private int line;
  /// <summary>Line number passed to the constructor.</summary>
  public int Line {
    get { return this.line; }
  }

  private string token;
  /// <summary>Token text passed to the constructor.</summary>
  public string Token {
    get { return this.token; }
  }

  /// <summary>
  /// Creates the exception; token and line are appended to the message.
  /// </summary>
  public DataFormatException(string message, string token, int line)
    : base(Describe(message, token, line)) {
    this.line = line;
    this.token = token;
  }

  /// <summary>
  /// Serialization constructor; forwards to the base class only.
  /// </summary>
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
  }

  // Builds the full exception message from its three parts.
  private static string Describe(string message, string token, int line) {
    return message + "\nToken: " + token + " (line: " + line + ")";
  }
}
516	}
517	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences