Context Navigation

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 13414

Visit:

Last change on this file since 13414 was 13414, checked in by gkronber, 8 years ago
#2071: added progress reporting when importing regression problem data from csv files.
File size: 22.3 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22
23	using System;
24	using System.Collections;
25	using System.Collections.Generic;
26	using System.Globalization;
27	using System.IO;
28	using System.Linq;
29	using System.Runtime.Serialization;
30
31	namespace HeuristicLab.Problems.Instances.DataAnalysis {
32	public class TableFileParser : Progress<long> { // reports the number of bytes read
// number of characters sampled from a file when its format is guessed
private const int BUFFER_SIZE = 65536;
// stands for "split on any whitespace" (missing values cannot be represented with this separator)
private const char WHITESPACECHAR = '\0';
// separator candidates that are considered when the file format is guessed
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t', WHITESPACECHAR };
// tokenizer of the most recently started parse
private Tokenizer tokenizer;
// parsed cells, one inner list per data row
private List<List<object>> rowValues;
39
// backing field of Rows; written by the stream based Parse overload
private int rows;
/// <summary>
/// Number of data rows of the parsed table.
/// </summary>
public int Rows {
  get { return this.rows; }
  set { this.rows = value; }
}
45
// backing field of Columns; written by the stream based Parse overload
private int columns;
/// <summary>
/// Number of columns of the parsed table.
/// </summary>
public int Columns {
  get { return this.columns; }
  set { this.columns = value; }
}
51
// one typed list (double, DateTime or string elements) per column
private List<IList> values;
/// <summary>
/// Column-wise values of the parsed table; assigned by the stream based Parse overload.
/// </summary>
public List<IList> Values {
  get { return this.values; }
}
58
// names read from the header line; stays empty when no header was parsed
private List<string> variableNames;
/// <summary>
/// Names of the columns. When no names were parsed, generic names
/// X000, X001, ... are generated, one for each column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) return variableNames;

    var generated = new string[columns];
    int index = 0;
    while (index < generated.Length) {
      generated[index] = "X" + index.ToString("000");
      index++;
    }
    return generated;
  }
}
72
/// <summary>
/// Creates a parser without any rows or variable names.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<object>>();
}
77
/// <summary>
/// Guesses the format of the file and then checks whether its first line holds column names.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <returns>true unless the first token of the file is a number</returns>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);
  // the file-name overload opens the file with the same mode, access and sharing flags
  return AreColumnNamesInFirstLine(fileName, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
}
87
/// <summary>
/// Checks whether the first line of the stream holds column names, using the default format:
/// NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <returns>true unless the first token of the stream is a number</returns>
public bool AreColumnNamesInFirstLine(Stream stream) {
  return AreColumnNamesInFirstLine(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',');
}
94
/// <summary>
/// Opens the file and checks with the given formats whether its first line holds column names.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true unless the first token of the file is a number</returns>
public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (var fileStream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    bool headerFound = AreColumnNamesInFirstLine(fileStream, numberFormat, dateTimeFormatInfo, separator);
    return headerFound;
  }
}
101
/// <summary>
/// Tokenizes the first line of the stream and checks whether it holds column names.
/// The reader created for the stream is disposed before the method returns.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <returns>true unless the first token of the stream is a number</returns>
public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
  DateTimeFormatInfo dateTimeFormatInfo, char separator) {
  using (var reader = new StreamReader(stream)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    TokenTypeEnum firstTokenType = tokenizer.PeekType();
    return firstTokenType != TokenTypeEnum.Double;
  }
}
109
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the file holds the column names</param>
/// <param name="lineLimit">maximum number of lines which are parsed; a negative value means no limit</param>
public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  // own the file stream here (like the overload taking explicit formats does)
  // so that the file handle is released even if parsing fails early
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
  }
}
122
/// <summary>
/// Opens a file and parses it with the given formats
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line of the file holds the column names</param>
/// <param name="lineLimit">maximum number of lines which are parsed; a negative value means no limit</param>
public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
  using (var fileStream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    this.Parse(fileStream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
  }
}
136
/// <summary>
/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the stream holds the column names</param>
/// <param name="lineLimit">maximum number of lines which are parsed; a negative value means no limit</param>
public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine, lineLimit);
}
148
/// <summary>
/// Parses a stream with the given formats.
/// Afterwards Rows, Columns and Values describe the parsed table; each column is stored as
/// a list of double, DateTime or string, depending on the type found most often in the column.
/// Results of an earlier parse with the same parser instance are discarded.
/// </summary>
/// <param name="stream">Stream which is parsed</param>
/// <param name="numberFormat">Format of numbers</param>
/// <param name="dateTimeFormatInfo">Format of datetime</param>
/// <param name="separator">defines the separator</param>
/// <param name="columnNamesInFirstLine">true if the first line of the stream holds the column names</param>
/// <param name="lineLimit">maximum number of lines which are parsed; a negative value means no limit</param>
public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
  // start from a clean state, otherwise a reused parser would append the new rows to the rows
  // of the previous parse and keep variable names that do not belong to this input
  rowValues.Clear();
  variableNames = new List<string>();

  using (StreamReader reader = new StreamReader(stream)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // parse the file
    Parse(columnNamesInFirstLine, lineLimit);
  }

  // translate the list of rows into typed columns
  // (Parse above throws if no row could be read, so rowValues[0] exists)
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new List<IList>(columns);
  for (int col = 0; col < columns; col++) {
    values.Add(CreateColumn(col));
  }
  FillColumns();
}

// Creates the empty list for one column; the element type is the type that occurs most often
// within the first 100 non-empty cells of the column (string if all cells are empty).
private IList CreateColumn(int col) {
  // materialized once because the result is inspected twice
  var types = rowValues
    .Select(r => r[col])
    .Where(v => v != null && v as string != string.Empty)
    .Take(100)
    .Select(v => v.GetType())
    .ToList();
  if (types.Count == 0) return new List<string>();

  var columnType = types.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
  if (columnType == typeof(double)) return new List<double>();
  if (columnType == typeof(DateTime)) return new List<DateTime>();
  if (columnType == typeof(string)) return new List<string>();
  throw new InvalidOperationException();
}

// Copies all parsed cells into the typed columns. Cells which do not have the type of their column
// become double.NaN (double columns), DateTime.MinValue (DateTime columns) or their string representation (string columns).
private void FillColumns() {
  foreach (List<object> row in rowValues) {
    int columnIndex = 0;
    foreach (object element in row) {
      IList column = values[columnIndex];
      if (column is List<double> && !(element is double))
        column.Add(double.NaN);
      else if (column is List<DateTime> && !(element is DateTime))
        column.Add(DateTime.MinValue);
      else if (column is List<string> && !(element is string))
        column.Add(element.ToString());
      else
        column.Add(element);
      columnIndex++;
    }
  }
}
202
/// <summary>
/// Opens the file and guesses its number format, datetime format and separator.
/// </summary>
/// <param name="path">file which is inspected</param>
/// <param name="numberFormat">guessed format of numbers</param>
/// <param name="dateTimeFormatInfo">guessed format of datetime</param>
/// <param name="separator">guessed separator</param>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // own the file stream here so that the file handle is released on every path
  using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    DetermineFileFormat(stream, out numberFormat, out dateTimeFormatInfo, out separator);
  }
}
206
/// <summary>
/// Guesses number format, datetime format and separator from a block of up to BUFFER_SIZE
/// characters following the first line of the stream.
/// The reader created for the stream is disposed before the method returns.
/// </summary>
/// <param name="stream">stream which is inspected</param>
/// <param name="numberFormat">guessed format of numbers</param>
/// <param name="dateTimeFormatInfo">guessed format of datetime</param>
/// <param name="separator">guessed separator</param>
public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  using (var reader = new StreamReader(stream)) {
    // the first line is skipped, the following block is the sample
    reader.ReadLine();
    var sample = new char[BUFFER_SIZE];
    int sampleLength = reader.ReadBlock(sample, 0, BUFFER_SIZE);

    // count how often each character occurs in the sample
    var charCounts = new Dictionary<char, int>();
    for (int k = 0; k < sampleLength; k++) {
      int seen;
      charCounts.TryGetValue(sample[k], out seen);
      charCounts[sample[k]] = seen + 1;
    }

    // depending on the characters occurring in the block
    // we distinguish a number of different cases based on the following rules:
    // many points => it must be English number format, the other frequently occurring char is the separator
    // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
    //   => check the line in more detail:
    //            English: 0, 0, 0, 0
    //            German:  0,0 0,0 0,0 ...
    //            => if commas are followed by space => English format
    // no points no commas => English format (only integer numbers) use the other frequently occurring char as separator
    // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
    bool manyPoints = OccurrencesOf(charCounts, '.') > 10;
    bool manyCommas = OccurrencesOf(charCounts, ',') > 10;

    if (!manyPoints && manyCommas) {
      // count the tokens (chains of only digits and commas) that contain more than two commas
      int multiCommaTokens = 0;
      int pos = 0;
      while (pos < sampleLength) {
        int commasInToken = 0;
        for (; pos < sampleLength && (sample[pos] == ',' || Char.IsDigit(sample[pos])); pos++) {
          if (sample[pos] == ',') commasInToken++;
        }
        if (commasInToken > 2) multiCommaTokens++;
        pos++; // step over the character that ended the chain
      }

      if (multiCommaTokens > 1) {
        // English format (only integer values) with ',' as separator
        numberFormat = NumberFormatInfo.InvariantInfo;
        dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
        separator = ',';
      } else {
        // German format (real values), ',' cannot be the separator
        var german = new CultureInfo("de-DE");
        numberFormat = NumberFormatInfo.GetInstance(german);
        dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(new char[] { ',' }));
      }
      return;
    }

    // many points, or neither points nor commas => English format
    numberFormat = NumberFormatInfo.InvariantInfo;
    dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
    separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
  }
}

// Returns the candidate that occurs most often (and more than 10 times) in the sample;
// ties keep the order of the candidates; ' ' is returned if no candidate qualifies.
private static char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderByDescending(c => OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
278
// Returns how often c was counted, or 0 if c does not occur in charCounts.
private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  int count;
  return charCounts.TryGetValue(c, out count) ? count : 0;
}
282
283	#region tokenizer
// Kinds of tokens produced by the tokenizer.
// The order of the members must not change: NewLine is the default value.
internal enum TokenTypeEnum {
  // end of a line
  NewLine,
  // boundary between two fields of a line
  Separator,
  // field that is neither a number nor a date
  String,
  // field that could be parsed as a double
  Double,
  // field that could be parsed as a DateTime
  DateTime
}
287
// Reads a stream line by line and turns each line into a sequence of typed tokens
// (fields, separators between the fields and a terminating NewLine token).
internal class Tokenizer {
  // token that Split() puts between two fields; a field that consists only of this text is read as a separator as well
  private const string INTERNAL_SEPARATOR = "#";

  private readonly StreamReader reader;
  private readonly NumberFormatInfo numberFormatInfo;
  private readonly DateTimeFormatInfo dateTimeFormatInfo;
  private readonly char separator;

  // we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
  private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
  private string[] stringVals = new string[1024];
  private double[] doubleVals = new double[1024];
  private DateTime[] dateTimeVals = new DateTime[1024];
  // index of the next token that is handed out
  private int tokenPos;
  // number of tokens of the current line (including the NewLine token)
  private int numTokens;

  // 1-based number of the line whose tokens are currently buffered (0 as long as no line has been read)
  public int CurrentLineNumber { get; private set; }
  // text of the line whose tokens are currently buffered
  public string CurrentLine { get; private set; }
  // position of the underlying stream after the last read, or an estimate if the stream cannot report its position
  public long BytesRead { get; private set; }

  // Creates the tokenizer and immediately buffers the tokens of the first line.
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    ReadNextTokens();
  }

  // Reads the next line (if there is one) and replaces the buffered tokens with the tokens of that line.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    // keep the line number in sync with the buffered line, it is used in error messages
    CurrentLineNumber++;
    try {
      BytesRead = reader.BaseStream.Position;
    } catch (IOException) {
      BytesRead += CurrentLine.Length + 2; // guess
    } catch (NotSupportedException) {
      BytesRead += CurrentLine.Length + 2; // guess
    }

    int i = 0;
    foreach (var tok in Split(CurrentLine)) {
      var trimmedStr = tok.Trim();
      // empty fields do not produce a token, they show up as two consecutive separators
      if (string.IsNullOrEmpty(trimmedStr)) continue;

      // a token that is neither the separator nor a number nor a datetime value is a string token
      TokenTypeEnum type = TokenTypeEnum.String;
      stringVals[i] = trimmedStr;
      double doubleVal;
      DateTime dateTimeValue;
      if (trimmedStr.Equals(INTERNAL_SEPARATOR)) {
        type = TokenTypeEnum.Separator;
      } else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
        type = TokenTypeEnum.Double;
        doubleVals[i] = doubleVal;
      } else if (DateTime.TryParse(trimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
        type = TokenTypeEnum.DateTime;
        dateTimeVals[i] = dateTimeValue;
      }

      tokenTypes[i] = type;
      i++;

      if (i >= tokenTypes.Length) {
        // increase buffer size if necessary (this also guarantees room for the NewLine token below)
        IncreaseCapacity(ref tokenTypes);
        IncreaseCapacity(ref doubleVals);
        IncreaseCapacity(ref stringVals);
        IncreaseCapacity(ref dateTimeVals);
      }
    }
    tokenTypes[i] = TokenTypeEnum.NewLine;
    numTokens = i + 1;
    tokenPos = 0;
  }

  // Replaces arr with a copy that is 1.7 times as long.
  private static void IncreaseCapacity<T>(ref T[] arr) {
    int n = (int)Math.Floor(arr.Length * 1.7); // guess
    T[] arr2 = new T[n];
    Array.Copy(arr, arr2, arr.Length);
    arr = arr2;
  }

  // Splits a line into its fields and yields INTERNAL_SEPARATOR between two consecutive fields.
  private IEnumerable<string> Split(string line) {
    string[] splitString;
    if (separator == WHITESPACECHAR) {
      // separate at whitespaces; consecutive whitespaces count as one separator
      splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
    } else {
      splitString = line.Split(separator);
    }

    // a blank line has no fields at all when splitting at whitespaces,
    // it must produce no tokens instead of an access to index -1
    if (splitString.Length == 0) yield break;

    for (int i = 0; i < splitString.Length - 1; i++) {
      yield return splitString[i];
      yield return INTERNAL_SEPARATOR;
    }
    // do not return the INTERNAL_SEPARATOR after the last string
    yield return splitString[splitString.Length - 1];
  }

  // Returns the type of the next token without consuming it.
  public TokenTypeEnum PeekType() {
    return tokenTypes[tokenPos];
  }

  // Consumes one token without returning its values; the next line is buffered once all tokens of the current line are consumed.
  public void Skip() {
    tokenPos++;
    if (numTokens == tokenPos) {
      ReadNextTokens();
    }
  }

  // Returns type and values of the next token and consumes it.
  // Only the value that belongs to the returned type is meaningful, the other values may stem from earlier lines.
  public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
    type = tokenTypes[tokenPos];
    strVal = stringVals[tokenPos];
    dblVal = doubleVals[tokenPos];
    dateTimeVal = dateTimeVals[tokenPos];

    Skip();
  }

  // True as long as buffered tokens are left or the reader has not reached the end of the stream.
  public bool HasNext() {
    return numTokens > tokenPos || !reader.EndOfStream;
  }
}
423	#endregion
424
425	#region parsing
// Parses the optional header line and then the data rows; throws a DataFormatException if no data row is found.
// lineLimit = -1 means no limit
private void Parse(bool columnNamesInFirstLine, int lineLimit = -1) {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  if (columnNamesInFirstLine) {
    ParseVariableNames();
    // nothing left after the header line
    if (!tokenizer.HasNext()) {
      Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
    }
  }

  ParseValues(lineLimit);
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
437
// Reads data rows into rowValues until the input ends or lineLimit lines were read (a negative lineLimit means no limit).
// Empty lines are counted but do not produce a row. The number of bytes read is reported after each line.
// Throws a DataFormatException if a row has a different number of values than the first row.
private void ParseValues(int lineLimit = -1) {
  int nLinesParsed = 0;
  while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
    if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
      // empty line
      tokenizer.Skip();
    } else {
      // remember the line number now: consuming the NewLine token below makes the tokenizer
      // read the following line, so afterwards CurrentLineNumber no longer refers to this row
      int rowLineNumber = tokenizer.CurrentLineNumber;

      List<object> row = new List<object>();
      row.Add(NextValue(tokenizer));
      while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
        ExpectType(TokenTypeEnum.Separator);
        row.Add(NextValue(tokenizer));
      }
      ExpectType(TokenTypeEnum.NewLine);

      // all rows have to have the same number of values
      // the first row defines how many samples are needed
      if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
        Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
              "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "",
              rowLineNumber);
      }
      rowValues.Add(row);
    }
    nLinesParsed++;

    OnReport(tokenizer.BytesRead);
  }
}
467
// Returns the value of the next field: a string, a double or a DateTime.
// If the next token is a separator or the end of the line, the field is empty:
// string.Empty is returned and no token is consumed.
private object NextValue(Tokenizer tokenizer) {
  TokenTypeEnum upcoming = tokenizer.PeekType();
  if (upcoming == TokenTypeEnum.Separator || upcoming == TokenTypeEnum.NewLine) {
    return string.Empty;
  }

  TokenTypeEnum tokenType;
  string text;
  double number;
  DateTime date;
  tokenizer.Next(out tokenType, out text, out number, out date);

  if (tokenType == TokenTypeEnum.Separator) return double.NaN;
  if (tokenType == TokenTypeEnum.String) return text;
  if (tokenType == TokenTypeEnum.Double) return number;
  if (tokenType == TokenTypeEnum.DateTime) return date;

  // found an unexpected token => throw error
  Error("Unexpected token.", text, tokenizer.CurrentLineNumber);
  // this line is never executed because Error() throws an exception
  throw new InvalidOperationException();
}
487
// Reads the variable names from the current line (separated names, terminated by the end of the line)
// and stores them in variableNames. Throws an ArgumentException if the line does not have this form.
private void ParseVariableNames() {
  var parsedNames = new List<string>();

  TokenTypeEnum tokenType;
  string text;
  double ignoredNumber;
  DateTime ignoredDate;

  // the first token must be a variable name
  tokenizer.Next(out tokenType, out text, out ignoredNumber, out ignoredDate);
  if (tokenType != TokenTypeEnum.String) {
    throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + tokenType);
  }
  parsedNames.Add(text);

  // every further name is preceded by a separator
  for (; ; ) {
    if (!tokenizer.HasNext() || tokenizer.PeekType() != TokenTypeEnum.Separator) break;
    ExpectType(TokenTypeEnum.Separator);
    tokenizer.Next(out tokenType, out text, out ignoredNumber, out ignoredDate);
    parsedNames.Add(text);
  }
  ExpectType(TokenTypeEnum.NewLine);

  variableNames = parsedNames;
}
513
// Consumes the next token if it has the expected type, otherwise throws an ArgumentException.
private void ExpectType(TokenTypeEnum expectedToken) {
  TokenTypeEnum actual = tokenizer.PeekType();
  if (actual == expectedToken) {
    tokenizer.Skip();
    return;
  }
  throw new ArgumentException("Error: Expected " + expectedToken + " got " + actual);
}
519
// Always throws a DataFormatException for the given message, token and line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
523	#endregion
524
/// <summary>
/// Thrown when the input cannot be parsed; carries the offending token and the line number.
/// </summary>
[Serializable]
public class DataFormatException : Exception {
  /// <summary>Line number given to the constructor.</summary>
  public int Line { get; private set; }

  /// <summary>Token given to the constructor.</summary>
  public string Token { get; private set; }

  /// <summary>
  /// Creates the exception; token and line are appended to the message.
  /// </summary>
  public DataFormatException(string message, string token, int line)
    : base(ComposeMessage(message, token, line)) {
    Token = token;
    Line = line;
  }

  /// <summary>
  /// Deserialization constructor; only the state of the base class is restored.
  /// </summary>
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
  }

  // Builds the text that is passed to the base class.
  private static string ComposeMessage(string message, string token, int line) {
    return message + "\nToken: " + token + " (line: " + line + ")";
  }
}
543	}
544	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences