Context Navigation

source: branches/HiveHiveEngine/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 7928

Visit:

Last change on this file since 7928 was 7259, checked in by swagner, 13 years ago
Updated year of copyrights to 2012 (#1716)
File size: 16.4 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections;
24	using System.Collections.Generic;
25	using System.Globalization;
26	using System.IO;
27	using System.Linq;
28	using System.Runtime.Serialization;
29	using System.Text;
30
31	namespace HeuristicLab.Problems.DataAnalysis {
32	public class TableFileParser {
// Number of characters sampled (after the first line) when guessing the file format.
private const int BUFFER_SIZE = 1024;
// Candidate column separators; a blank is only chosen when none of these occurs often enough.
private readonly char[] POSSIBLE_SEPARATORS = new[] { ',', ';', '\t' };
// Tokenizer for the file that is currently being parsed; assigned in Parse(string).
private Tokenizer tokenizer;
// Raw cell values, collected row by row before they are converted into typed columns.
private List<List<object>> rowValues;
37
private int rows;
/// <summary>
/// Number of data rows. Parse(string) sets this to the number of rows it read.
/// </summary>
public int Rows {
  get { return this.rows; }
  set { this.rows = value; }
}
43
private int columns;
/// <summary>
/// Number of columns. Parse(string) sets this to the number of values in the first data row.
/// </summary>
public int Columns {
  get { return this.columns; }
  set { this.columns = value; }
}
49
private List<IList> values;
/// <summary>
/// The parsed data, one list per column. The backing field is only assigned by
/// Parse(string), so this is null until a file has been parsed.
/// </summary>
public List<IList> Values {
  get { return this.values; }
}
56
private List<string> variableNames;
/// <summary>
/// The column names. If no names are stored, generic names
/// "X000", "X001", ... are generated, one per column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) {
      return variableNames;
    }

    // no stored names: synthesize one name per column
    string[] generated = new string[columns];
    int index = 0;
    while (index < generated.Length) {
      generated[index] = "X" + index.ToString("000");
      index++;
    }
    return generated;
  }
}
70
/// <summary>
/// Creates a parser with no parsed rows and no variable names.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<object>>();
}
75
/// <summary>
/// Parses the given table file. Afterwards Rows, Columns, Values and VariableNames
/// describe the content of that file.
/// </summary>
/// <param name="fileName">Path of the file to parse.</param>
public void Parse(string fileName) {
  // Start from a clean state so that one parser instance can be used for several files;
  // otherwise rows (and possibly header names) of an earlier file would leak into the result.
  rowValues.Clear();
  variableNames = new List<string>();

  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  using (StreamReader reader = new StreamReader(fileName)) {
    tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    // parse the file; throws if no data row could be read, so rowValues[0] is safe below
    Parse();
  }

  // translate the row-wise raw values into typed column lists
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new List<IList>(columns);

  // create columns
  for (int col = 0; col < columns; col++) {
    values.Add(CreateColumnList(col));
  }

  // fill with values; cells that do not match the column type are replaced by a placeholder
  foreach (List<object> row in rowValues) {
    for (int columnIndex = 0; columnIndex < row.Count; columnIndex++) {
      object element = row[columnIndex];
      IList column = values[columnIndex];
      if (column is List<double> && !(element is double))
        column.Add(double.NaN);
      else if (column is List<DateTime> && !(element is DateTime))
        column.Add(DateTime.MinValue);
      else if (column is List<string> && !(element is string))
        column.Add(string.Empty);
      else
        column.Add(element);
    }
  }
}

// Creates the empty, typed list for one column. The element type is the most frequent
// type among the first ten non-empty cells of that column; columns without any
// non-empty cell become string columns.
private IList CreateColumnList(int col) {
  // materialize once: the query is inspected twice below
  List<Type> types = rowValues
    .Select(r => r[col])
    .Where(v => v != null && v as string != string.Empty)
    .Take(10)
    .Select(v => v.GetType())
    .ToList();
  if (types.Count == 0) return new List<string>();

  Type columnType = types.GroupBy(t => t).OrderBy(g => g.Count()).Last().Key;
  if (columnType == typeof(double)) return new List<double>();
  if (columnType == typeof(DateTime)) return new List<DateTime>();
  if (columnType == typeof(string)) return new List<string>();
  throw new InvalidOperationException();
}
125
/// <summary>
/// Guesses number format, date format and column separator of a file from a sample
/// of at most BUFFER_SIZE characters taken after the first line.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <param name="numberFormat">Receives the number format to use for parsing.</param>
/// <param name="dateTimeFormatInfo">Receives the date/time format to use for parsing.</param>
/// <param name="separator">Receives the column separator character.</param>
private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  using (StreamReader reader = new StreamReader(fileName)) {
    // skip first line
    reader.ReadLine();
    // read a block
    char[] buffer = new char[BUFFER_SIZE];
    int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
    // count frequency of all characters in the block
    Dictionary<char, int> charCounts = buffer.Take(charsRead)
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());

    // Depending on the characters occurring in the block we distinguish these cases:
    //  - many points => English number format; the most frequent candidate is the separator
    //  - no points but many commas => the problematic case: either German format (real numbers)
    //    or English format (only integer numbers) with ',' as separator
    //    => look at runs of digits and commas, see CountTokensWithManyCommas
    //  - no points and no commas => English format (only integer numbers)
    // In all cases ' ' is only used as separator if no other candidate occurs often enough
    // (spaces can also occur in addition to separators).

    // English (invariant) format is the default; only the German branch overrides it
    numberFormat = NumberFormatInfo.InvariantInfo;
    dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;

    if (OccurrencesOf(charCounts, '.') > 10) {
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    } else if (OccurrencesOf(charCounts, ',') > 10) {
      // no points and many commas
      if (CountTokensWithManyCommas(buffer, charsRead) > 1) {
        // English format (only integer values) with ',' as separator
        separator = ',';
      } else {
        // German format (real values); ',' is the decimal mark and cannot be the separator
        CultureInfo german = new CultureInfo("de-DE");
        numberFormat = NumberFormatInfo.GetInstance(german);
        dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(german);
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(new char[] { ',' }));
      }
    } else {
      // no points and no commas => English format
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    }
  }
}

// Returns the candidate that occurs most often in the sample, considering only candidates
// that occur more than ten times; falls back to ' ' if there is none.
private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderBy(c => -OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}

// Counts the runs of consecutive digits and commas in the sample that contain more than
// two commas. A single German decimal number contains one comma, so such runs indicate
// comma-separated integers.
// NOTE(review): any other character (including a blank) ends a run, so values written as
// "0, 0, 0" never produce such a run and the caller takes the German branch - confirm
// that this is intended.
private static int CountTokensWithManyCommas(char[] buffer, int charsRead) {
  int tokensWithManyCommas = 0;
  for (int i = 0; i < charsRead; i++) {
    int nCommas = 0;
    while (i < charsRead && (buffer[i] == ',' || Char.IsDigit(buffer[i]))) {
      if (buffer[i] == ',') nCommas++;
      i++;
    }
    if (nCommas > 2) tokensWithManyCommas++;
  }
  return tokensWithManyCommas;
}
197
/// <summary>
/// Returns how often <paramref name="c"/> was counted, or 0 if it has no entry.
/// </summary>
/// <param name="charCounts">Character frequencies.</param>
/// <param name="c">The character to look up.</param>
/// <returns>The stored count, or 0 for characters without an entry.</returns>
private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  // single lookup instead of ContainsKey followed by the indexer
  int count;
  return charCounts.TryGetValue(c, out count) ? count : 0;
}
201
202	#region tokenizer
// Kinds of tokens produced by the tokenizer.
internal enum TokenTypeEnum {
  // end of a line
  NewLine,
  // column separator
  Separator,
  // text that is neither a number nor a date
  String,
  // text that could be parsed as a double
  Double,
  // text that could be parsed as a DateTime
  DateTime
}
206
// A single token of the input together with its parsed values.
// The members are public fields because the tokenizer passes them as out arguments.
internal class Token {
  public TokenTypeEnum type;
  // the text of the token
  public string stringValue;
  // only meaningful for tokens of type Double
  public double doubleValue = 0.0;
  // only meaningful for tokens of type DateTime
  public DateTime dateTimeValue = DateTime.MinValue;

  // Creates a token of the given type with the given text.
  public Token(TokenTypeEnum type, string value) {
    this.stringValue = value;
    this.type = type;
  }

  // Returns the text of the token.
  public override string ToString() {
    return this.stringValue;
  }
}
224
225
// Splits the input line by line into value, separator and newline tokens.
// Only the tokens of one line are buffered; the next line is read as soon as the
// buffer runs empty, so CurrentLineNumber and CurrentLine always describe the line
// most recently read.
internal class Tokenizer {
  private StreamReader reader;
  // a queue, so that consuming a token does not shift all remaining tokens of the line
  private Queue<Token> tokens;
  private NumberFormatInfo numberFormatInfo;
  private DateTimeFormatInfo dateTimeFormatInfo;
  private char separator;
  // text of the separator token; only used for display (e.g. in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private int currentLineNumber = 0;
  // 1-based number of the line most recently read from the reader (0 if nothing was read)
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  // text of the line most recently read from the reader
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  // the one token instance that marks every end of line; compare by reference
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  // the one token instance that marks every separator; compare by reference
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  // Creates a tokenizer on the given reader and immediately tokenizes the first line.
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.dateTimeFormatInfo = dateTimeFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
    ReadNextTokens();
  }

  // Reads the next line (if any) and appends its tokens followed by the newline token.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    StringBuilder field = new StringBuilder();
    foreach (char c in CurrentLine) {
      if (c == separator) {
        EnqueueField(field);
        // separators are emitted as tokens directly, so a cell that contains
        // the text "#" can no longer be mistaken for a separator
        tokens.Enqueue(SeparatorToken);
      } else {
        field.Append(c);
      }
    }
    EnqueueField(field);
    tokens.Enqueue(NewlineToken);
    CurrentLineNumber++;
  }

  // Appends a value token for the collected characters and resets the builder.
  // Fields that are empty after trimming do not produce a token.
  private void EnqueueField(StringBuilder field) {
    string text = field.ToString().Trim();
    field.Length = 0;
    if (text.Length > 0) tokens.Enqueue(MakeToken(text));
  }

  // Creates a Double, DateTime or String token, trying the types in that order.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    } else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
      token.type = TokenTypeEnum.DateTime;
    }
    // otherwise the text is neither a number nor a date and the token stays a string token
    return token;
  }

  // Returns the next token without consuming it. Must only be called if HasNext() is true.
  public Token Peek() {
    return tokens.Peek();
  }

  // Consumes and returns the next token. Must only be called if HasNext() is true.
  // When the last buffered token is consumed the next line is read right away.
  public Token Next() {
    Token next = tokens.Dequeue();
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  // True if there is a buffered token or more input to read.
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
329	#endregion
330
331	#region parsing
// Parses the optional header line and then all data rows from the current tokenizer.
// Raises a parse error if there is nothing left after the header or if no data row was read.
private void Parse() {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  if (!tokenizer.HasNext()) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
338
// Reads all remaining lines as data rows and appends them to rowValues.
// Lines without any token are skipped. Raises a parse error if a row does not have
// the same number of values as the first row.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    if (tokenizer.Peek() == tokenizer.NewlineToken) {
      tokenizer.Next();
      continue;
    }

    // Remember the line number before reading the row: consuming the newline token at
    // the end of the row makes the tokenizer read the next line and advance its counter.
    int rowLineNumber = tokenizer.CurrentLineNumber;

    List<object> row = new List<object>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);

    // all rows have to have the same number of values
    // the first row defines how many values are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
            "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "",
            rowLineNumber);
    }
    rowValues.Add(row);
  }
}
363
// Returns the value of the next cell. If the next token is a separator or a newline the
// cell is empty: an empty string is returned and no token is consumed. Otherwise the
// token is consumed and its string, double or DateTime value is returned.
private object NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) {
    return string.Empty;
  }

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Separator:
      return double.NaN;
    case TokenTypeEnum.String:
      return current.stringValue;
    case TokenTypeEnum.Double:
      return current.doubleValue;
    case TokenTypeEnum.DateTime:
      return current.dateTimeValue;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
381
// Reads the header line with the variable names, if there is one.
// A first line that starts with a number is treated as data and left untouched.
private void ParseVariableNames() {
  // empty input: there is nothing to peek at; the caller reports the missing data
  if (!tokenizer.HasNext()) return;

  // if first token is double no variables names are given
  if (tokenizer.Peek().type == TokenTypeEnum.Double) return;

  // the first line must contain variable names
  List<Token> tokens = new List<Token>();
  Token valueToken = tokenizer.Next();
  if (valueToken == tokenizer.NewlineToken) {
    // blank first line: it holds no names, and the newline must not become a name
    variableNames = new List<string>();
    return;
  }
  tokens.Add(valueToken);
  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    valueToken = tokenizer.Next();
    // a separator at the end of the line is directly followed by the newline token
    if (valueToken != tokenizer.NewlineToken) {
      tokens.Add(valueToken);
    }
  }
  if (valueToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
}
403
// Consumes the next token and raises a parse error unless it is the expected token.
// Tokens are compared by reference.
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) {
    return;
  }
  Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
410
// Always throws a DataFormatException that carries the message, the offending token
// and the line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
414	#endregion
415
// Exception raised for malformed input; carries the offending token and the line number.
[Serializable]
private class DataFormatException : Exception {
  private readonly int line;
  private readonly string token;

  // line number that was passed to the constructor
  public int Line {
    get { return this.line; }
  }

  // token text that was passed to the constructor
  public string Token {
    get { return this.token; }
  }

  // Creates the exception; token and line number are appended to the message.
  public DataFormatException(string message, string token, int line)
    : base(Describe(message, token, line)) {
    this.line = line;
    this.token = token;
  }

  // Serialization constructor; only the state of the base class is restored.
  public DataFormatException(SerializationInfo info, StreamingContext context)
    : base(info, context) {
  }

  // Builds the full exception message.
  private static string Describe(string message, string token, int line) {
    return message + "\nToken: " + token + " (line: " + line + ")";
  }
}
434	}
435	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences