Context Navigation

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 5680

Visit:

Last change on this file since 5680 was 5445, checked in by swagner, 14 years ago
Updated year of copyrights (#1406)
File size: 13.5 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using System.Linq;
27	using System.Text;
28
29	namespace HeuristicLab.Problems.DataAnalysis {
30	public class TableFileParser {
/// <summary>Number of characters sampled from the file when its format is detected.</summary>
private const int BUFFER_SIZE = 1024;
/// <summary>
/// Candidate column separators checked during format detection.
/// The array is constant data, so it is shared by all parser instances.
/// </summary>
private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
/// <summary>Tokenizer over the file that is currently being parsed.</summary>
private Tokenizer tokenizer;
/// <summary>Rows collected by the parser before they are copied into the value matrix.</summary>
private List<List<double>> rowValues;
35
/// <summary>Number of rows; assigned from the parsed row list at the end of Parse(string).</summary>
private int rows;
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
41
/// <summary>Number of columns; assigned from the first parsed row at the end of Parse(string).</summary>
private int columns;
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}
47
/// <summary>Parsed values indexed as [row, column]; allocated by Parse(string).</summary>
private double[,] values;
public double[,] Values {
  get { return this.values; }
}
54
/// <summary>
/// Names of the columns. When no names have been stored, one placeholder name
/// per column is generated: "X000", "X001", ...
/// </summary>
private List<string> variableNames;
public IEnumerable<string> VariableNames {
  get {
    if (variableNames.Count > 0) {
      return variableNames;
    }

    // no stored names => build one placeholder per column
    string[] generatedNames = new string[columns];
    for (int column = 0; column < generatedNames.Length; column++) {
      generatedNames[column] = "X" + column.ToString("000");
    }
    return generatedNames;
  }
}
68
/// <summary>Creates a parser with an empty row list and an empty variable name list.</summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<double>>();
}
73
/// <summary>
/// Parses the given file and stores the result in <see cref="Values"/>,
/// <see cref="Rows"/>, <see cref="Columns"/> and <see cref="VariableNames"/>.
/// Number format and column separator are detected from the file content.
/// </summary>
/// <param name="fileName">Path of the file to parse.</param>
public void Parse(string fileName) {
  // Drop the results of a previous call so that a parser instance can be reused;
  // otherwise the rows of the new file would be appended to the old ones.
  // The name list is replaced (not cleared) because the old list may have been handed out via VariableNames.
  rowValues.Clear();
  variableNames = new List<string>();

  NumberFormatInfo numberFormat;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out separator);
  using (StreamReader reader = new StreamReader(fileName)) {
    tokenizer = new Tokenizer(reader, numberFormat, separator);
    // parse the file
    Parse();
  }

  // translate the list of rows into the value matrix
  // (Parse() fails if no row was read, and ParseValues() enforces equal row lengths)
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new double[rows, columns];
  for (int rowIndex = 0; rowIndex < rows; rowIndex++) {
    List<double> row = rowValues[rowIndex];
    for (int columnIndex = 0; columnIndex < columns; columnIndex++) {
      values[rowIndex, columnIndex] = row[columnIndex];
    }
  }
}
99
/// <summary>
/// Guesses the number format and the column separator of a file by counting
/// characters in a block of up to BUFFER_SIZE characters that follows the first line.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <param name="numberFormat">Receives the invariant format or the de-DE format.</param>
/// <param name="separator">Receives the detected separator; ' ' if no candidate is frequent enough.</param>
private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
  // a character must occur more often than this in the sample to be taken into account
  const int frequencyThreshold = 10;

  char[] buffer = new char[BUFFER_SIZE];
  int charsRead;
  using (StreamReader reader = new StreamReader(fileName)) {
    // skip first line
    reader.ReadLine();
    // read a block
    charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
  }

  // count frequency of special characters
  Dictionary<char, int> charCounts = buffer.Take(charsRead)
    .GroupBy(c => c)
    .ToDictionary(g => g.Key, g => g.Count());

  // Depending on the characters occurring in the block
  // we distinguish a number of different cases based on the following rules:
  // many points => it must be English number format, the other frequently occurring char is the separator
  // no points but many commas => this is the problematic case. Either German format (real numbers)
  //   or English format (only integer numbers) with ',' as separator
  //   => check the block in more detail:
  //      English: 0, 0, 0, 0
  //      German:  0,0 0,0 0,0 ...
  //   => if commas are followed by a non-digit (e.g. a space) => English format
  // no points and no commas => English format (only integer numbers), use the other frequently occurring char as separator
  // In all cases ' ' is only used as separator if no other separator is possible
  // (spaces can also occur in addition to separators).
  if (OccurrencesOf(charCounts, '.') > frequencyThreshold) {
    numberFormat = NumberFormatInfo.InvariantInfo;
    separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS, frequencyThreshold);
  } else if (OccurrencesOf(charCounts, ',') > frequencyThreshold) {
    // no points and many commas
    int countCommaNonDigitPairs = 0;
    for (int i = 0; i < charsRead - 1; i++) {
      if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
        countCommaNonDigitPairs++;
      }
    }
    if (countCommaNonDigitPairs > frequencyThreshold) {
      // English format (only integer values) with ',' as separator
      numberFormat = NumberFormatInfo.InvariantInfo;
      separator = ',';
    } else {
      // German format (real values); ',' is the decimal mark and therefore cannot be the separator
      char[] disallowedSeparators = new char[] { ',' };
      numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(disallowedSeparators), frequencyThreshold);
    }
  } else {
    // no points and no commas => English format
    numberFormat = NumberFormatInfo.InvariantInfo;
    separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS, frequencyThreshold);
  }
}

/// <summary>
/// Returns the candidate that occurs most often in the sample, considering only candidates
/// that occur more than <paramref name="threshold"/> times; returns ' ' if there is none.
/// Ties keep the order of <paramref name="candidates"/> because the sort is stable.
/// </summary>
private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates, int threshold) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > threshold)
    .OrderByDescending(c => OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
163
/// <summary>
/// Returns how often <paramref name="c"/> was counted, or 0 if it has no entry.
/// </summary>
/// <param name="charCounts">Character frequency table.</param>
/// <param name="c">Character to look up.</param>
private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  // single lookup instead of ContainsKey followed by the indexer
  int count;
  return charCounts.TryGetValue(c, out count) ? count : 0;
}
167
168	#region tokenizer
/// <summary>Kinds of tokens produced by the tokenizer.</summary>
internal enum TokenTypeEnum {
  // end of an input line
  NewLine,
  // column separator
  Separator,
  // text that could not be parsed as a number
  String,
  // text that was parsed as a floating point number
  Double
}
172
/// <summary>
/// A single token: its kind, its text and, for numeric tokens, its parsed value.
/// </summary>
internal class Token {
  public TokenTypeEnum type;
  public string stringValue;
  // stays 0.0 unless a caller stores a parsed number here
  public double doubleValue = 0.0;

  /// <summary>Creates a token of the given kind with the given text.</summary>
  public Token(TokenTypeEnum type, string value) {
    this.stringValue = value;
    this.type = type;
  }

  /// <summary>Returns the token text.</summary>
  public override string ToString() {
    return this.stringValue;
  }
}
188
189
/// <summary>
/// Splits the lines of a reader into tokens, one line at a time.
/// Each line yields its non-empty fields (as Double or String tokens) interleaved
/// with the shared <see cref="SeparatorToken"/> and is terminated by the shared
/// <see cref="NewlineToken"/>.
/// </summary>
internal class Tokenizer {
  // text carried by the shared separator token (only used for display in messages)
  private const string INTERNAL_SEPARATOR = "#";

  private StreamReader reader;
  // FIFO of pending tokens; a queue keeps Next() O(1) instead of shifting a list on every call
  private Queue<Token> tokens;
  private NumberFormatInfo numberFormatInfo;
  private char separator;

  private int currentLineNumber = 0;
  /// <summary>Number of lines read from the reader so far.</summary>
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  /// <summary>Text of the line that was read most recently.</summary>
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  /// <summary>The single token instance that marks the end of every line.</summary>
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  /// <summary>The single token instance that represents every column separator.</summary>
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  /// <summary>Creates the tokenizer and immediately tokenizes the first line, if there is one.</summary>
  /// <param name="reader">Source of the text lines.</param>
  /// <param name="numberFormatInfo">Number format used to recognize numeric fields.</param>
  /// <param name="separator">Character that separates fields within a line.</param>
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
    ReadNextTokens();
  }

  // Reads the next line (if any) and appends its tokens followed by the newline token.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    StringBuilder field = new StringBuilder();
    foreach (char c in CurrentLine) {
      if (c == separator) {
        EnqueueField(field);
        // The separator token is enqueued directly instead of passing a marker string through
        // MakeToken, so a field whose text happens to be "#" is not mistaken for a separator.
        tokens.Enqueue(SeparatorToken);
      } else {
        field.Append(c);
      }
    }
    EnqueueField(field);
    tokens.Enqueue(NewlineToken);
    CurrentLineNumber++;
  }

  // Turns the collected characters into a token unless they are empty after trimming, then resets the builder.
  private void EnqueueField(StringBuilder field) {
    string text = field.ToString().Trim();
    field.Length = 0;
    if (!string.IsNullOrEmpty(text)) {
      tokens.Enqueue(MakeToken(text));
    }
  }

  // Creates a Double token if the text parses as a number in the configured format, otherwise a String token.
  private Token MakeToken(string strToken) {
    Token token = new Token(TokenTypeEnum.String, strToken);
    if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    }
    return token;
  }

  /// <summary>Returns the next token without consuming it.</summary>
  /// <exception cref="InvalidOperationException">No token is pending (see <see cref="HasNext"/>).</exception>
  public Token Peek() {
    return tokens.Peek();
  }

  /// <summary>
  /// Consumes and returns the next token. When this was the last pending token,
  /// the following line is read right away, which advances <see cref="CurrentLineNumber"/>.
  /// </summary>
  /// <exception cref="InvalidOperationException">No token is pending (see <see cref="HasNext"/>).</exception>
  public Token Next() {
    Token next = tokens.Dequeue();
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  /// <summary>Returns true if a token is pending or the reader has more input.</summary>
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
288	#endregion
289
290	#region parsing
// Parses an optional header line followed by the data rows of the current tokenizer.
// Fails via Error() if nothing is left after the header or if no data row could be read.
private void Parse() {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  if (!tokenizer.HasNext()) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
297
// Reads data rows until the tokenizer is exhausted and appends them to rowValues.
// Every row must have as many values as the first one; otherwise Error() is raised.
private void ParseValues() {
  while (tokenizer.HasNext()) {
    // Remember which line this row comes from: consuming the row's newline token makes the
    // tokenizer read the following line, after which CurrentLineNumber no longer refers to this row.
    int lineNumber = tokenizer.CurrentLineNumber;

    List<double> row = new List<double>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);
    // all rows have to have the same number of values
    // the first row defines how many values are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
        "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
    }
    // add the current row to the collection of rows
    rowValues.Add(row);
  }
}
318
// Returns the value of the next field.
// An empty field (the next token is a separator or a newline) yields NaN without consuming anything;
// a String token is consumed and yields NaN; a Double token is consumed and yields its value.
private double NextValue(Tokenizer tokenizer) {
  Token upcoming = tokenizer.Peek();
  if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return double.NaN;

  Token current = tokenizer.Next();
  switch (current.type) {
    case TokenTypeEnum.Double:
      // just take the value
      return current.doubleValue;
    case TokenTypeEnum.Separator:
    case TokenTypeEnum.String:
      return double.NaN;
    default:
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
  }
}
333
// Consumes the first line as a header and stores its fields as variable names,
// but only if that line does not start with a numeric token.
private void ParseVariableNames() {
  // nothing to read, or the first line already holds data => there is no header
  if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) {
    return;
  }

  List<Token> headerTokens = new List<Token>();
  Token current = tokenizer.Next();
  headerTokens.Add(current);
  while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    current = tokenizer.Next();
    // a separator directly followed by the end of the line contributes no name
    if (current != tokenizer.NewlineToken) {
      headerTokens.Add(current);
    }
  }
  // the newline may already have been consumed by the loop above
  if (current != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = headerTokens.Select(t => t.stringValue.Trim()).ToList();
}
356
// Consumes the next token and raises a parse error unless it is the given token instance.
private void Expect(Token expectedToken) {
  Token consumed = tokenizer.Next();
  if (consumed == expectedToken) {
    return;
  }
  Error("Expected: " + expectedToken, consumed.stringValue, tokenizer.CurrentLineNumber);
}
363
// Always throws a DataFormatException carrying the message, the offending token text and the line number.
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
367	#endregion
368	}
369	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences