Context Navigation

source: branches/ParallelEngine/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 5177

Visit:

Last change on this file since 5177 was 5096, checked in by gkronber, 14 years ago
Minor change in CSV importer to address a bug report. #1173
File size: 13.6 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using System.Linq;
27	using System.Text;
28
29	namespace HeuristicLab.Problems.DataAnalysis {
/// <summary>
/// Parses a delimited text file (e.g. CSV) into a rectangular matrix of double values.
/// The number format (invariant/English or German) and the column separator are guessed
/// from a sample of the file content. An optional first line with variable names is supported.
/// Values that are missing or cannot be parsed as numbers are stored as <see cref="double.NaN"/>.
/// </summary>
public class TableFileParser {
  // number of characters sampled (after the first line) to guess the file format
  private const int BUFFER_SIZE = 1024;
  // a character must occur more often than this in the sample to be considered significant
  private const int FREQUENCY_THRESHOLD = 10;
  // separator candidates in order of preference; ' ' is only used when none of these qualifies
  private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };

  private Tokenizer tokenizer;
  private List<string> variableNames;
  private readonly List<List<double>> rowValues;

  private int rows;
  /// <summary>Number of data rows of the most recently parsed file.</summary>
  public int Rows {
    get { return rows; }
    set { rows = value; }
  }

  private int columns;
  /// <summary>Number of columns of the most recently parsed file (taken from its first data row).</summary>
  public int Columns {
    get { return columns; }
    set { columns = value; }
  }

  private double[,] values;
  /// <summary>Parsed values indexed as [row, column]; null until a file has been parsed successfully.</summary>
  public double[,] Values {
    get { return values; }
  }

  /// <summary>
  /// Variable names read from the first line of the file. If the file did not contain any names,
  /// generated names "X000", "X001", ... (one per column) are returned instead.
  /// </summary>
  public IEnumerable<string> VariableNames {
    get {
      if (variableNames.Count > 0) return variableNames;

      string[] names = new string[columns];
      for (int i = 0; i < names.Length; i++) {
        names[i] = "X" + i.ToString("000");
      }
      return names;
    }
  }

  /// <summary>Creates an empty parser; call <see cref="Parse(string)"/> to load a file.</summary>
  public TableFileParser() {
    rowValues = new List<List<double>>();
    variableNames = new List<string>();
  }

  /// <summary>Discards the rows and variable names collected by a previous parse.</summary>
  private void Reset() {
    // a new list is used (instead of Clear) because the VariableNames getter hands out the list instance itself
    variableNames = new List<string>();
    rowValues.Clear();
  }

  /// <summary>
  /// Parses the given file and fills <see cref="Rows"/>, <see cref="Columns"/>,
  /// <see cref="Values"/> and <see cref="VariableNames"/>.
  /// </summary>
  /// <param name="fileName">Path of the file to read.</param>
  /// <exception cref="DataFormatException">
  /// Thrown when no data values are found or when a row has a different number of columns than the first data row.
  /// </exception>
  public void Parse(string fileName) {
    // start from a clean state so that the same parser instance can be used for several files
    Reset();

    NumberFormatInfo numberFormat;
    char separator;
    DetermineFileFormat(fileName, out numberFormat, out separator);
    using (StreamReader reader = new StreamReader(fileName)) {
      tokenizer = new Tokenizer(reader, numberFormat, separator);
      // parse the file
      Parse();
    }

    // translate the list of rows into a rectangular matrix
    // (Parse() guarantees at least one row, ParseValues() guarantees equal row lengths)
    rows = rowValues.Count;
    columns = rowValues[0].Count;
    values = new double[rows, columns];
    for (int r = 0; r < rows; r++) {
      List<double> row = rowValues[r];
      for (int c = 0; c < columns; c++) {
        values[r, c] = row[c];
      }
    }
  }

  /// <summary>
  /// Guesses number format and column separator from the first <c>BUFFER_SIZE</c> characters
  /// following the first line of the file.
  /// </summary>
  /// <param name="fileName">Path of the file to inspect.</param>
  /// <param name="numberFormat">Receives the detected number format (invariant or de-DE).</param>
  /// <param name="separator">Receives the detected separator character.</param>
  private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
    using (StreamReader reader = new StreamReader(fileName)) {
      // skip first line (it might contain variable names)
      reader.ReadLine();
      // read a block
      char[] buffer = new char[BUFFER_SIZE];
      int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
      // count frequency of each character in the block
      Dictionary<char, int> charCounts = buffer.Take(charsRead)
        .GroupBy(c => c)
        .ToDictionary(g => g.Key, g => g.Count());

      // depending on the characters occurring in the block
      // we distinguish a number of different cases based on the following rules:
      // many points => it must be English number format, the other frequently occurring char is the separator
      // no points but many commas => this is the problematic case. Either German format (real numbers)
      //                              or English format (only integer numbers) with ',' as separator
      //   => check the block in more detail:
      //            English: 0, 0, 0, 0
      //            German:  0,0 0,0 0,0 ...
      //            => if commas are followed by a non-digit (e.g. space) => English format
      // no points no commas => English format (only integer numbers), use the other frequently occurring char as separator
      // in all cases only treat ' ' as separator if no other separator is possible
      // (spaces can also occur additionally to separators)
      if (OccurrencesOf(charCounts, '.') > FREQUENCY_THRESHOLD) {
        numberFormat = NumberFormatInfo.InvariantInfo;
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
      } else if (OccurrencesOf(charCounts, ',') > FREQUENCY_THRESHOLD) {
        // no points and many commas
        int countCommaNonDigitPairs = 0;
        for (int i = 0; i < charsRead - 1; i++) {
          if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
            countCommaNonDigitPairs++;
          }
        }
        if (countCommaNonDigitPairs > FREQUENCY_THRESHOLD) {
          // English format (only integer values) with ',' as separator
          numberFormat = NumberFormatInfo.InvariantInfo;
          separator = ',';
        } else {
          // German format (real values); ',' is the decimal point and therefore cannot be the separator
          numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
          separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Where(c => c != ','));
        }
      } else {
        // no points and no commas => English format
        numberFormat = NumberFormatInfo.InvariantInfo;
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
      }
    }
  }

  /// <summary>
  /// Returns the candidate that occurs most often (and more than <c>FREQUENCY_THRESHOLD</c> times);
  /// falls back to ' ' when no candidate is frequent enough. Ties are resolved in candidate order.
  /// </summary>
  private static char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
    return candidates
      .Where(c => OccurrencesOf(charCounts, c) > FREQUENCY_THRESHOLD)
      .OrderByDescending(c => OccurrencesOf(charCounts, c))
      .DefaultIfEmpty(' ')
      .First();
  }

  /// <summary>Returns how often <paramref name="c"/> was counted, or 0 if it does not occur at all.</summary>
  private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
    int count;
    return charCounts.TryGetValue(c, out count) ? count : 0;
  }

  #region tokenizer
  /// <summary>Kinds of tokens produced by the <see cref="Tokenizer"/>.</summary>
  internal enum TokenTypeEnum {
    NewLine, Separator, String, Double
  }

  /// <summary>A single token: its kind, its text and (for Double tokens) its numeric value.</summary>
  internal class Token {
    public TokenTypeEnum type;
    public string stringValue;
    public double doubleValue;

    public Token(TokenTypeEnum type, string value) {
      this.type = type;
      stringValue = value;
      doubleValue = 0.0;
    }

    public override string ToString() {
      return stringValue;
    }
  }

  /// <summary>
  /// Splits the lines of a text stream into value, separator and newline tokens.
  /// Separators and line ends are represented by the shared <see cref="SeparatorToken"/> and
  /// <see cref="NewlineToken"/> instances, so consumers can compare tokens by reference.
  /// </summary>
  internal class Tokenizer {
    // text of the separator token; only used for display (e.g. in error messages)
    private const string INTERNAL_SEPARATOR = "#";

    private readonly StreamReader reader;
    private readonly Queue<Token> tokens;
    private readonly NumberFormatInfo numberFormatInfo;
    private readonly char separator;

    /// <summary>1-based number of the line that was read most recently (0 if nothing was read).</summary>
    public int CurrentLineNumber { get; private set; }

    /// <summary>Text of the line that was read most recently.</summary>
    public string CurrentLine { get; private set; }

    /// <summary>The token instance that terminates every line.</summary>
    public Token NewlineToken { get; private set; }

    /// <summary>The token instance that stands for every occurrence of the separator character.</summary>
    public Token SeparatorToken { get; private set; }

    /// <summary>Creates a tokenizer and reads the first line of <paramref name="reader"/> (if any).</summary>
    /// <param name="reader">Source of the text; not disposed by the tokenizer.</param>
    /// <param name="numberFormatInfo">Number format used to recognize double values.</param>
    /// <param name="separator">Character that separates values within a line.</param>
    public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
      this.reader = reader;
      this.numberFormatInfo = numberFormatInfo;
      this.separator = separator;
      SeparatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
      NewlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
      tokens = new Queue<Token>();
      ReadNextTokens();
    }

    // Reads the next line (if there is one) and appends its tokens followed by the newline token.
    private void ReadNextTokens() {
      if (reader.EndOfStream) return;

      CurrentLine = reader.ReadLine();
      StringBuilder field = new StringBuilder();
      foreach (char c in CurrentLine) {
        if (c == separator) {
          EnqueueValue(field.ToString());
          field.Length = 0;
          // separators are emitted directly as tokens (not as marker text), so a value
          // that happens to be "#" can never be mistaken for a separator
          tokens.Enqueue(SeparatorToken);
        } else {
          field.Append(c);
        }
      }
      EnqueueValue(field.ToString());
      tokens.Enqueue(NewlineToken);
      CurrentLineNumber++;
    }

    // Appends a value token for the given text; empty or whitespace-only fields produce no token.
    private void EnqueueValue(string text) {
      string trimmed = text.Trim();
      if (!string.IsNullOrEmpty(trimmed)) {
        tokens.Enqueue(MakeToken(trimmed));
      }
    }

    // Creates a Double token if the text can be parsed with the configured number format, a String token otherwise.
    private Token MakeToken(string strToken) {
      Token token = new Token(TokenTypeEnum.String, strToken);
      if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
        token.type = TokenTypeEnum.Double;
      }
      return token;
    }

    // The next line is only read when a token is actually requested. This keeps CurrentLineNumber and
    // CurrentLine on the line whose tokens were consumed last, so error messages refer to the correct line.
    private void EnsureTokens() {
      if (tokens.Count == 0) ReadNextTokens();
    }

    /// <summary>Returns the next token without consuming it. Must only be called when <see cref="HasNext"/> is true.</summary>
    /// <exception cref="InvalidOperationException">Thrown when there are no more tokens.</exception>
    public Token Peek() {
      EnsureTokens();
      return tokens.Peek();
    }

    /// <summary>Consumes and returns the next token. Must only be called when <see cref="HasNext"/> is true.</summary>
    /// <exception cref="InvalidOperationException">Thrown when there are no more tokens.</exception>
    public Token Next() {
      EnsureTokens();
      return tokens.Dequeue();
    }

    /// <summary>Returns true if there are buffered tokens or unread lines left.</summary>
    public bool HasNext() {
      return tokens.Count > 0 || !reader.EndOfStream;
    }
  }
  #endregion

  #region parsing
  // Parses the optional line of variable names followed by all data rows.
  private void Parse() {
    ParseVariableNames();
    if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    ParseValues();
    if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
  }

  // Reads all remaining lines as data rows; every row must have as many values as the first one.
  private void ParseValues() {
    while (tokenizer.HasNext()) {
      List<double> row = new List<double>();
      row.Add(NextValue(tokenizer));
      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        row.Add(NextValue(tokenizer));
      }
      Expect(tokenizer.NewlineToken);
      // all rows have to have the same number of values
      // the first row defines how many values are needed
      if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
        Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
          "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
      }
      rowValues.Add(row);
    }
  }

  // Returns the value at the current position. A missing value (the next token is a separator or
  // newline, which is left in place) or a non-numeric value yields NaN.
  private double NextValue(Tokenizer tokenizer) {
    Token upcoming = tokenizer.Peek();
    if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return double.NaN;

    Token current = tokenizer.Next();
    switch (current.type) {
      case TokenTypeEnum.Separator:
      case TokenTypeEnum.String:
        return double.NaN;
      case TokenTypeEnum.Double:
        // just take the value
        return current.doubleValue;
      default:
        // found an unexpected token => throw error
        Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
        // this line is never executed because Error() throws an exception
        throw new InvalidOperationException();
    }
  }

  // If the first line doesn't start with a double value we assume that it contains the variable names.
  private void ParseVariableNames() {
    if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

    List<Token> nameTokens = new List<Token>();
    Token valueToken = tokenizer.Next();
    nameTokens.Add(valueToken);
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      valueToken = tokenizer.Next();
      // a trailing separator is directly followed by the end of the line
      if (valueToken != tokenizer.NewlineToken) {
        nameTokens.Add(valueToken);
      }
    }
    // consume the end of the line unless the loop above already did
    if (valueToken != tokenizer.NewlineToken) {
      Expect(tokenizer.NewlineToken);
    }
    variableNames = nameTokens.Select(x => x.stringValue.Trim()).ToList();
  }

  // Consumes the next token and raises a parse error if it is not the expected token instance.
  private void Expect(Token expectedToken) {
    Token actualToken = tokenizer.Next();
    if (actualToken != expectedToken) {
      Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
    }
  }

  // Always throws a DataFormatException describing the problem, the offending token and the line number.
  private void Error(string message, string token, int lineNumber) {
    throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
  }
  #endregion
}
375	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences