Context Navigation

source: branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 12711

Visit:

Last change on this file since 12711 was 5275, checked in by gkronber, 14 years ago
Merged changes from trunk to data analysis exploration branch and added fractional distance metric evaluator. #1142
File size: 13.6 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Globalization;
25	using System.IO;
26	using System.Linq;
27	using System.Text;
28
29	namespace HeuristicLab.Problems.DataAnalysis {
30	public class TableFileParser {
// Number of characters sampled by DetermineFileFormat when guessing the file format.
private const int BUFFER_SIZE = 1024;
// Candidate column separators. The array is never modified, so a single
// shared instance is sufficient for all parsers.
private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
// NOTE(review): not referenced by any member of this class as far as this file shows - confirm before removing.
private const string VARIABLENAMES = "VARIABLENAMES";
// Tokenizer for the file that is currently parsed; assigned in Parse(string).
private Tokenizer tokenizer;
// Names read from the first line of the file; empty if the file has no header line.
private List<string> variableNames;
// Parsed rows in file order; filled by ParseValues().
private List<List<double>> rowValues;
37
// Backing field of Rows; assigned by Parse(string).
private int rows;

/// <summary>
/// Gets or sets the number of rows. Parse(string) sets it to the number of
/// parsed data rows.
/// </summary>
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
43
// Backing field of Columns; assigned by Parse(string).
private int columns;

/// <summary>
/// Gets or sets the number of columns. Parse(string) sets it to the number of
/// values in the first parsed data row.
/// </summary>
public int Columns {
  get {
    return this.columns;
  }
  set {
    this.columns = value;
  }
}
49
// Result matrix indexed as [row, column]; null until Parse(string) has completed.
private double[,] values;

/// <summary>
/// Gets the matrix of parsed values, indexed as [row, column].
/// The internal array itself is returned, not a copy.
/// </summary>
public double[,] Values {
  get { return this.values; }
}
56
/// <summary>
/// Gets the variable names of the parsed file. If no names were read from the
/// file, default names "X000", "X001", ... are generated, one per column.
/// </summary>
public IEnumerable<string> VariableNames {
  get {
    // names read from the header line take precedence
    if (variableNames.Count > 0) {
      return variableNames;
    }

    // no header line => generate one default name per column
    string[] generatedNames = new string[columns];
    int index = 0;
    while (index < generatedNames.Length) {
      generatedNames[index] = "X" + index.ToString("000");
      index++;
    }
    return generatedNames;
  }
}
69
/// <summary>
/// Creates a parser with empty lists for variable names and row values.
/// </summary>
public TableFileParser() {
  this.variableNames = new List<string>();
  this.rowValues = new List<List<double>>();
}
74
/// <summary>
/// Removes all collected row values and variable names.
/// Rows, Columns and Values are left unchanged.
/// </summary>
private void Reset() {
  this.rowValues.Clear();
  this.variableNames.Clear();
}
79
/// <summary>
/// Parses the table file <paramref name="fileName"/> and stores the result in
/// Values, Rows and Columns. Variable names are taken from the first line if
/// that line does not start with a number.
/// </summary>
/// <remarks>
/// The number format and the column separator are guessed by
/// DetermineFileFormat before the file is tokenized. Rows and variable names
/// left over from an earlier call are discarded first, so one parser instance
/// can be used for several files.
/// </remarks>
/// <param name="fileName">Path of the file to read.</param>
/// <exception cref="DataFormatException">
/// Thrown via Error() when the file contains no data rows or the rows have
/// different numbers of columns.
/// </exception>
public void Parse(string fileName) {
  // drop the rows and names of a previously parsed file; otherwise the rows
  // of both files would be merged into one matrix
  Reset();

  NumberFormatInfo numberFormat;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out separator);
  using (StreamReader reader = new StreamReader(fileName)) {
    tokenizer = new Tokenizer(reader, numberFormat, separator);
    // parse the file
    Parse();
  }

  // copy the list of rows into the rectangular result matrix;
  // Parse() has already made sure that there is at least one row and that
  // all rows have the same number of values
  rows = rowValues.Count;
  columns = rowValues[0].Count;
  values = new double[rows, columns];
  for (int rowIndex = 0; rowIndex < rows; rowIndex++) {
    List<double> row = rowValues[rowIndex];
    for (int columnIndex = 0; columnIndex < columns; columnIndex++) {
      values[rowIndex, columnIndex] = row[columnIndex];
    }
  }
}
105
// A character has to occur more often than this in the sampled block to be
// considered "frequent" by DetermineFileFormat.
private const int FREQUENT_CHAR_THRESHOLD = 10;

/// <summary>
/// Guesses the number format and the column separator of a file from the
/// character frequencies in a block of up to BUFFER_SIZE characters that
/// follows the first line.
/// </summary>
/// <param name="fileName">Path of the file to inspect.</param>
/// <param name="numberFormat">Receives the invariant (English) or the German number format.</param>
/// <param name="separator">Receives the detected column separator; ' ' if no other separator is frequent enough.</param>
private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
  using (StreamReader reader = new StreamReader(fileName)) {
    // skip first line
    reader.ReadLine();
    // read a block
    char[] buffer = new char[BUFFER_SIZE];
    int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
    // count frequency of special characters
    Dictionary<char, int> charCounts = buffer.Take(charsRead)
      .GroupBy(c => c)
      .ToDictionary(g => g.Key, g => g.Count());

    // depending on the characters occurring in the block
    // we distinguish a number of different cases based on the following rules:
    // many points => it must be English number format, the other frequently occurring char is the separator
    // no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
    //   => check the line in more detail:
    //            English: 0, 0, 0, 0
    //            German:  0,0 0,0 0,0 ...
    //            => if commas are followed by space => English format
    // no points no commas => English format (only integer numbers) use the other frequently occurring char as separator
    // in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
    if (OccurrencesOf(charCounts, '.') > FREQUENT_CHAR_THRESHOLD) {
      numberFormat = NumberFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    } else if (OccurrencesOf(charCounts, ',') > FREQUENT_CHAR_THRESHOLD) {
      // no points and many commas
      int countCommaNonDigitPairs = 0;
      for (int i = 0; i < charsRead - 1; i++) {
        if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
          countCommaNonDigitPairs++;
        }
      }
      if (countCommaNonDigitPairs > FREQUENT_CHAR_THRESHOLD) {
        // English format (only integer values) with ',' as separator
        numberFormat = NumberFormatInfo.InvariantInfo;
        separator = ',';
      } else {
        // German format (real values); user overrides are switched off so that
        // the result does not depend on the regional settings of the machine
        numberFormat = new CultureInfo("de-DE", false).NumberFormat;
        // ',' is the decimal separator here and therefore cannot separate columns
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Where(c => c != ','));
      }
    } else {
      // no points and no commas => English format
      numberFormat = NumberFormatInfo.InvariantInfo;
      separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
    }
  }
}

/// <summary>
/// Returns the candidate that occurs most often in the sampled block, provided
/// it occurs more than FREQUENT_CHAR_THRESHOLD times. Ties are resolved in
/// favor of the candidate listed first. Returns ' ' if no candidate qualifies.
/// </summary>
private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > FREQUENT_CHAR_THRESHOLD)
    .OrderByDescending(c => OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
169
/// <summary>
/// Returns how often <paramref name="c"/> was counted in
/// <paramref name="charCounts"/>, or 0 if the character has no entry.
/// </summary>
/// <param name="charCounts">Character frequencies keyed by character.</param>
/// <param name="c">The character to look up.</param>
/// <returns>The stored count, or 0 for characters without an entry.</returns>
private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
  // single lookup instead of ContainsKey followed by the indexer
  int count;
  return charCounts.TryGetValue(c, out count) ? count : 0;
}
173
174	#region tokenizer
/// <summary>
/// Kinds of tokens produced by the Tokenizer.
/// </summary>
internal enum TokenTypeEnum {
  // end of an input line
  NewLine,
  // column separator
  Separator,
  // text that could not be parsed as a number
  String,
  // text that was parsed as a double value
  Double
}
178
/// <summary>
/// A single token of the input: its kind, its text and, for number tokens,
/// the parsed value.
/// </summary>
internal class Token {
  // kind of the token
  public TokenTypeEnum type;
  // text of the token
  public string stringValue;
  // numeric value; stays 0.0 unless it is assigned after construction
  public double doubleValue;

  /// <summary>
  /// Creates a token of the given kind with the given text and a numeric value of 0.0.
  /// </summary>
  public Token(TokenTypeEnum type, string value) {
    this.doubleValue = 0.0;
    this.stringValue = value;
    this.type = type;
  }

  /// <summary>
  /// Returns the text of the token.
  /// </summary>
  public override string ToString() {
    return this.stringValue;
  }
}
194
195
/// <summary>
/// Reads a text file line by line and turns each line into a sequence of
/// value tokens and separator tokens, followed by one newline token.
/// </summary>
/// <remarks>
/// Only one line is tokenized at a time. As soon as the last token of a line
/// has been handed out by Next(), the following line is read, so
/// CurrentLineNumber and CurrentLine may already refer to the line after the
/// token that was returned last.
/// All separators are represented by the single SeparatorToken instance and
/// all line ends by the single NewlineToken instance, so both can be compared
/// by reference.
/// Peek() and Next() throw an InvalidOperationException when no token is
/// left; check HasNext() first.
/// </remarks>
internal class Tokenizer {
  private StreamReader reader;
  // tokens of the current line that have not been handed out yet
  private Queue<Token> tokens;
  private NumberFormatInfo numberFormatInfo;
  private char separator;
  // text of the separator token; it is only used for display (e.g. in error messages)
  private const string INTERNAL_SEPARATOR = "#";

  private int currentLineNumber = 0;
  /// <summary>Number of lines read so far (1 for the first line).</summary>
  public int CurrentLineNumber {
    get { return currentLineNumber; }
    private set { currentLineNumber = value; }
  }
  private string currentLine;
  /// <summary>Text of the line that was read last.</summary>
  public string CurrentLine {
    get { return currentLine; }
    private set { currentLine = value; }
  }

  private Token newlineToken;
  /// <summary>The token instance that marks the end of every line.</summary>
  public Token NewlineToken {
    get { return newlineToken; }
    private set { newlineToken = value; }
  }
  private Token separatorToken;
  /// <summary>The token instance that stands for every column separator.</summary>
  public Token SeparatorToken {
    get { return separatorToken; }
    private set { separatorToken = value; }
  }

  /// <summary>
  /// Creates a tokenizer and immediately tokenizes the first line of the reader.
  /// </summary>
  /// <param name="reader">Source of the text; it is not disposed by the tokenizer.</param>
  /// <param name="numberFormatInfo">Number format used to recognize double values.</param>
  /// <param name="separator">Character that separates the columns.</param>
  public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
    this.reader = reader;
    this.numberFormatInfo = numberFormatInfo;
    this.separator = separator;
    separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
    newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
    tokens = new Queue<Token>();
    ReadNextTokens();
  }

  // Reads the next line (if there is one) and appends its tokens plus a
  // newline token to the queue.
  private void ReadNextTokens() {
    if (reader.EndOfStream) return;

    CurrentLine = reader.ReadLine();
    foreach (Token token in Tokenize(CurrentLine)) {
      tokens.Enqueue(token);
    }
    tokens.Enqueue(NewlineToken);
    CurrentLineNumber++;
  }

  // Splits a line at the separator character. Separators are emitted directly
  // as SeparatorToken, so a field whose text happens to be "#" remains an
  // ordinary string token. Fields that are empty after trimming produce no token.
  private IEnumerable<Token> Tokenize(string line) {
    StringBuilder field = new StringBuilder();
    foreach (char c in line) {
      if (c == separator) {
        Token valueToken = MakeToken(field.ToString());
        if (valueToken != null) yield return valueToken;
        field.Length = 0;
        yield return SeparatorToken;
      } else {
        field.Append(c);
      }
    }
    Token lastToken = MakeToken(field.ToString());
    if (lastToken != null) yield return lastToken;
  }

  // Creates a Double token if the trimmed text is a number in the configured
  // number format and a String token otherwise. Returns null for blank text.
  private Token MakeToken(string text) {
    string trimmed = text.Trim();
    if (string.IsNullOrEmpty(trimmed)) return null;

    Token token = new Token(TokenTypeEnum.String, trimmed);
    if (double.TryParse(trimmed, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
      token.type = TokenTypeEnum.Double;
    }
    return token;
  }

  /// <summary>Returns the next token without consuming it.</summary>
  public Token Peek() {
    return tokens.Peek();
  }

  /// <summary>
  /// Consumes and returns the next token. If it was the last token of its
  /// line, the following line is tokenized right away.
  /// </summary>
  public Token Next() {
    // a queue removes the head in O(1); removing index 0 of a list shifts all
    // remaining tokens of the line on every call
    Token next = tokens.Dequeue();
    if (tokens.Count == 0) {
      ReadNextTokens();
    }
    return next;
  }

  /// <summary>
  /// Returns true if a token is queued or the reader has more lines.
  /// </summary>
  public bool HasNext() {
    return tokens.Count > 0 || !reader.EndOfStream;
  }
}
294	#endregion
295
296	#region parsing
/// <summary>
/// Parses the tokens of the current tokenizer: first the optional line of
/// variable names, then all data rows. Reports an error if no data follows
/// the variable names or no row could be collected.
/// </summary>
private void Parse() {
  const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

  ParseVariableNames();
  if (!tokenizer.HasNext()) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  ParseValues();
  if (rowValues.Count == 0) {
    Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }
}
303
/// <summary>
/// Reads all remaining lines as data rows and appends them to rowValues.
/// Every row has to contain as many values as the first row; otherwise an
/// error that names the offending line is reported.
/// </summary>
private void ParseValues() {
  while (tokenizer.HasNext()) {
    // remember the line this row is read from: consuming the newline token at
    // the end of the row makes the tokenizer read the following line, which
    // already advances CurrentLineNumber
    int rowLineNumber = tokenizer.CurrentLineNumber;

    List<double> row = new List<double>();
    row.Add(NextValue(tokenizer));
    while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      row.Add(NextValue(tokenizer));
    }
    Expect(tokenizer.NewlineToken);
    // all rows have to have the same number of values
    // the first row defines how many samples are needed
    if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
      Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
        "\nLine " + rowLineNumber + " has " + row.Count + " columns.", "", rowLineNumber);
    }
    // add the current row to the collection of rows
    rowValues.Add(row);
  }
}
324
/// <summary>
/// Returns the value of the next field of the current row.
/// </summary>
/// <param name="tokenizer">The tokenizer to read from.</param>
/// <returns>
/// The parsed number for a Double token; double.NaN if the field is empty
/// (the next token is a separator or the end of the line, which is left
/// unconsumed) or if the field is not a number.
/// </returns>
private double NextValue(Tokenizer tokenizer) {
  // an empty field: leave the separator / newline for the caller
  Token upcoming = tokenizer.Peek();
  if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return double.NaN;

  Token current = tokenizer.Next();
  if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
    return double.NaN;
  } else if (current.type == TokenTypeEnum.Double) {
    // just take the value
    return current.doubleValue;
  }
  // found an unexpected token => throw error
  Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
  // this line is never executed because Error() throws an exception
  throw new InvalidOperationException();
}
339
/// <summary>
/// Reads the variable names from the first line if that line does not start
/// with a double value. Otherwise no token is consumed and variableNames is
/// left unchanged.
/// </summary>
/// <remarks>
/// Reading stops at the end of the first line, even if that line ends with a
/// separator and the following line starts with one. A blank first line is
/// consumed and yields no names.
/// </remarks>
private void ParseVariableNames() {
  // if the first line doesn't start with a double value then we assume that the
  // first line contains variable names
  if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

  List<Token> tokens = new List<Token>();
  Token valueToken = tokenizer.Next();
  // a blank first line consists of the newline token only, which is not a name
  if (valueToken != tokenizer.NewlineToken) {
    tokens.Add(valueToken);
  }
  // once the newline token has been consumed the header is complete; without
  // this check a following line that starts with a separator would be read
  // as part of the header
  while (valueToken != tokenizer.NewlineToken
         && tokenizer.HasNext()
         && tokenizer.Peek() == tokenizer.SeparatorToken) {
    Expect(tokenizer.SeparatorToken);
    valueToken = tokenizer.Next();
    if (valueToken != tokenizer.NewlineToken) {
      tokens.Add(valueToken);
    }
  }
  if (valueToken != tokenizer.NewlineToken) {
    Expect(tokenizer.NewlineToken);
  }
  variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
}
362
/// <summary>
/// Consumes the next token and reports an error if it is not the given
/// token instance (tokens are compared by reference).
/// </summary>
/// <param name="expectedToken">The token instance that has to come next.</param>
private void Expect(Token expectedToken) {
  Token actualToken = tokenizer.Next();
  if (actualToken == expectedToken) return;

  Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
}
369
/// <summary>
/// Throws a DataFormatException for a parse error. This method never returns normally.
/// </summary>
/// <param name="message">Description of the problem; it is prefixed with "Error while parsing.".</param>
/// <param name="token">Text of the offending token, passed on to the exception.</param>
/// <param name="lineNumber">Line number passed on to the exception.</param>
private void Error(string message, string token, int lineNumber) {
  string fullMessage = "Error while parsing.\n" + message;
  throw new DataFormatException(fullMessage, token, lineNumber);
}
373	#endregion
374	}
375	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences