Context Navigation

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 5809

Visit:

Last change on this file since 5809 was 5809, checked in by mkommend, 13 years ago
#1418: Reintegrated branch into trunk.
File size: 14.1 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
[5445]	3	* Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections.Generic;
	24	using System.Globalization;
	25	using System.IO;
[2446]	26	using System.Linq;
[5484]	27	using System.Runtime.Serialization;
[2446]	28	using System.Text;
[2]	29
[3373]	30	namespace HeuristicLab.Problems.DataAnalysis {
/// <summary>
/// Parses a delimited text file of numeric values into a two-dimensional array of doubles.
/// The number format (invariant or German) and the column separator are guessed from the
/// file content. If the first line does not start with a number it is read as a list of
/// variable names. Empty cells and cells that are not numbers become <c>double.NaN</c>.
/// </summary>
public class TableFileParser {
  // number of characters sampled from the file to guess its format
  private const int BUFFER_SIZE = 1024;
  // a character has to occur more often than this in the sample to be taken into account
  private const int FREQUENCY_THRESHOLD = 10;
  // candidate column separators; ' ' is only used as a fallback when none of these is frequent
  private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
  private Tokenizer tokenizer;
  private List<List<double>> rowValues;

  private int rows;
  /// <summary>Number of data rows; set by <see cref="Parse(string)"/>.</summary>
  public int Rows {
    get { return rows; }
    set { rows = value; }
  }

  private int columns;
  /// <summary>Number of values in the first data row; set by <see cref="Parse(string)"/>.</summary>
  public int Columns {
    get { return columns; }
    set { columns = value; }
  }

  private double[,] values;
  /// <summary>The parsed values indexed as [row, column]; null until a file has been parsed.</summary>
  public double[,] Values {
    get { return values; }
  }

  private List<string> variableNames;
  /// <summary>
  /// The names read from the first line of the file. If no names were read, generated
  /// names "X000", "X001", ... (one per column) are returned instead.
  /// </summary>
  public IEnumerable<string> VariableNames {
    get {
      if (variableNames.Count > 0) return variableNames;
      string[] names = new string[columns];
      for (int i = 0; i < names.Length; i++) {
        names[i] = "X" + i.ToString("000");
      }
      return names;
    }
  }

  public TableFileParser() {
    rowValues = new List<List<double>>();
    variableNames = new List<string>();
  }

  /// <summary>
  /// Reads the given file and fills <see cref="Rows"/>, <see cref="Columns"/>,
  /// <see cref="Values"/> and <see cref="VariableNames"/>.
  /// </summary>
  /// <param name="fileName">Path of the file to parse.</param>
  /// <exception cref="Exception">
  /// A parsing error (empty file, no data rows, rows of differing length, unexpected token)
  /// is reported through the private DataFormatException type, which derives from Exception.
  /// </exception>
  public void Parse(string fileName) {
    // start from a clean state so that a parser instance can be reused for several files;
    // new lists are created (instead of Clear) to leave previously returned names untouched
    rowValues = new List<List<double>>();
    variableNames = new List<string>();

    NumberFormatInfo numberFormat;
    char separator;
    DetermineFileFormat(fileName, out numberFormat, out separator);
    using (StreamReader reader = new StreamReader(fileName)) {
      tokenizer = new Tokenizer(reader, numberFormat, separator);
      // parse the file
      Parse();
    }

    // copy the list of rows into the two-dimensional result array;
    // Parse() guarantees at least one row and ParseValues() that all rows have equal length
    rows = rowValues.Count;
    columns = rowValues[0].Count;
    values = new double[rows, columns];
    for (int rowIndex = 0; rowIndex < rows; rowIndex++) {
      List<double> row = rowValues[rowIndex];
      for (int columnIndex = 0; columnIndex < row.Count; columnIndex++) {
        values[rowIndex, columnIndex] = row[columnIndex];
      }
    }
  }

  /// <summary>
  /// Guesses the number format and the column separator from a sample of the file.
  /// </summary>
  /// <param name="fileName">Path of the file to inspect.</param>
  /// <param name="numberFormat">Receives the invariant or the de-DE number format.</param>
  /// <param name="separator">Receives the detected column separator.</param>
  private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
    using (StreamReader reader = new StreamReader(fileName)) {
      // skip the first line (it may contain variable names, see ParseVariableNames)
      reader.ReadLine();
      // read a block
      char[] buffer = new char[BUFFER_SIZE];
      int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
      // count the frequency of each character in the block
      Dictionary<char, int> charCounts = buffer.Take(charsRead)
        .GroupBy(c => c)
        .ToDictionary(g => g.Key, g => g.Count());

      // Depending on the characters occurring in the block we distinguish the following cases:
      // many points => English number format, the other frequently occurring char is the separator
      // no points but many commas => the problematic case: either German format (real numbers)
      //   or English format (only integer numbers) with ',' as separator
      //   => check the block in more detail:
      //      English: 0, 0, 0, 0
      //      German:  0,0 0,0 0,0 ...
      //   => if commas are followed by a non-digit (e.g. a space) => English format
      // no points and no commas => English format (only integer numbers),
      //   use the other frequently occurring char as separator
      // In all cases ' ' is only treated as separator if no other separator is possible
      // (spaces can also occur in addition to separators).
      if (OccurrencesOf(charCounts, '.') > FREQUENCY_THRESHOLD) {
        numberFormat = NumberFormatInfo.InvariantInfo;
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
      } else if (OccurrencesOf(charCounts, ',') > FREQUENCY_THRESHOLD) {
        // no points and many commas
        int countCommaNonDigitPairs = 0;
        for (int i = 0; i < charsRead - 1; i++) {
          if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
            countCommaNonDigitPairs++;
          }
        }
        if (countCommaNonDigitPairs > FREQUENCY_THRESHOLD) {
          // English format (only integer values) with ',' as separator
          numberFormat = NumberFormatInfo.InvariantInfo;
          separator = ',';
        } else {
          // German format (real values); ',' is the decimal mark and cannot be the separator
          numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
          separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Where(c => c != ','));
        }
      } else {
        // no points and no commas => English format
        numberFormat = NumberFormatInfo.InvariantInfo;
        separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
      }
    }
  }

  // Returns the candidate that occurs most often (and more often than FREQUENCY_THRESHOLD);
  // falls back to ' ' if no candidate is frequent enough. Ties keep the candidate order.
  private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
    return candidates
      .Where(c => OccurrencesOf(charCounts, c) > FREQUENCY_THRESHOLD)
      .OrderByDescending(c => OccurrencesOf(charCounts, c))
      .DefaultIfEmpty(' ')
      .First();
  }

  // Returns how often c was counted, 0 if it was not counted at all.
  private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
    int count;
    // TryGetValue leaves count at 0 when the key is missing
    charCounts.TryGetValue(c, out count);
    return count;
  }

  #region tokenizer
  internal enum TokenTypeEnum {
    NewLine, Separator, String, Double
  }

  // A lexical element of the file: its type, its text and (for Double tokens) its parsed value.
  internal class Token {
    public TokenTypeEnum type;
    public string stringValue;
    public double doubleValue;

    public Token(TokenTypeEnum type, string value) {
      this.type = type;
      stringValue = value;
      doubleValue = 0.0;
    }

    public override string ToString() {
      return stringValue;
    }
  }

  // Splits the file line by line into tokens. The tokens of one line (terminated by
  // NewlineToken) are buffered; the next line is read as soon as the buffer runs empty.
  internal class Tokenizer {
    private StreamReader reader;
    // a queue because tokens are only ever consumed from the front
    private Queue<Token> tokens;
    private NumberFormatInfo numberFormatInfo;
    private char separator;
    // text of the separator token, independent of the separator used in the file
    private const string INTERNAL_SEPARATOR = "#";

    private int currentLineNumber = 0;
    // 1-based number of the line that was read most recently
    public int CurrentLineNumber {
      get { return currentLineNumber; }
      private set { currentLineNumber = value; }
    }
    private string currentLine;
    public string CurrentLine {
      get { return currentLine; }
      private set { currentLine = value; }
    }

    private Token newlineToken;
    // the single instance that marks every line end; compared by reference
    public Token NewlineToken {
      get { return newlineToken; }
      private set { newlineToken = value; }
    }
    private Token separatorToken;
    // the single instance that marks every separator; compared by reference
    public Token SeparatorToken {
      get { return separatorToken; }
      private set { separatorToken = value; }
    }

    public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
      this.reader = reader;
      this.numberFormatInfo = numberFormatInfo;
      this.separator = separator;
      separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
      newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
      tokens = new Queue<Token>();
      ReadNextTokens();
    }

    // Reads the next line (if any) and appends its tokens followed by NewlineToken.
    private void ReadNextTokens() {
      if (!reader.EndOfStream) {
        CurrentLine = reader.ReadLine();
        foreach (Token token in Tokenize(CurrentLine)) {
          tokens.Enqueue(token);
        }
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }
    }

    // Yields the tokens of one line: every separator character becomes SeparatorToken and
    // every non-empty (after trimming) text between separators becomes a value token.
    // Separators are emitted as tokens directly, so a cell whose text happens to be "#"
    // stays an ordinary string cell.
    private IEnumerable<Token> Tokenize(string line) {
      int cellStart = 0;
      for (int i = 0; i <= line.Length; i++) {
        bool atLineEnd = i == line.Length;
        if (!atLineEnd && line[i] != separator) continue;
        string cell = line.Substring(cellStart, i - cellStart).Trim();
        if (cell.Length > 0) yield return MakeToken(cell);
        if (!atLineEnd) yield return SeparatorToken;
        cellStart = i + 1;
      }
    }

    // Creates a Double token if the text is a number in the configured format,
    // otherwise a String token.
    private Token MakeToken(string strToken) {
      Token token = new Token(TokenTypeEnum.String, strToken);
      if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
        token.type = TokenTypeEnum.Double;
      }
      return token;
    }

    // Returns the next token without consuming it; only valid while HasNext() is true.
    public Token Peek() {
      return tokens.Peek();
    }

    // Consumes and returns the next token; reads the next line when the buffer runs empty.
    public Token Next() {
      Token next = tokens.Dequeue();
      if (tokens.Count == 0) {
        ReadNextTokens();
      }
      return next;
    }

    public bool HasNext() {
      return tokens.Count > 0 || !reader.EndOfStream;
    }
  }
  #endregion

  #region parsing
  // Parses the optional line of variable names followed by all data rows.
  private void Parse() {
    const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";
    ParseVariableNames();
    if (!tokenizer.HasNext()) Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
    ParseValues();
    if (rowValues.Count == 0) Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
  }

  // Reads all remaining lines as rows of values and appends them to rowValues.
  private void ParseValues() {
    while (tokenizer.HasNext()) {
      // remember the line of this row now: consuming its newline token below makes the
      // tokenizer read the following line, which advances CurrentLineNumber
      int lineNumber = tokenizer.CurrentLineNumber;
      List<double> row = new List<double>();
      row.Add(NextValue(tokenizer));
      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        row.Add(NextValue(tokenizer));
      }
      Expect(tokenizer.NewlineToken);
      // all rows have to have the same number of values
      // the first row defines how many values are needed
      if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
        Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
              "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
      }
      rowValues.Add(row);
    }
  }

  // Returns the value of the next cell. An empty cell (a separator or the line end follows
  // immediately; nothing is consumed) and a cell that is not a number yield NaN.
  private double NextValue(Tokenizer tokenizer) {
    Token upcoming = tokenizer.Peek();
    if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return double.NaN;
    Token current = tokenizer.Next();
    switch (current.type) {
      case TokenTypeEnum.Double:
        // just take the value
        return current.doubleValue;
      case TokenTypeEnum.Separator:
      case TokenTypeEnum.String:
        return double.NaN;
    }
    // found an unexpected token => throw error
    Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
    // this line is never executed because Error() throws an exception
    throw new InvalidOperationException();
  }

  // If the first line doesn't start with a double value it is assumed to contain the
  // variable names; in that case the whole line is consumed.
  private void ParseVariableNames() {
    if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

    List<Token> nameTokens = new List<Token>();
    Token valueToken = tokenizer.Next();
    nameTokens.Add(valueToken);
    // stop as soon as the newline has been consumed (line ends with a separator); otherwise
    // a following data row that starts with a separator would be read as part of the names
    while (valueToken != tokenizer.NewlineToken
           && tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
      Expect(tokenizer.SeparatorToken);
      valueToken = tokenizer.Next();
      if (valueToken != tokenizer.NewlineToken) {
        nameTokens.Add(valueToken);
      }
    }
    if (valueToken != tokenizer.NewlineToken) {
      Expect(tokenizer.NewlineToken);
    }
    variableNames = nameTokens.Select(x => x.stringValue.Trim()).ToList();
  }

  // Consumes the next token and raises a parsing error if it is not the expected one.
  private void Expect(Token expectedToken) {
    Token actualToken = tokenizer.Next();
    if (actualToken == expectedToken) return;
    Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
  }

  // Always throws a DataFormatException describing the problem.
  private void Error(string message, string token, int lineNumber) {
    throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
  }
  #endregion

  // Raised for every parsing error; carries the offending token and the line number.
  [Serializable]
  private class DataFormatException : Exception {
    private int line;
    public int Line {
      get { return line; }
    }
    private string token;
    public string Token {
      get { return token; }
    }
    public DataFormatException(string message, string token, int line)
      : base(message + "\nToken: " + token + " (line: " + line + ")") {
      this.token = token;
      this.line = line;
    }

    // deserialization constructor; restores the fields written by GetObjectData
    protected DataFormatException(SerializationInfo info, StreamingContext context)
      : base(info, context) {
      line = info.GetInt32("Line");
      token = info.GetString("Token");
    }

    // stores Line and Token in addition to the data of the base class
    public override void GetObjectData(SerializationInfo info, StreamingContext context) {
      base.GetObjectData(info, context);
      info.AddValue("Line", line);
      info.AddValue("Token", token);
    }
  }
}
	389	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences