Context Navigation

source: branches/DataAnalysis/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 8347

Visit:

Last change on this file since 8347 was 5275, checked in by gkronber, 14 years ago
Merged changes from trunk to data analysis exploration branch and added fractional distance metric evaluator. #1142
File size: 13.6 KB

Rev	Line
[5275]	1	#region License Information
	2	/* HeuristicLab
	3	* Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections.Generic;
	24	using System.Globalization;
	25	using System.IO;
	26	using System.Linq;
	27	using System.Text;
	28
	29	namespace HeuristicLab.Problems.DataAnalysis {
	30	public class TableFileParser {
	// number of characters that are sampled from the file to guess its format
	private const int BUFFER_SIZE = 1024;
	// candidate column separators (DetermineFileFormat falls back to ' ' if none of them is frequent)
	private readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t' };
	// NOTE(review): not referenced anywhere in the visible part of this class
	private const string VARIABLENAMES = "VARIABLENAMES";
	// tokenizer for the file that is currently being parsed
	private Tokenizer tokenizer;
	// names taken from the header line (stays empty if the file has no header line)
	private List<string> variableNames;
	// parsed values, one inner list per data line
	private List<List<double>> rowValues;
	37
	// backing field, assigned by Parse(string) from the number of parsed rows
	private int rows;
	/// <summary>Number of rows of the value matrix.</summary>
	public int Rows {
		get { return this.rows; }
		set { this.rows = value; }
	}
	43
	// backing field, assigned by Parse(string) from the length of the first parsed row
	private int columns;
	/// <summary>Number of columns of the value matrix.</summary>
	public int Columns {
		get { return this.columns; }
		set { this.columns = value; }
	}
	49
	// backing field, allocated and filled by Parse(string)
	private double[,] values;
	/// <summary>The parsed values indexed as [row, column].</summary>
	public double[,] Values {
		get { return this.values; }
	}
	56
	/// <summary>
	/// The variable names read from the header line. If no names are available,
	/// generic names ("X000", "X001", ...) are generated, one per column.
	/// </summary>
	public IEnumerable<string> VariableNames {
		get {
			if (variableNames.Count > 0) {
				return variableNames;
			}

			// no header names available => make up one name per column
			string[] generatedNames = new string[columns];
			int index = 0;
			while (index < generatedNames.Length) {
				generatedNames[index] = "X" + index.ToString("000");
				index++;
			}
			return generatedNames;
		}
	}
	69
	/// <summary>Creates a parser that does not hold any names or values yet.</summary>
	public TableFileParser() {
		this.variableNames = new List<string>();
		this.rowValues = new List<List<double>>();
	}
	74
	/// <summary>Removes all collected rows and variable names.</summary>
	private void Reset() {
		this.rowValues.Clear();
		this.variableNames.Clear();
	}
	79
	/// <summary>
	/// Parses the given file and makes the result available through
	/// Values, Rows, Columns and VariableNames.
	/// </summary>
	/// <param name="fileName">Path of the file that should be parsed.</param>
	/// <exception cref="DataFormatException">Thrown (via Error) if the content of the file cannot be parsed.</exception>
	public void Parse(string fileName) {
		// discard the results of a previous call; otherwise the rows of every file parsed
		// with this instance pile up in rowValues and old header names could survive
		Reset();

		NumberFormatInfo numberFormat;
		char separator;
		DetermineFileFormat(fileName, out numberFormat, out separator);
		using (StreamReader reader = new StreamReader(fileName)) {
			tokenizer = new Tokenizer(reader, numberFormat, separator);
			// parse the file
			Parse();
		}

		// copy the list of rows into the matrix
		// Parse() raises an error if no row was read, so rowValues[0] exists here,
		// and ParseValues() makes sure that every row is as long as the first one
		rows = rowValues.Count;
		columns = rowValues[0].Count;
		values = new double[rows, columns];
		for (int rowIndex = 0; rowIndex < rows; rowIndex++) {
			List<double> row = rowValues[rowIndex];
			for (int columnIndex = 0; columnIndex < columns; columnIndex++) {
				values[rowIndex, columnIndex] = row[columnIndex];
			}
		}
	}
	105
	// a character has to occur more often than this within the sampled block to be considered frequent
	private const int FREQUENT_CHAR_THRESHOLD = 10;

	/// <summary>
	/// Guesses the number format and the column separator of a file from the characters
	/// that occur in a block of at most BUFFER_SIZE characters following the first line.
	/// </summary>
	/// <param name="fileName">Path of the file to inspect.</param>
	/// <param name="numberFormat">Receives the number format that should be used for parsing.</param>
	/// <param name="separator">Receives the character that separates the columns.</param>
	private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
		char[] buffer = new char[BUFFER_SIZE];
		int charsRead;
		using (StreamReader reader = new StreamReader(fileName)) {
			// skip first line (presumably because it may hold variable names instead of values)
			reader.ReadLine();
			// read a block
			charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
		}

		// count frequency of all characters in the block
		Dictionary<char, int> charCounts = buffer.Take(charsRead)
			.GroupBy(c => c)
			.ToDictionary(g => g.Key, g => g.Count());

		// Depending on the characters occurring in the block we distinguish the following cases:
		//  * many points => it must be English number format, the other frequently occurring char is the separator
		//  * no points but many commas => this is the problematic case: either German format (real numbers)
		//    or English format (only integer numbers) with ',' as separator
		//    => check the block in more detail:
		//         English: 0, 0, 0, 0
		//         German:  0,0 0,0 0,0 ...
		//    => if many commas are followed by a non-digit character => English format
		//  * no points and no commas => English format (only integer numbers), the other frequently
		//    occurring char is the separator
		// In all cases ' ' is only treated as separator if no other separator is possible
		// (spaces can also occur additionally to separators).
		if (OccurrencesOf(charCounts, '.') > FREQUENT_CHAR_THRESHOLD) {
			numberFormat = NumberFormatInfo.InvariantInfo;
			separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
			return;
		}

		if (OccurrencesOf(charCounts, ',') <= FREQUENT_CHAR_THRESHOLD) {
			// no points and no commas => English format
			numberFormat = NumberFormatInfo.InvariantInfo;
			separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS);
			return;
		}

		// no points and many commas
		int countCommaNonDigitPairs = 0;
		for (int i = 0; i < charsRead - 1; i++) {
			if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
				countCommaNonDigitPairs++;
			}
		}

		if (countCommaNonDigitPairs > FREQUENT_CHAR_THRESHOLD) {
			// English format (only integer values) with ',' as separator
			numberFormat = NumberFormatInfo.InvariantInfo;
			separator = ',';
		} else {
			// German format (real values); ',' is the decimal separator and thus cannot separate columns.
			// GetCultureInfo returns the cached, read-only culture without the regional-settings
			// overrides of the current user, so the result does not depend on the machine.
			numberFormat = NumberFormatInfo.GetInstance(CultureInfo.GetCultureInfo("de-DE"));
			separator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(new char[] { ',' }));
		}
	}

	// Returns the candidate that occurs most often among the frequent ones,
	// or ' ' if none of the candidates is frequent.
	private char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
		return candidates
			.Where(c => OccurrencesOf(charCounts, c) > FREQUENT_CHAR_THRESHOLD)
			.OrderByDescending(c => OccurrencesOf(charCounts, c))
			.DefaultIfEmpty(' ')
			.First();
	}
	169
	/// <summary>Looks up how often a character was counted.</summary>
	/// <param name="charCounts">Number of occurrences per character.</param>
	/// <param name="c">The character to look up.</param>
	/// <returns>The stored count, or 0 if the character has no entry.</returns>
	private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
		// a single lookup; count is set to 0 if there is no entry for c
		int count;
		charCounts.TryGetValue(c, out count);
		return count;
	}
	173
	174	#region tokenizer
	/// <summary>The kinds of tokens produced by the tokenizer.</summary>
	internal enum TokenTypeEnum {
		// end of a line
		NewLine,
		// column separator
		Separator,
		// field that could not be parsed as a number
		String,
		// field that was parsed as a number
		Double
	}
	178
	/// <summary>A single token: its kind, its text and (for numbers) its value.</summary>
	internal class Token {
		// kind of the token; the tokenizer changes it to Double after a successful number parse
		public TokenTypeEnum type;
		// text of the token
		public string stringValue;
		// numeric value, 0.0 unless a number was parsed into it
		public double doubleValue;

		public Token(TokenTypeEnum type, string value) {
			this.type = type;
			this.stringValue = value;
			this.doubleValue = 0.0;
		}

		// the textual representation of a token is just its text
		public override string ToString() {
			return this.stringValue;
		}
	}
	194
	195
	/// <summary>
	/// Splits the lines delivered by a reader into tokens. The tokens of one line are
	/// buffered; the next line is read as soon as the last buffered token was consumed.
	/// Each line is terminated by NewlineToken, each separator character yields SeparatorToken,
	/// and fields that are empty after trimming yield no token at all.
	/// </summary>
	internal class Tokenizer {
		// text of the separator token (only used for display, e.g. in error messages)
		private const string INTERNAL_SEPARATOR = "#";

		private readonly StreamReader reader;
		private readonly NumberFormatInfo numberFormatInfo;
		private readonly char separator;
		// tokens of the current line; entries before 'position' have already been consumed
		private readonly List<Token> tokens;
		// index of the next token to hand out (avoids removing from the front of the list)
		private int position;

		/// <summary>Number of lines read from the reader so far.</summary>
		public int CurrentLineNumber { get; private set; }

		/// <summary>Text of the line that was read most recently.</summary>
		public string CurrentLine { get; private set; }

		/// <summary>The single token instance that marks the end of every line.</summary>
		public Token NewlineToken { get; private set; }

		/// <summary>The single token instance that stands for every separator character.</summary>
		public Token SeparatorToken { get; private set; }

		/// <summary>Creates a tokenizer and immediately reads the first line (if there is one).</summary>
		/// <param name="reader">Source of the lines; it is not disposed by the tokenizer.</param>
		/// <param name="numberFormatInfo">Number format used to recognize numeric fields.</param>
		/// <param name="separator">Character that separates the fields of a line.</param>
		public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
			this.reader = reader;
			this.numberFormatInfo = numberFormatInfo;
			this.separator = separator;
			SeparatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
			NewlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
			tokens = new List<Token>();
			position = 0;
			ReadNextTokens();
		}

		// Reads the next line (if any) and appends its tokens followed by NewlineToken.
		private void ReadNextTokens() {
			if (reader.EndOfStream) {
				return;
			}

			CurrentLine = reader.ReadLine();
			// Separator tokens are created directly while splitting. Field text is never
			// compared with INTERNAL_SEPARATOR, so a field that consists of "#" stays a field.
			StringBuilder field = new StringBuilder();
			foreach (char c in CurrentLine) {
				if (c == separator) {
					AddFieldToken(field.ToString());
					field.Length = 0;
					tokens.Add(SeparatorToken);
				} else {
					field.Append(c);
				}
			}
			AddFieldToken(field.ToString());

			tokens.Add(NewlineToken);
			CurrentLineNumber++;
		}

		// Appends a token for the field unless it is empty after trimming.
		private void AddFieldToken(string rawField) {
			string trimmed = rawField.Trim();
			if (!string.IsNullOrEmpty(trimmed)) {
				tokens.Add(MakeToken(trimmed));
			}
		}

		// Creates a Double token if the text is a number in the configured format, otherwise a String token.
		private Token MakeToken(string strToken) {
			Token token = new Token(TokenTypeEnum.String, strToken);
			if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
				token.type = TokenTypeEnum.Double;
			}
			// if the text couldn't be parsed as a number the token remains a string token
			return token;
		}

		/// <summary>Returns the next token without consuming it.</summary>
		/// <exception cref="ArgumentOutOfRangeException">Thrown if no token is left.</exception>
		public Token Peek() {
			return tokens[position];
		}

		/// <summary>Consumes and returns the next token; reads the following line if the buffer ran empty.</summary>
		/// <exception cref="ArgumentOutOfRangeException">Thrown if no token is left.</exception>
		public Token Next() {
			Token next = tokens[position];
			position++;
			if (position == tokens.Count) {
				// all tokens of the current line are consumed => continue with the next line
				tokens.Clear();
				position = 0;
				ReadNextTokens();
			}
			return next;
		}

		/// <summary>Returns true if there are buffered tokens or unread lines.</summary>
		public bool HasNext() {
			return position < tokens.Count || !reader.EndOfStream;
		}
	}
	294	#endregion
	295
	296	#region parsing
	// Parses the optional header line and all value lines of the file behind the tokenizer.
	// Raises an error if there is nothing left after the header or if no row could be read.
	private void Parse() {
		const string noValuesMessage = "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).";

		ParseVariableNames();
		if (!tokenizer.HasNext()) {
			Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
		}

		ParseValues();
		if (rowValues.Count == 0) {
			Error(noValuesMessage, "", tokenizer.CurrentLineNumber);
		}
	}
	303
	// Reads all remaining lines as rows of values and appends them to rowValues.
	// Raises an error if a row has a different number of values than the first row.
	private void ParseValues() {
		while (tokenizer.HasNext()) {
			// remember the number of the line now: consuming its newline token below makes the
			// tokenizer read the following line, which already advances CurrentLineNumber
			int lineNumber = tokenizer.CurrentLineNumber;

			List<double> row = new List<double>();
			row.Add(NextValue(tokenizer));
			while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
				Expect(tokenizer.SeparatorToken);
				row.Add(NextValue(tokenizer));
			}
			Expect(tokenizer.NewlineToken);

			// all rows have to have the same number of values
			// the first row defines how many values are needed
			if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
				Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
					"\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
			}

			// add the current row to the collection of rows
			rowValues.Add(row);
		}
	}
	324
	// Returns the value of the next field.
	// A missing field (the next token is a separator or the end of the line) yields NaN
	// without consuming anything; a field that is not a number is consumed and yields NaN.
	private double NextValue(Tokenizer tokenizer) {
		Token upcoming = tokenizer.Peek();
		bool fieldIsMissing = upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken;
		if (fieldIsMissing) {
			return double.NaN;
		}

		Token current = tokenizer.Next();
		switch (current.type) {
			case TokenTypeEnum.Double:
				// just take the value
				return current.doubleValue;
			case TokenTypeEnum.Separator:
			case TokenTypeEnum.String:
				return double.NaN;
			default:
				// found an unexpected token => throw error
				Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
				// this line is never executed because Error() throws an exception
				throw new InvalidOperationException();
		}
	}
	339
	// Reads the first line as variable names unless it starts with a number.
	private void ParseVariableNames() {
		// a first line that starts with a double value is treated as data, not as header
		if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) {
			return;
		}

		List<Token> headerTokens = new List<Token>();
		Token lastToken = tokenizer.Next();
		headerTokens.Add(lastToken);
		while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
			Expect(tokenizer.SeparatorToken);
			lastToken = tokenizer.Next();
			// a separator at the end of the line is directly followed by the newline token
			if (lastToken != tokenizer.NewlineToken) {
				headerTokens.Add(lastToken);
			}
		}

		// consume the end of the header line unless the loop above already did
		if (lastToken != tokenizer.NewlineToken) {
			Expect(tokenizer.NewlineToken);
		}

		List<string> names = new List<string>();
		foreach (Token headerToken in headerTokens) {
			names.Add(headerToken.stringValue.Trim());
		}
		variableNames = names;
	}
	362
	// Consumes the next token and raises an error if it is not the expected token instance.
	private void Expect(Token expectedToken) {
		Token actualToken = tokenizer.Next();
		if (actualToken == expectedToken) {
			return;
		}
		Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
	}
	369
	// Throws a DataFormatException for the given message, offending token and line number.
	private void Error(string message, string token, int lineNumber) {
		string fullMessage = "Error while parsing.\n" + message;
		throw new DataFormatException(fullMessage, token, lineNumber);
	}
	373	#endregion
	374	}
	375	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences