Context Navigation

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/TableFileParser.cs @ 7005

Visit:

Last change on this file since 7005 was 5445, checked in by swagner, 14 years ago
Updated year of copyrights (#1406)
File size: 13.5 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
[5445]	3	* Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections.Generic;
	24	using System.Globalization;
	25	using System.IO;
[2446]	26	using System.Linq;
	27	using System.Text;
[2]	28
[3373]	29	namespace HeuristicLab.Problems.DataAnalysis {
[5013]	30	public class TableFileParser {
	31	private const int BUFFER_SIZE = 1024;
	32	private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
[2]	33	private Tokenizer tokenizer;
[3264]	34	private List<List<double>> rowValues;
[2]	35
	36	private int rows;
	37	public int Rows {
	38	get { return rows; }
	39	set { rows = value; }
	40	}
	41
	42	private int columns;
	43	public int Columns {
	44	get { return columns; }
	45	set { columns = value; }
	46	}
	47
[3264]	48	private double[,] values;
	49	public double[,] Values {
[2]	50	get {
[3264]	51	return values;
[2]	52	}
	53	}
	54
[5369]	55	private List<string> variableNames;
[3264]	56	public IEnumerable<string> VariableNames {
[2]	57	get {
[3264]	58	if (variableNames.Count > 0) return variableNames;
	59	else {
[273]	60	string[] names = new string[columns];
[1221]	61	for (int i = 0; i < names.Length; i++) {
[273]	62	names[i] = "X" + i.ToString("000");
	63	}
	64	return names;
[2]	65	}
	66	}
	67	}
	68
[5013]	69	public TableFileParser() {
[3264]	70	rowValues = new List<List<double>>();
	71	variableNames = new List<string>();
[2]	72	}
	73
[3264]	74	public void Parse(string fileName) {
[5013]	75	NumberFormatInfo numberFormat;
	76	char separator;
	77	DetermineFileFormat(fileName, out numberFormat, out separator);
	78	using (StreamReader reader = new StreamReader(fileName)) {
	79	tokenizer = new Tokenizer(reader, numberFormat, separator);
	80	// parse the file
	81	Parse();
	82	}
	83
[2]	84	// translate the list of samples into a DoubleMatrixData item
[3264]	85	rows = rowValues.Count;
	86	columns = rowValues[0].Count;
	87	values = new double[rows, columns];
[2]	88
[3264]	89	int rowIndex = 0;
	90	int columnIndex = 0;
	91	foreach (List<double> row in rowValues) {
	92	columnIndex = 0;
[1221]	93	foreach (double element in row) {
[3264]	94	values[rowIndex, columnIndex++] = element;
[2]	95	}
[3264]	96	rowIndex++;
[2]	97	}
	98	}
	99
[5013]	100	private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out char separator) {
	101	using (StreamReader reader = new StreamReader(fileName)) {
	102	// skip first line
	103	reader.ReadLine();
	104	// read a block
	105	char[] buffer = new char[BUFFER_SIZE];
	106	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	107	// count frequency of special characters
	108	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	109	.GroupBy(c => c)
	110	.ToDictionary(g => g.Key, g => g.Count());
	111
	112	// depending on the characters occuring in the block
	113	// we distinghish a number of different cases based on the the following rules:
	114	// many points => it must be English number format, the other frequently occuring char is the separator
	115	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	116	// => check the line in more detail:
	117	// English: 0, 0, 0, 0
	118	// German: 0,0 0,0 0,0 ...
	119	// => if commas are followed by space => English format
	120	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	121	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	122	if (OccurrencesOf(charCounts, '.') > 10) {
	123	numberFormat = NumberFormatInfo.InvariantInfo;
	124	separator = POSSIBLE_SEPARATORS
	125	.Where(c => OccurrencesOf(charCounts, c) > 10)
	126	.OrderBy(c => -OccurrencesOf(charCounts, c))
	127	.DefaultIfEmpty(' ')
	128	.First();
	129	} else if (OccurrencesOf(charCounts, ',') > 10) {
	130	// no points and many commas
	131	int countCommaNonDigitPairs = 0;
	132	for (int i = 0; i < charsRead - 1; i++) {
	133	if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
	134	countCommaNonDigitPairs++;
	135	}
[1221]	136	}
[5013]	137	if (countCommaNonDigitPairs > 10) {
	138	// English format (only integer values) with ',' as separator
	139	numberFormat = NumberFormatInfo.InvariantInfo;
	140	separator = ',';
	141	} else {
	142	char[] disallowedSeparators = new char[] { ',' };
	143	// German format (real values)
[5096]	144	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
[5013]	145	separator = POSSIBLE_SEPARATORS
	146	.Except(disallowedSeparators)
	147	.Where(c => OccurrencesOf(charCounts, c) > 10)
	148	.OrderBy(c => -OccurrencesOf(charCounts, c))
	149	.DefaultIfEmpty(' ')
	150	.First();
[405]	151	}
[5013]	152	} else {
	153	// no points and no commas => English format
	154	numberFormat = NumberFormatInfo.InvariantInfo;
	155	separator = POSSIBLE_SEPARATORS
	156	.Where(c => OccurrencesOf(charCounts, c) > 10)
	157	.OrderBy(c => -OccurrencesOf(charCounts, c))
	158	.DefaultIfEmpty(' ')
	159	.First();
[405]	160	}
	161	}
	162	}
	163
[5013]	164	private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	165	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	166	}
	167
[2]	168	#region tokenizer
	169	internal enum TokenTypeEnum {
[3264]	170	NewLine, Separator, String, Double
[2]	171	}
	172
	173	internal class Token {
	174	public TokenTypeEnum type;
	175	public string stringValue;
	176	public double doubleValue;
	177
	178	public Token(TokenTypeEnum type, string value) {
	179	this.type = type;
	180	stringValue = value;
	181	doubleValue = 0.0;
	182	}
	183
	184	public override string ToString() {
	185	return stringValue;
	186	}
	187	}
	188
	189
[3264]	190	internal class Tokenizer {
[2]	191	private StreamReader reader;
	192	private List<Token> tokens;
[405]	193	private NumberFormatInfo numberFormatInfo;
[5013]	194	private char separator;
	195	private const string INTERNAL_SEPARATOR = "#";
[2]	196
[3264]	197	private int currentLineNumber = 0;
	198	public int CurrentLineNumber {
	199	get { return currentLineNumber; }
	200	private set { currentLineNumber = value; }
	201	}
	202	private string currentLine;
	203	public string CurrentLine {
	204	get { return currentLine; }
	205	private set { currentLine = value; }
	206	}
[2]	207
[3264]	208	private Token newlineToken;
	209	public Token NewlineToken {
	210	get { return newlineToken; }
	211	private set { newlineToken = value; }
	212	}
	213	private Token separatorToken;
	214	public Token SeparatorToken {
	215	get { return separatorToken; }
	216	private set { separatorToken = value; }
	217	}
[2]	218
[3264]	219	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
[2]	220	this.reader = reader;
[405]	221	this.numberFormatInfo = numberFormatInfo;
[5013]	222	this.separator = separator;
	223	separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
[3264]	224	newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
[2]	225	tokens = new List<Token>();
	226	ReadNextTokens();
	227	}
	228
	229	private void ReadNextTokens() {
[1221]	230	if (!reader.EndOfStream) {
[2]	231	CurrentLine = reader.ReadLine();
[2446]	232	var newTokens = from str in Split(CurrentLine)
	233	let trimmedStr = str.Trim()
	234	where !string.IsNullOrEmpty(trimmedStr)
[5013]	235	select MakeToken(trimmedStr);
[2]	236
[2446]	237	tokens.AddRange(newTokens);
[2]	238	tokens.Add(NewlineToken);
	239	CurrentLineNumber++;
	240	}
	241	}
	242
[2446]	243	private IEnumerable<string> Split(string line) {
	244	StringBuilder subStr = new StringBuilder();
	245	foreach (char c in line) {
[5013]	246	if (c == separator) {
[2446]	247	yield return subStr.ToString();
	248	subStr = new StringBuilder();
[5013]	249	// all separator characters are transformed to the internally used separator character
	250	yield return INTERNAL_SEPARATOR;
[2446]	251	} else {
	252	subStr.Append(c);
	253	}
	254	}
	255	yield return subStr.ToString();
	256	}
	257
[2]	258	private Token MakeToken(string strToken) {
[406]	259	Token token = new Token(TokenTypeEnum.String, strToken);
[5013]	260	if (strToken.Equals(INTERNAL_SEPARATOR)) {
[2446]	261	return SeparatorToken;
[1221]	262	} else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
[406]	263	token.type = TokenTypeEnum.Double;
	264	return token;
[2]	265	}
[2446]	266
[406]	267	// couldn't parse the token as an int or float number so return a string token
	268	return token;
[2]	269	}
	270
	271	public Token Peek() {
	272	return tokens[0];
	273	}
	274
	275	public Token Next() {
	276	Token next = tokens[0];
	277	tokens.RemoveAt(0);
[1221]	278	if (tokens.Count == 0) {
[2]	279	ReadNextTokens();
	280	}
	281	return next;
	282	}
	283
	284	public bool HasNext() {
	285	return tokens.Count > 0 \|\| !reader.EndOfStream;
	286	}
	287	}
	288	#endregion
	289
	290	#region parsing
[3264]	291	private void Parse() {
	292	ParseVariableNames();
[1221]	293	if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[3264]	294	ParseValues();
	295	if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[2]	296	}
	297
[3264]	298	private void ParseValues() {
[1221]	299	while (tokenizer.HasNext()) {
[2446]	300	List<double> row = new List<double>();
[3264]	301	row.Add(NextValue(tokenizer));
	302	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	303	Expect(tokenizer.SeparatorToken);
	304	row.Add(NextValue(tokenizer));
[2446]	305	}
[3264]	306	Expect(tokenizer.NewlineToken);
	307	// all rows have to have the same number of values
	308	// the first row defines how many samples are needed
	309	if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
	310	Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
	311	"\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
[2446]	312	}
	313	// add the current row to the collection of rows and start a new row
[3264]	314	rowValues.Add(row);
[2446]	315	row = new List<double>();
	316	}
	317	}
	318
[3264]	319	private double NextValue(Tokenizer tokenizer) {
	320	if (tokenizer.Peek() == tokenizer.SeparatorToken \|\| tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
[2446]	321	Token current = tokenizer.Next();
[3264]	322	if (current.type == TokenTypeEnum.Separator \|\| current.type == TokenTypeEnum.String) {
[2446]	323	return double.NaN;
	324	} else if (current.type == TokenTypeEnum.Double) {
	325	// just take the value
	326	return current.doubleValue;
[2]	327	}
[3264]	328	// found an unexpected token => throw error
	329	Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
	330	// this line is never executed because Error() throws an exception
	331	throw new InvalidOperationException();
[2]	332	}
	333
[3264]	334	private void ParseVariableNames() {
	335	// if the first line doesn't start with a double value then we assume that the
	336	// first line contains variable names
	337	if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
[2446]	338
[2]	339	List<Token> tokens = new List<Token>();
[1221]	340	Token valueToken;
	341	valueToken = tokenizer.Next();
[2446]	342	tokens.Add(valueToken);
[3264]	343	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	344	Expect(tokenizer.SeparatorToken);
[2]	345	valueToken = tokenizer.Next();
[3264]	346	if (valueToken != tokenizer.NewlineToken) {
[2446]	347	tokens.Add(valueToken);
	348	}
[2]	349	}
[3264]	350	if (valueToken != tokenizer.NewlineToken) {
	351	Expect(tokenizer.NewlineToken);
[2446]	352	}
[3264]	353	variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
[2]	354	}
	355	}
	356
	357	private void Expect(Token expectedToken) {
	358	Token actualToken = tokenizer.Next();
[1221]	359	if (actualToken != expectedToken) {
[273]	360	Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
[2]	361	}
	362	}
	363
[273]	364	private void Error(string message, string token, int lineNumber) {
	365	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
[2]	366	}
	367	#endregion
	368	}
	369	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences