Context Navigation

TableFileParser.cs @ 14545

Visit:

Last change on this file since 14545 was 6760, checked in by epitzer, 13 years ago
#1530 integrate changes from trunk
File size: 16.2 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
[5445]	3	* Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
[6760]	23	using System.Collections;
[2]	24	using System.Collections.Generic;
	25	using System.Globalization;
	26	using System.IO;
[2446]	27	using System.Linq;
[5484]	28	using System.Runtime.Serialization;
[2446]	29	using System.Text;
[2]	30
[3373]	31	namespace HeuristicLab.Problems.DataAnalysis {
[5013]	32	public class TableFileParser {
	33	private const int BUFFER_SIZE = 1024;
	34	private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
[2]	35	private Tokenizer tokenizer;
[6760]	36	private List<List<object>> rowValues;
[2]	37
	38	private int rows;
	39	public int Rows {
	40	get { return rows; }
	41	set { rows = value; }
	42	}
	43
	44	private int columns;
	45	public int Columns {
	46	get { return columns; }
	47	set { columns = value; }
	48	}
	49
[6760]	50	private List<IList> values;
	51	public List<IList> Values {
[2]	52	get {
[3264]	53	return values;
[2]	54	}
	55	}
	56
[5369]	57	private List<string> variableNames;
[3264]	58	public IEnumerable<string> VariableNames {
[2]	59	get {
[3264]	60	if (variableNames.Count > 0) return variableNames;
	61	else {
[273]	62	string[] names = new string[columns];
[1221]	63	for (int i = 0; i < names.Length; i++) {
[273]	64	names[i] = "X" + i.ToString("000");
	65	}
	66	return names;
[2]	67	}
	68	}
	69	}
	70
[5013]	71	public TableFileParser() {
[6760]	72	rowValues = new List<List<object>>();
[3264]	73	variableNames = new List<string>();
[2]	74	}
	75
[3264]	76	public void Parse(string fileName) {
[5013]	77	NumberFormatInfo numberFormat;
[6760]	78	DateTimeFormatInfo dateTimeFormatInfo;
[5013]	79	char separator;
[6760]	80	DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
[5013]	81	using (StreamReader reader = new StreamReader(fileName)) {
[6760]	82	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[5013]	83	// parse the file
	84	Parse();
	85	}
	86
[2]	87	// translate the list of samples into a DoubleMatrixData item
[3264]	88	rows = rowValues.Count;
	89	columns = rowValues[0].Count;
[6760]	90	values = new List<IList>();
[2]	91
[6760]	92	//create columns
	93	for (int col = 0; col < columns; col++) {
	94	var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType());
	95	if (!types.Any()) {
	96	values.Add(new List<string>());
	97	continue;
[2]	98	}
[6760]	99
	100	var columnType = types.GroupBy(v => v).OrderBy(v => v).Last().Key;
	101	if (columnType == typeof(double)) values.Add(new List<double>());
	102	else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
	103	else if (columnType == typeof(string)) values.Add(new List<string>());
	104	else throw new InvalidOperationException();
[2]	105	}
[6760]	106
	107
	108
	109	//fill with values
	110	foreach (List<object> row in rowValues) {
	111	int columnIndex = 0;
	112	foreach (object element in row) {
	113	//handle missing values with default values
	114	if (element as string == string.Empty) {
	115	if (values[columnIndex] is List<double>) values[columnIndex].Add(double.NaN);
	116	else if (values[columnIndex] is List<DateTime>) values[columnIndex].Add(DateTime.MinValue);
	117	else if (values[columnIndex] is List<string>) values[columnIndex].Add(string.Empty);
	118	else throw new InvalidOperationException();
	119	} else values[columnIndex].Add(element);
	120	columnIndex++;
	121	}
	122	}
[2]	123	}
	124
[6760]	125	private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
[5013]	126	using (StreamReader reader = new StreamReader(fileName)) {
	127	// skip first line
	128	reader.ReadLine();
	129	// read a block
	130	char[] buffer = new char[BUFFER_SIZE];
	131	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	132	// count frequency of special characters
	133	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	134	.GroupBy(c => c)
	135	.ToDictionary(g => g.Key, g => g.Count());
	136
	137	// depending on the characters occuring in the block
	138	// we distinghish a number of different cases based on the the following rules:
	139	// many points => it must be English number format, the other frequently occuring char is the separator
	140	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	141	// => check the line in more detail:
	142	// English: 0, 0, 0, 0
	143	// German: 0,0 0,0 0,0 ...
	144	// => if commas are followed by space => English format
	145	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	146	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	147	if (OccurrencesOf(charCounts, '.') > 10) {
	148	numberFormat = NumberFormatInfo.InvariantInfo;
[6760]	149	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]	150	separator = POSSIBLE_SEPARATORS
	151	.Where(c => OccurrencesOf(charCounts, c) > 10)
	152	.OrderBy(c => -OccurrencesOf(charCounts, c))
	153	.DefaultIfEmpty(' ')
	154	.First();
	155	} else if (OccurrencesOf(charCounts, ',') > 10) {
	156	// no points and many commas
	157	int countCommaNonDigitPairs = 0;
	158	for (int i = 0; i < charsRead - 1; i++) {
	159	if (buffer[i] == ',' && !Char.IsDigit(buffer[i + 1])) {
	160	countCommaNonDigitPairs++;
	161	}
[1221]	162	}
[5013]	163	if (countCommaNonDigitPairs > 10) {
	164	// English format (only integer values) with ',' as separator
	165	numberFormat = NumberFormatInfo.InvariantInfo;
[6760]	166	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]	167	separator = ',';
	168	} else {
	169	char[] disallowedSeparators = new char[] { ',' };
	170	// German format (real values)
[5096]	171	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
[6760]	172	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
[5013]	173	separator = POSSIBLE_SEPARATORS
	174	.Except(disallowedSeparators)
	175	.Where(c => OccurrencesOf(charCounts, c) > 10)
	176	.OrderBy(c => -OccurrencesOf(charCounts, c))
	177	.DefaultIfEmpty(' ')
	178	.First();
[405]	179	}
[5013]	180	} else {
	181	// no points and no commas => English format
	182	numberFormat = NumberFormatInfo.InvariantInfo;
[6760]	183	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]	184	separator = POSSIBLE_SEPARATORS
	185	.Where(c => OccurrencesOf(charCounts, c) > 10)
	186	.OrderBy(c => -OccurrencesOf(charCounts, c))
	187	.DefaultIfEmpty(' ')
	188	.First();
[405]	189	}
	190	}
	191	}
	192
[5013]	193	private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	194	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	195	}
	196
[2]	197	#region tokenizer
	198	internal enum TokenTypeEnum {
[6760]	199	NewLine, Separator, String, Double, DateTime
[2]	200	}
	201
	202	internal class Token {
	203	public TokenTypeEnum type;
	204	public string stringValue;
	205	public double doubleValue;
[6760]	206	public DateTime dateTimeValue;
[2]	207
	208	public Token(TokenTypeEnum type, string value) {
	209	this.type = type;
	210	stringValue = value;
[6760]	211	dateTimeValue = DateTime.MinValue;
[2]	212	doubleValue = 0.0;
	213	}
	214
	215	public override string ToString() {
	216	return stringValue;
	217	}
	218	}
	219
	220
[3264]	221	internal class Tokenizer {
[2]	222	private StreamReader reader;
	223	private List<Token> tokens;
[405]	224	private NumberFormatInfo numberFormatInfo;
[6760]	225	private DateTimeFormatInfo dateTimeFormatInfo;
[5013]	226	private char separator;
	227	private const string INTERNAL_SEPARATOR = "#";
[2]	228
[3264]	229	private int currentLineNumber = 0;
	230	public int CurrentLineNumber {
	231	get { return currentLineNumber; }
	232	private set { currentLineNumber = value; }
	233	}
	234	private string currentLine;
	235	public string CurrentLine {
	236	get { return currentLine; }
	237	private set { currentLine = value; }
	238	}
[2]	239
[3264]	240	private Token newlineToken;
	241	public Token NewlineToken {
	242	get { return newlineToken; }
	243	private set { newlineToken = value; }
	244	}
	245	private Token separatorToken;
	246	public Token SeparatorToken {
	247	get { return separatorToken; }
	248	private set { separatorToken = value; }
	249	}
[2]	250
[6760]	251	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
[2]	252	this.reader = reader;
[405]	253	this.numberFormatInfo = numberFormatInfo;
[6760]	254	this.dateTimeFormatInfo = dateTimeFormatInfo;
[5013]	255	this.separator = separator;
	256	separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
[3264]	257	newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
[2]	258	tokens = new List<Token>();
	259	ReadNextTokens();
	260	}
	261
	262	private void ReadNextTokens() {
[1221]	263	if (!reader.EndOfStream) {
[2]	264	CurrentLine = reader.ReadLine();
[2446]	265	var newTokens = from str in Split(CurrentLine)
	266	let trimmedStr = str.Trim()
	267	where !string.IsNullOrEmpty(trimmedStr)
[5013]	268	select MakeToken(trimmedStr);
[2]	269
[2446]	270	tokens.AddRange(newTokens);
[2]	271	tokens.Add(NewlineToken);
	272	CurrentLineNumber++;
	273	}
	274	}
	275
[2446]	276	private IEnumerable<string> Split(string line) {
	277	StringBuilder subStr = new StringBuilder();
	278	foreach (char c in line) {
[5013]	279	if (c == separator) {
[2446]	280	yield return subStr.ToString();
	281	subStr = new StringBuilder();
[5013]	282	// all separator characters are transformed to the internally used separator character
	283	yield return INTERNAL_SEPARATOR;
[2446]	284	} else {
	285	subStr.Append(c);
	286	}
	287	}
	288	yield return subStr.ToString();
	289	}
	290
[2]	291	private Token MakeToken(string strToken) {
[406]	292	Token token = new Token(TokenTypeEnum.String, strToken);
[5013]	293	if (strToken.Equals(INTERNAL_SEPARATOR)) {
[2446]	294	return SeparatorToken;
[1221]	295	} else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
[406]	296	token.type = TokenTypeEnum.Double;
	297	return token;
[6760]	298	} else if (DateTime.TryParse(strToken, out token.dateTimeValue)) {
	299	token.type = TokenTypeEnum.DateTime;
	300	return token;
[2]	301	}
[2446]	302
[6760]	303	// couldn't parse the token as an int or float number or datetime value so return a string token
[406]	304	return token;
[2]	305	}
	306
	307	public Token Peek() {
	308	return tokens[0];
	309	}
	310
	311	public Token Next() {
	312	Token next = tokens[0];
	313	tokens.RemoveAt(0);
[1221]	314	if (tokens.Count == 0) {
[2]	315	ReadNextTokens();
	316	}
	317	return next;
	318	}
	319
	320	public bool HasNext() {
	321	return tokens.Count > 0 \|\| !reader.EndOfStream;
	322	}
	323	}
	324	#endregion
	325
	326	#region parsing
[3264]	327	private void Parse() {
	328	ParseVariableNames();
[1221]	329	if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[3264]	330	ParseValues();
	331	if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[2]	332	}
	333
[3264]	334	private void ParseValues() {
[1221]	335	while (tokenizer.HasNext()) {
[6760]	336	if (tokenizer.Peek() == tokenizer.NewlineToken) {
	337	tokenizer.Next();
	338	} else {
	339	List<object> row = new List<object>();
	340	object value = NextValue(tokenizer);
	341	row.Add(value);
	342	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	343	Expect(tokenizer.SeparatorToken);
	344	row.Add(NextValue(tokenizer));
	345	}
	346	Expect(tokenizer.NewlineToken);
	347	// all rows have to have the same number of values
	348	// the first row defines how many samples are needed
	349	if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
	350	Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
	351	"\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
	352	tokenizer.CurrentLineNumber);
	353	}
	354	rowValues.Add(row);
[2446]	355	}
	356	}
	357	}
	358
[6760]	359	private object NextValue(Tokenizer tokenizer) {
	360	if (tokenizer.Peek() == tokenizer.SeparatorToken \|\| tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
[2446]	361	Token current = tokenizer.Next();
[6760]	362	if (current.type == TokenTypeEnum.Separator) {
[2446]	363	return double.NaN;
[6760]	364	} else if (current.type == TokenTypeEnum.String) {
	365	return current.stringValue;
[2446]	366	} else if (current.type == TokenTypeEnum.Double) {
	367	return current.doubleValue;
[6760]	368	} else if (current.type == TokenTypeEnum.DateTime) {
	369	return current.dateTimeValue;
[2]	370	}
[3264]	371	// found an unexpected token => throw error
	372	Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
	373	// this line is never executed because Error() throws an exception
	374	throw new InvalidOperationException();
[2]	375	}
	376
[3264]	377	private void ParseVariableNames() {
[6760]	378	//if first token is double no variables names are given
	379	if (tokenizer.Peek().type == TokenTypeEnum.Double) return;
[2446]	380
[6760]	381	// the first line must contain variable names
	382	List<Token> tokens = new List<Token>();
	383	Token valueToken;
	384	valueToken = tokenizer.Next();
	385	tokens.Add(valueToken);
	386	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	387	Expect(tokenizer.SeparatorToken);
[1221]	388	valueToken = tokenizer.Next();
[3264]	389	if (valueToken != tokenizer.NewlineToken) {
[6760]	390	tokens.Add(valueToken);
[2446]	391	}
[2]	392	}
[6760]	393	if (valueToken != tokenizer.NewlineToken) {
	394	Expect(tokenizer.NewlineToken);
	395	}
	396	variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
[2]	397	}
	398
	399	private void Expect(Token expectedToken) {
	400	Token actualToken = tokenizer.Next();
[1221]	401	if (actualToken != expectedToken) {
[273]	402	Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
[2]	403	}
	404	}
	405
[273]	406	private void Error(string message, string token, int lineNumber) {
	407	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
[2]	408	}
	409	#endregion
[5484]	410
	411	[Serializable]
	412	private class DataFormatException : Exception {
	413	private int line;
	414	public int Line {
	415	get { return line; }
	416	}
	417	private string token;
	418	public string Token {
	419	get { return token; }
	420	}
	421	public DataFormatException(string message, string token, int line)
	422	: base(message + "\nToken: " + token + " (line: " + line + ")") {
	423	this.token = token;
	424	this.line = line;
	425	}
	426
	427	public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
	428	}
[2]	429	}
	430	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/PersistenceSpeedUp/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 14545

Download in other formats: