Context Navigation

source: branches/GP-MoveOperators/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 10149

Visit:

Last change on this file since 10149 was 8660, checked in by gkronber, 12 years ago
#1847 merged r8205:8635 from trunk into branch
File size: 18.5 KB

Rev	Line
[7849]	1	#region License Information
	2	/* HeuristicLab
	3	* Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22
	23	using System;
	24	using System.Collections;
	25	using System.Collections.Generic;
	26	using System.Globalization;
	27	using System.IO;
	28	using System.Linq;
	29	using System.Runtime.Serialization;
	30	using System.Text;
	31
	32	namespace HeuristicLab.Problems.Instances.DataAnalysis {
	33	public class TableFileParser {
[8660]	34	private const int BUFFER_SIZE = 65536;
[7849]	35	private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
	36	private Tokenizer tokenizer;
	37	private List<List<object>> rowValues;
	38
	39	private int rows;
	40	public int Rows {
	41	get { return rows; }
	42	set { rows = value; }
	43	}
	44
	45	private int columns;
	46	public int Columns {
	47	get { return columns; }
	48	set { columns = value; }
	49	}
	50
	51	private List<IList> values;
	52	public List<IList> Values {
	53	get {
	54	return values;
	55	}
	56	}
	57
	58	private List<string> variableNames;
	59	public IEnumerable<string> VariableNames {
	60	get {
	61	if (variableNames.Count > 0) return variableNames;
	62	else {
	63	string[] names = new string[columns];
	64	for (int i = 0; i < names.Length; i++) {
	65	names[i] = "X" + i.ToString("000");
	66	}
	67	return names;
	68	}
	69	}
	70	}
	71
	72	public TableFileParser() {
	73	rowValues = new List<List<object>>();
	74	variableNames = new List<string>();
	75	}
	76
[7851]	77	/// <summary>
	78	/// Parses a file and determines the format first
	79	/// </summary>
	80	/// <param name="fileName">file which is parsed</param>
	81	public void Parse(string fileName) {
	82	NumberFormatInfo numberFormat;
	83	DateTimeFormatInfo dateTimeFormatInfo;
	84	char separator;
	85	DetermineFileFormat(new FileStream(fileName, FileMode.Open), out numberFormat, out dateTimeFormatInfo, out separator);
	86	Parse(new FileStream(fileName, FileMode.Open), numberFormat, dateTimeFormatInfo, separator);
	87	}
	88
	89	/// <summary>
	90	/// Parses a file with the given formats
	91	/// </summary>
	92	/// <param name="fileName">file which is parsed</param>
	93	/// <param name="numberFormat">Format of numbers</param>
	94	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	95	/// <param name="separator">defines the separator</param>
[7849]	96	public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	97	Parse(new FileStream(fileName, FileMode.Open), numberFormat, dateTimeFormatInfo, separator);
	98	}
	99
[7851]	100	/// <summary>
	101	/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
	102	/// </summary>
	103	/// <param name="stream">stream which is parsed</param>
	104	public void Parse(Stream stream) {
	105	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
	106	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	107	char separator = ',';
	108	Parse(stream, numberFormat, dateTimeFormatInfo, separator);
	109	}
	110
	111	/// <summary>
	112	/// Parses a stream with the given formats.
	113	/// </summary>
	114	/// <param name="stream">Stream which is parsed</param>
	115	/// <param name="numberFormat">Format of numbers</param>
	116	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	117	/// <param name="separator">defines the separator</param>
[7849]	118	public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	119	using (StreamReader reader = new StreamReader(stream)) {
	120	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
	121	// parse the file
	122	Parse();
	123	}
	124
	125	// translate the list of samples into a DoubleMatrixData item
	126	rows = rowValues.Count;
	127	columns = rowValues[0].Count;
	128	values = new List<IList>();
	129
	130	//create columns
	131	for (int col = 0; col < columns; col++) {
	132	var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType());
	133	if (!types.Any()) {
	134	values.Add(new List<string>());
	135	continue;
	136	}
	137
	138	var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
	139	if (columnType == typeof(double)) values.Add(new List<double>());
	140	else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
	141	else if (columnType == typeof(string)) values.Add(new List<string>());
	142	else throw new InvalidOperationException();
	143	}
	144
	145
	146
	147	//fill with values
	148	foreach (List<object> row in rowValues) {
	149	int columnIndex = 0;
	150	foreach (object element in row) {
	151	if (values[columnIndex] is List<double> && !(element is double))
	152	values[columnIndex].Add(double.NaN);
	153	else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
	154	values[columnIndex].Add(DateTime.MinValue);
	155	else if (values[columnIndex] is List<string> && !(element is string))
	156	values[columnIndex].Add(string.Empty);
	157	else
	158	values[columnIndex].Add(element);
	159	columnIndex++;
	160	}
	161	}
	162	}
	163
	164	public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
	165	DetermineFileFormat(new FileStream(path, FileMode.Open), out numberFormat, out dateTimeFormatInfo, out separator);
	166	}
	167
	168	public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
	169	using (StreamReader reader = new StreamReader(stream)) {
	170	// skip first line
	171	reader.ReadLine();
	172	// read a block
	173	char[] buffer = new char[BUFFER_SIZE];
	174	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	175	// count frequency of special characters
	176	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	177	.GroupBy(c => c)
	178	.ToDictionary(g => g.Key, g => g.Count());
	179
	180	// depending on the characters occuring in the block
	181	// we distinghish a number of different cases based on the the following rules:
	182	// many points => it must be English number format, the other frequently occuring char is the separator
	183	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	184	// => check the line in more detail:
	185	// English: 0, 0, 0, 0
	186	// German: 0,0 0,0 0,0 ...
	187	// => if commas are followed by space => English format
	188	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	189	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	190	if (OccurrencesOf(charCounts, '.') > 10) {
	191	numberFormat = NumberFormatInfo.InvariantInfo;
	192	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	193	separator = POSSIBLE_SEPARATORS
	194	.Where(c => OccurrencesOf(charCounts, c) > 10)
	195	.OrderBy(c => -OccurrencesOf(charCounts, c))
	196	.DefaultIfEmpty(' ')
	197	.First();
	198	} else if (OccurrencesOf(charCounts, ',') > 10) {
	199	// no points and many commas
	200	// count the number of tokens (chains of only digits and commas) that contain multiple comma characters
	201	int tokensWithMultipleCommas = 0;
	202	for (int i = 0; i < charsRead; i++) {
	203	int nCommas = 0;
	204	while (i < charsRead && (buffer[i] == ',' \|\| Char.IsDigit(buffer[i]))) {
	205	if (buffer[i] == ',') nCommas++;
	206	i++;
	207	}
	208	if (nCommas > 2) tokensWithMultipleCommas++;
	209	}
	210	if (tokensWithMultipleCommas > 1) {
	211	// English format (only integer values) with ',' as separator
	212	numberFormat = NumberFormatInfo.InvariantInfo;
	213	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	214	separator = ',';
	215	} else {
	216	char[] disallowedSeparators = new char[] { ',' };
	217	// German format (real values)
	218	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
	219	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
	220	separator = POSSIBLE_SEPARATORS
	221	.Except(disallowedSeparators)
	222	.Where(c => OccurrencesOf(charCounts, c) > 10)
	223	.OrderBy(c => -OccurrencesOf(charCounts, c))
	224	.DefaultIfEmpty(' ')
	225	.First();
	226	}
	227	} else {
	228	// no points and no commas => English format
	229	numberFormat = NumberFormatInfo.InvariantInfo;
	230	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	231	separator = POSSIBLE_SEPARATORS
	232	.Where(c => OccurrencesOf(charCounts, c) > 10)
	233	.OrderBy(c => -OccurrencesOf(charCounts, c))
	234	.DefaultIfEmpty(' ')
	235	.First();
	236	}
	237	}
	238	}
	239
	240	private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	241	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	242	}
	243
	244	#region tokenizer
	245	internal enum TokenTypeEnum {
	246	NewLine, Separator, String, Double, DateTime
	247	}
	248
	249	internal class Token {
	250	public TokenTypeEnum type;
	251	public string stringValue;
	252	public double doubleValue;
	253	public DateTime dateTimeValue;
	254
	255	public Token(TokenTypeEnum type, string value) {
	256	this.type = type;
	257	stringValue = value;
	258	dateTimeValue = DateTime.MinValue;
	259	doubleValue = 0.0;
	260	}
	261
	262	public override string ToString() {
	263	return stringValue;
	264	}
	265	}
	266
	267
	268	internal class Tokenizer {
	269	private StreamReader reader;
	270	private List<Token> tokens;
	271	private NumberFormatInfo numberFormatInfo;
	272	private DateTimeFormatInfo dateTimeFormatInfo;
	273	private char separator;
	274	private const string INTERNAL_SEPARATOR = "#";
	275
	276	private int currentLineNumber = 0;
	277	public int CurrentLineNumber {
	278	get { return currentLineNumber; }
	279	private set { currentLineNumber = value; }
	280	}
	281	private string currentLine;
	282	public string CurrentLine {
	283	get { return currentLine; }
	284	private set { currentLine = value; }
	285	}
	286
	287	private Token newlineToken;
	288	public Token NewlineToken {
	289	get { return newlineToken; }
	290	private set { newlineToken = value; }
	291	}
	292	private Token separatorToken;
	293	public Token SeparatorToken {
	294	get { return separatorToken; }
	295	private set { separatorToken = value; }
	296	}
	297
	298	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	299	this.reader = reader;
	300	this.numberFormatInfo = numberFormatInfo;
	301	this.dateTimeFormatInfo = dateTimeFormatInfo;
	302	this.separator = separator;
	303	separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
	304	newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
	305	tokens = new List<Token>();
	306	ReadNextTokens();
	307	}
	308
	309	private void ReadNextTokens() {
	310	if (!reader.EndOfStream) {
	311	CurrentLine = reader.ReadLine();
	312	var newTokens = from str in Split(CurrentLine)
	313	let trimmedStr = str.Trim()
	314	where !string.IsNullOrEmpty(trimmedStr)
	315	select MakeToken(trimmedStr);
	316
	317	tokens.AddRange(newTokens);
	318	tokens.Add(NewlineToken);
	319	CurrentLineNumber++;
	320	}
	321	}
	322
	323	private IEnumerable<string> Split(string line) {
	324	StringBuilder subStr = new StringBuilder();
	325	foreach (char c in line) {
	326	if (c == separator) {
	327	yield return subStr.ToString();
	328	subStr = new StringBuilder();
	329	// all separator characters are transformed to the internally used separator character
	330	yield return INTERNAL_SEPARATOR;
	331	} else {
	332	subStr.Append(c);
	333	}
	334	}
	335	yield return subStr.ToString();
	336	}
	337
	338	private Token MakeToken(string strToken) {
	339	Token token = new Token(TokenTypeEnum.String, strToken);
	340	if (strToken.Equals(INTERNAL_SEPARATOR)) {
	341	return SeparatorToken;
	342	} else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
	343	token.type = TokenTypeEnum.Double;
	344	return token;
	345	} else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
	346	token.type = TokenTypeEnum.DateTime;
	347	return token;
	348	}
	349
	350	// couldn't parse the token as an int or float number or datetime value so return a string token
	351	return token;
	352	}
	353
	354	public Token Peek() {
	355	return tokens[0];
	356	}
	357
	358	public Token Next() {
	359	Token next = tokens[0];
	360	tokens.RemoveAt(0);
	361	if (tokens.Count == 0) {
	362	ReadNextTokens();
	363	}
	364	return next;
	365	}
	366
	367	public bool HasNext() {
	368	return tokens.Count > 0 \|\| !reader.EndOfStream;
	369	}
	370	}
	371	#endregion
	372
	373	#region parsing
	374	private void Parse() {
	375	ParseVariableNames();
	376	if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
	377	ParseValues();
	378	if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
	379	}
	380
	381	private void ParseValues() {
	382	while (tokenizer.HasNext()) {
	383	if (tokenizer.Peek() == tokenizer.NewlineToken) {
	384	tokenizer.Next();
	385	} else {
	386	List<object> row = new List<object>();
	387	object value = NextValue(tokenizer);
	388	row.Add(value);
	389	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	390	Expect(tokenizer.SeparatorToken);
	391	row.Add(NextValue(tokenizer));
	392	}
	393	Expect(tokenizer.NewlineToken);
	394	// all rows have to have the same number of values
	395	// the first row defines how many samples are needed
	396	if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
	397	Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
	398	"\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
	399	tokenizer.CurrentLineNumber);
	400	}
	401	rowValues.Add(row);
	402	}
	403	}
	404	}
	405
	406	private object NextValue(Tokenizer tokenizer) {
	407	if (tokenizer.Peek() == tokenizer.SeparatorToken \|\| tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
	408	Token current = tokenizer.Next();
	409	if (current.type == TokenTypeEnum.Separator) {
	410	return double.NaN;
	411	} else if (current.type == TokenTypeEnum.String) {
	412	return current.stringValue;
	413	} else if (current.type == TokenTypeEnum.Double) {
	414	return current.doubleValue;
	415	} else if (current.type == TokenTypeEnum.DateTime) {
	416	return current.dateTimeValue;
	417	}
	418	// found an unexpected token => throw error
	419	Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
	420	// this line is never executed because Error() throws an exception
	421	throw new InvalidOperationException();
	422	}
	423
	424	private void ParseVariableNames() {
	425	//if first token is double no variables names are given
	426	if (tokenizer.Peek().type == TokenTypeEnum.Double) return;
	427
	428	// the first line must contain variable names
	429	List<Token> tokens = new List<Token>();
	430	Token valueToken;
	431	valueToken = tokenizer.Next();
	432	tokens.Add(valueToken);
	433	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	434	Expect(tokenizer.SeparatorToken);
	435	valueToken = tokenizer.Next();
	436	if (valueToken != tokenizer.NewlineToken) {
	437	tokens.Add(valueToken);
	438	}
	439	}
	440	if (valueToken != tokenizer.NewlineToken) {
	441	Expect(tokenizer.NewlineToken);
	442	}
	443	variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
	444	}
	445
	446	private void Expect(Token expectedToken) {
	447	Token actualToken = tokenizer.Next();
	448	if (actualToken != expectedToken) {
	449	Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
	450	}
	451	}
	452
	453	private void Error(string message, string token, int lineNumber) {
	454	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
	455	}
	456	#endregion
	457
	458	[Serializable]
	459	private class DataFormatException : Exception {
	460	private int line;
	461	public int Line {
	462	get { return line; }
	463	}
	464	private string token;
	465	public string Token {
	466	get { return token; }
	467	}
	468	public DataFormatException(string message, string token, int line)
	469	: base(message + "\nToken: " + token + " (line: " + line + ")") {
	470	this.token = token;
	471	this.line = line;
	472	}
	473
	474	public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
	475	}
	476	}
	477	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences