Context Navigation

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 8564

Visit:

Last change on this file since 8564 was 8564, checked in by gkronber, 12 years ago

added an extension to calculate the range of IEnumerable<double>
increased the buffer size for the heuristic determination of separator characters in the table file parser (to make it work with files that have more than 1024 bytes in the second line).

File size: 18.5 KB

Rev	Line
[7849]	1	#region License Information
	2	/* HeuristicLab
	3	* Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22
	23	using System;
	24	using System.Collections;
	25	using System.Collections.Generic;
	26	using System.Globalization;
	27	using System.IO;
	28	using System.Linq;
	29	using System.Runtime.Serialization;
	30	using System.Text;
	31
	32	namespace HeuristicLab.Problems.Instances.DataAnalysis {
	33	public class TableFileParser {
[8564]	34	private const int BUFFER_SIZE = 65536;
[7849]	35	private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
	36	private Tokenizer tokenizer;
	37	private List<List<object>> rowValues;
	38
	39	private int rows;
	40	public int Rows {
	41	get { return rows; }
	42	set { rows = value; }
	43	}
	44
	45	private int columns;
	46	public int Columns {
	47	get { return columns; }
	48	set { columns = value; }
	49	}
	50
	51	private List<IList> values;
	52	public List<IList> Values {
	53	get {
	54	return values;
	55	}
	56	}
	57
	58	private List<string> variableNames;
	59	public IEnumerable<string> VariableNames {
	60	get {
	61	if (variableNames.Count > 0) return variableNames;
	62	else {
	63	string[] names = new string[columns];
	64	for (int i = 0; i < names.Length; i++) {
	65	names[i] = "X" + i.ToString("000");
	66	}
	67	return names;
	68	}
	69	}
	70	}
	71
	72	public TableFileParser() {
	73	rowValues = new List<List<object>>();
	74	variableNames = new List<string>();
	75	}
	76
[7851]	77	/// <summary>
	78	/// Parses a file and determines the format first
	79	/// </summary>
	80	/// <param name="fileName">file which is parsed</param>
	81	public void Parse(string fileName) {
	82	NumberFormatInfo numberFormat;
	83	DateTimeFormatInfo dateTimeFormatInfo;
	84	char separator;
	85	DetermineFileFormat(new FileStream(fileName, FileMode.Open), out numberFormat, out dateTimeFormatInfo, out separator);
	86	Parse(new FileStream(fileName, FileMode.Open), numberFormat, dateTimeFormatInfo, separator);
	87	}
	88
	89	/// <summary>
	90	/// Parses a file with the given formats
	91	/// </summary>
	92	/// <param name="fileName">file which is parsed</param>
	93	/// <param name="numberFormat">Format of numbers</param>
	94	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	95	/// <param name="separator">defines the separator</param>
[7849]	96	public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	97	Parse(new FileStream(fileName, FileMode.Open), numberFormat, dateTimeFormatInfo, separator);
	98	}
	99
[7851]	100	/// <summary>
	101	/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
	102	/// </summary>
	103	/// <param name="stream">stream which is parsed</param>
	104	public void Parse(Stream stream) {
	105	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
	106	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	107	char separator = ',';
	108	Parse(stream, numberFormat, dateTimeFormatInfo, separator);
	109	}
	110
	111	/// <summary>
	112	/// Parses a stream with the given formats.
	113	/// </summary>
	114	/// <param name="stream">Stream which is parsed</param>
	115	/// <param name="numberFormat">Format of numbers</param>
	116	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	117	/// <param name="separator">defines the separator</param>
[7849]	118	public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	119	using (StreamReader reader = new StreamReader(stream)) {
	120	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
	121	// parse the file
	122	Parse();
	123	}
	124
	125	// translate the list of samples into a DoubleMatrixData item
	126	rows = rowValues.Count;
	127	columns = rowValues[0].Count;
	128	values = new List<IList>();
	129
	130	//create columns
	131	for (int col = 0; col < columns; col++) {
	132	var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType());
	133	if (!types.Any()) {
	134	values.Add(new List<string>());
	135	continue;
	136	}
	137
	138	var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
	139	if (columnType == typeof(double)) values.Add(new List<double>());
	140	else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
	141	else if (columnType == typeof(string)) values.Add(new List<string>());
	142	else throw new InvalidOperationException();
	143	}
	144
	145
	146
	147	//fill with values
	148	foreach (List<object> row in rowValues) {
	149	int columnIndex = 0;
	150	foreach (object element in row) {
	151	if (values[columnIndex] is List<double> && !(element is double))
	152	values[columnIndex].Add(double.NaN);
	153	else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
	154	values[columnIndex].Add(DateTime.MinValue);
	155	else if (values[columnIndex] is List<string> && !(element is string))
	156	values[columnIndex].Add(string.Empty);
	157	else
	158	values[columnIndex].Add(element);
	159	columnIndex++;
	160	}
	161	}
	162	}
	163
	164	public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
	165	DetermineFileFormat(new FileStream(path, FileMode.Open), out numberFormat, out dateTimeFormatInfo, out separator);
	166	}
	167
	168	public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
	169	using (StreamReader reader = new StreamReader(stream)) {
	170	// skip first line
	171	reader.ReadLine();
	172	// read a block
	173	char[] buffer = new char[BUFFER_SIZE];
	174	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	175	// count frequency of special characters
	176	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	177	.GroupBy(c => c)
	178	.ToDictionary(g => g.Key, g => g.Count());
	179
	180	// depending on the characters occuring in the block
	181	// we distinghish a number of different cases based on the the following rules:
	182	// many points => it must be English number format, the other frequently occuring char is the separator
	183	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	184	// => check the line in more detail:
	185	// English: 0, 0, 0, 0
	186	// German: 0,0 0,0 0,0 ...
	187	// => if commas are followed by space => English format
	188	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	189	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	190	if (OccurrencesOf(charCounts, '.') > 10) {
	191	numberFormat = NumberFormatInfo.InvariantInfo;
	192	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	193	separator = POSSIBLE_SEPARATORS
	194	.Where(c => OccurrencesOf(charCounts, c) > 10)
	195	.OrderBy(c => -OccurrencesOf(charCounts, c))
	196	.DefaultIfEmpty(' ')
	197	.First();
	198	} else if (OccurrencesOf(charCounts, ',') > 10) {
	199	// no points and many commas
	200	// count the number of tokens (chains of only digits and commas) that contain multiple comma characters
	201	int tokensWithMultipleCommas = 0;
	202	for (int i = 0; i < charsRead; i++) {
	203	int nCommas = 0;
	204	while (i < charsRead && (buffer[i] == ',' \|\| Char.IsDigit(buffer[i]))) {
	205	if (buffer[i] == ',') nCommas++;
	206	i++;
	207	}
	208	if (nCommas > 2) tokensWithMultipleCommas++;
	209	}
	210	if (tokensWithMultipleCommas > 1) {
	211	// English format (only integer values) with ',' as separator
	212	numberFormat = NumberFormatInfo.InvariantInfo;
	213	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	214	separator = ',';
	215	} else {
	216	char[] disallowedSeparators = new char[] { ',' };
	217	// German format (real values)
	218	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
	219	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
	220	separator = POSSIBLE_SEPARATORS
	221	.Except(disallowedSeparators)
	222	.Where(c => OccurrencesOf(charCounts, c) > 10)
	223	.OrderBy(c => -OccurrencesOf(charCounts, c))
	224	.DefaultIfEmpty(' ')
	225	.First();
	226	}
	227	} else {
	228	// no points and no commas => English format
	229	numberFormat = NumberFormatInfo.InvariantInfo;
	230	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	231	separator = POSSIBLE_SEPARATORS
	232	.Where(c => OccurrencesOf(charCounts, c) > 10)
	233	.OrderBy(c => -OccurrencesOf(charCounts, c))
	234	.DefaultIfEmpty(' ')
	235	.First();
	236	}
	237	}
	238	}
	239
	240	private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	241	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	242	}
	243
	244	#region tokenizer
	245	internal enum TokenTypeEnum {
	246	NewLine, Separator, String, Double, DateTime
	247	}
	248
	249	internal class Token {
	250	public TokenTypeEnum type;
	251	public string stringValue;
	252	public double doubleValue;
	253	public DateTime dateTimeValue;
	254
	255	public Token(TokenTypeEnum type, string value) {
	256	this.type = type;
	257	stringValue = value;
	258	dateTimeValue = DateTime.MinValue;
	259	doubleValue = 0.0;
	260	}
	261
	262	public override string ToString() {
	263	return stringValue;
	264	}
	265	}
	266
	267
	268	internal class Tokenizer {
	269	private StreamReader reader;
	270	private List<Token> tokens;
	271	private NumberFormatInfo numberFormatInfo;
	272	private DateTimeFormatInfo dateTimeFormatInfo;
	273	private char separator;
	274	private const string INTERNAL_SEPARATOR = "#";
	275
	276	private int currentLineNumber = 0;
	277	public int CurrentLineNumber {
	278	get { return currentLineNumber; }
	279	private set { currentLineNumber = value; }
	280	}
	281	private string currentLine;
	282	public string CurrentLine {
	283	get { return currentLine; }
	284	private set { currentLine = value; }
	285	}
	286
	287	private Token newlineToken;
	288	public Token NewlineToken {
	289	get { return newlineToken; }
	290	private set { newlineToken = value; }
	291	}
	292	private Token separatorToken;
	293	public Token SeparatorToken {
	294	get { return separatorToken; }
	295	private set { separatorToken = value; }
	296	}
	297
	298	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	299	this.reader = reader;
	300	this.numberFormatInfo = numberFormatInfo;
	301	this.dateTimeFormatInfo = dateTimeFormatInfo;
	302	this.separator = separator;
	303	separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
	304	newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
	305	tokens = new List<Token>();
	306	ReadNextTokens();
	307	}
	308
	309	private void ReadNextTokens() {
	310	if (!reader.EndOfStream) {
	311	CurrentLine = reader.ReadLine();
	312	var newTokens = from str in Split(CurrentLine)
	313	let trimmedStr = str.Trim()
	314	where !string.IsNullOrEmpty(trimmedStr)
	315	select MakeToken(trimmedStr);
	316
	317	tokens.AddRange(newTokens);
	318	tokens.Add(NewlineToken);
	319	CurrentLineNumber++;
	320	}
	321	}
	322
	323	private IEnumerable<string> Split(string line) {
	324	StringBuilder subStr = new StringBuilder();
	325	foreach (char c in line) {
	326	if (c == separator) {
	327	yield return subStr.ToString();
	328	subStr = new StringBuilder();
	329	// all separator characters are transformed to the internally used separator character
	330	yield return INTERNAL_SEPARATOR;
	331	} else {
	332	subStr.Append(c);
	333	}
	334	}
	335	yield return subStr.ToString();
	336	}
	337
	338	private Token MakeToken(string strToken) {
	339	Token token = new Token(TokenTypeEnum.String, strToken);
	340	if (strToken.Equals(INTERNAL_SEPARATOR)) {
	341	return SeparatorToken;
	342	} else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
	343	token.type = TokenTypeEnum.Double;
	344	return token;
	345	} else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
	346	token.type = TokenTypeEnum.DateTime;
	347	return token;
	348	}
	349
	350	// couldn't parse the token as an int or float number or datetime value so return a string token
	351	return token;
	352	}
	353
	354	public Token Peek() {
	355	return tokens[0];
	356	}
	357
	358	public Token Next() {
	359	Token next = tokens[0];
	360	tokens.RemoveAt(0);
	361	if (tokens.Count == 0) {
	362	ReadNextTokens();
	363	}
	364	return next;
	365	}
	366
	367	public bool HasNext() {
	368	return tokens.Count > 0 \|\| !reader.EndOfStream;
	369	}
	370	}
	371	#endregion
	372
	373	#region parsing
	374	private void Parse() {
	375	ParseVariableNames();
	376	if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
	377	ParseValues();
	378	if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
	379	}
	380
	381	private void ParseValues() {
	382	while (tokenizer.HasNext()) {
	383	if (tokenizer.Peek() == tokenizer.NewlineToken) {
	384	tokenizer.Next();
	385	} else {
	386	List<object> row = new List<object>();
	387	object value = NextValue(tokenizer);
	388	row.Add(value);
	389	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	390	Expect(tokenizer.SeparatorToken);
	391	row.Add(NextValue(tokenizer));
	392	}
	393	Expect(tokenizer.NewlineToken);
	394	// all rows have to have the same number of values
	395	// the first row defines how many samples are needed
	396	if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
	397	Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
	398	"\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
	399	tokenizer.CurrentLineNumber);
	400	}
	401	rowValues.Add(row);
	402	}
	403	}
	404	}
	405
	406	private object NextValue(Tokenizer tokenizer) {
	407	if (tokenizer.Peek() == tokenizer.SeparatorToken \|\| tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
	408	Token current = tokenizer.Next();
	409	if (current.type == TokenTypeEnum.Separator) {
	410	return double.NaN;
	411	} else if (current.type == TokenTypeEnum.String) {
	412	return current.stringValue;
	413	} else if (current.type == TokenTypeEnum.Double) {
	414	return current.doubleValue;
	415	} else if (current.type == TokenTypeEnum.DateTime) {
	416	return current.dateTimeValue;
	417	}
	418	// found an unexpected token => throw error
	419	Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
	420	// this line is never executed because Error() throws an exception
	421	throw new InvalidOperationException();
	422	}
	423
	424	private void ParseVariableNames() {
	425	//if first token is double no variables names are given
	426	if (tokenizer.Peek().type == TokenTypeEnum.Double) return;
	427
	428	// the first line must contain variable names
	429	List<Token> tokens = new List<Token>();
	430	Token valueToken;
	431	valueToken = tokenizer.Next();
	432	tokens.Add(valueToken);
	433	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	434	Expect(tokenizer.SeparatorToken);
	435	valueToken = tokenizer.Next();
	436	if (valueToken != tokenizer.NewlineToken) {
	437	tokens.Add(valueToken);
	438	}
	439	}
	440	if (valueToken != tokenizer.NewlineToken) {
	441	Expect(tokenizer.NewlineToken);
	442	}
	443	variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
	444	}
	445
	446	private void Expect(Token expectedToken) {
	447	Token actualToken = tokenizer.Next();
	448	if (actualToken != expectedToken) {
	449	Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
	450	}
	451	}
	452
	453	private void Error(string message, string token, int lineNumber) {
	454	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
	455	}
	456	#endregion
	457
	458	[Serializable]
	459	private class DataFormatException : Exception {
	460	private int line;
	461	public int Line {
	462	get { return line; }
	463	}
	464	private string token;
	465	public string Token {
	466	get { return token; }
	467	}
	468	public DataFormatException(string message, string token, int line)
	469	: base(message + "\nToken: " + token + " (line: " + line + ")") {
	470	this.token = token;
	471	this.line = line;
	472	}
	473
	474	public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
	475	}
	476	}
	477	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences