Context Navigation

source: branches/HeuristicLab.Hive.Azure/HeuristicLab.Problems.DataAnalysis/3.4/TableFileParser.cs @ 7317

Visit:

Last change on this file since 7317 was 7270, checked in by spimming, 13 years ago

merged changes from trunk into branch

File size: 16.4 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
[7270]	3	* Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
[6740]	23	using System.Collections;
[2]	24	using System.Collections.Generic;
	25	using System.Globalization;
	26	using System.IO;
[2446]	27	using System.Linq;
[5484]	28	using System.Runtime.Serialization;
[2446]	29	using System.Text;
[2]	30
[3373]	31	namespace HeuristicLab.Problems.DataAnalysis {
[5013]	32	public class TableFileParser {
	33	private const int BUFFER_SIZE = 1024;
	34	private readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t' };
[2]	35	private Tokenizer tokenizer;
[6740]	36	private List<List<object>> rowValues;
[2]	37
	38	private int rows;
	39	public int Rows {
	40	get { return rows; }
	41	set { rows = value; }
	42	}
	43
	44	private int columns;
	45	public int Columns {
	46	get { return columns; }
	47	set { columns = value; }
	48	}
	49
[6740]	50	private List<IList> values;
	51	public List<IList> Values {
[2]	52	get {
[3264]	53	return values;
[2]	54	}
	55	}
	56
[5369]	57	private List<string> variableNames;
[3264]	58	public IEnumerable<string> VariableNames {
[2]	59	get {
[3264]	60	if (variableNames.Count > 0) return variableNames;
	61	else {
[273]	62	string[] names = new string[columns];
[1221]	63	for (int i = 0; i < names.Length; i++) {
[273]	64	names[i] = "X" + i.ToString("000");
	65	}
	66	return names;
[2]	67	}
	68	}
	69	}
	70
[5013]	71	public TableFileParser() {
[6740]	72	rowValues = new List<List<object>>();
[3264]	73	variableNames = new List<string>();
[2]	74	}
	75
[3264]	76	public void Parse(string fileName) {
[5013]	77	NumberFormatInfo numberFormat;
[6740]	78	DateTimeFormatInfo dateTimeFormatInfo;
[5013]	79	char separator;
[6740]	80	DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
[5013]	81	using (StreamReader reader = new StreamReader(fileName)) {
[6740]	82	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[5013]	83	// parse the file
	84	Parse();
	85	}
	86
[2]	87	// translate the list of samples into a DoubleMatrixData item
[3264]	88	rows = rowValues.Count;
	89	columns = rowValues[0].Count;
[6740]	90	values = new List<IList>();
[2]	91
[6740]	92	//create columns
	93	for (int col = 0; col < columns; col++) {
	94	var types = rowValues.Select(r => r[col]).Where(v => v != null && v as string != string.Empty).Take(10).Select(v => v.GetType());
	95	if (!types.Any()) {
	96	values.Add(new List<string>());
	97	continue;
[2]	98	}
[6740]	99
[6776]	100	var columnType = types.GroupBy(v => v).OrderBy(v => v.Count()).Last().Key;
[6740]	101	if (columnType == typeof(double)) values.Add(new List<double>());
	102	else if (columnType == typeof(DateTime)) values.Add(new List<DateTime>());
	103	else if (columnType == typeof(string)) values.Add(new List<string>());
	104	else throw new InvalidOperationException();
[2]	105	}
[6740]	106
	107
	108
	109	//fill with values
	110	foreach (List<object> row in rowValues) {
	111	int columnIndex = 0;
	112	foreach (object element in row) {
[6776]	113	if (values[columnIndex] is List<double> && !(element is double))
	114	values[columnIndex].Add(double.NaN);
	115	else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
	116	values[columnIndex].Add(DateTime.MinValue);
	117	else if (values[columnIndex] is List<string> && !(element is string))
	118	values[columnIndex].Add(string.Empty);
	119	else
	120	values[columnIndex].Add(element);
[6740]	121	columnIndex++;
	122	}
	123	}
[2]	124	}
	125
[6740]	126	private void DetermineFileFormat(string fileName, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
[5013]	127	using (StreamReader reader = new StreamReader(fileName)) {
	128	// skip first line
	129	reader.ReadLine();
	130	// read a block
	131	char[] buffer = new char[BUFFER_SIZE];
	132	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	133	// count frequency of special characters
	134	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	135	.GroupBy(c => c)
	136	.ToDictionary(g => g.Key, g => g.Count());
	137
	138	// depending on the characters occuring in the block
	139	// we distinghish a number of different cases based on the the following rules:
	140	// many points => it must be English number format, the other frequently occuring char is the separator
	141	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	142	// => check the line in more detail:
	143	// English: 0, 0, 0, 0
	144	// German: 0,0 0,0 0,0 ...
	145	// => if commas are followed by space => English format
	146	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	147	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	148	if (OccurrencesOf(charCounts, '.') > 10) {
	149	numberFormat = NumberFormatInfo.InvariantInfo;
[6740]	150	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]	151	separator = POSSIBLE_SEPARATORS
	152	.Where(c => OccurrencesOf(charCounts, c) > 10)
	153	.OrderBy(c => -OccurrencesOf(charCounts, c))
	154	.DefaultIfEmpty(' ')
	155	.First();
	156	} else if (OccurrencesOf(charCounts, ',') > 10) {
	157	// no points and many commas
[6963]	158	// count the number of tokens (chains of only digits and commas) that contain multiple comma characters
	159	int tokensWithMultipleCommas = 0;
	160	for (int i = 0; i < charsRead; i++) {
	161	int nCommas = 0;
	162	while (i < charsRead && (buffer[i] == ',' \|\| Char.IsDigit(buffer[i]))) {
	163	if (buffer[i] == ',') nCommas++;
	164	i++;
[5013]	165	}
[6963]	166	if (nCommas > 2) tokensWithMultipleCommas++;
[1221]	167	}
[6963]	168	if (tokensWithMultipleCommas > 1) {
[5013]	169	// English format (only integer values) with ',' as separator
	170	numberFormat = NumberFormatInfo.InvariantInfo;
[6740]	171	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]	172	separator = ',';
	173	} else {
	174	char[] disallowedSeparators = new char[] { ',' };
	175	// German format (real values)
[5096]	176	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
[6740]	177	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
[5013]	178	separator = POSSIBLE_SEPARATORS
	179	.Except(disallowedSeparators)
	180	.Where(c => OccurrencesOf(charCounts, c) > 10)
	181	.OrderBy(c => -OccurrencesOf(charCounts, c))
	182	.DefaultIfEmpty(' ')
	183	.First();
[405]	184	}
[5013]	185	} else {
	186	// no points and no commas => English format
	187	numberFormat = NumberFormatInfo.InvariantInfo;
[6740]	188	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
[5013]	189	separator = POSSIBLE_SEPARATORS
	190	.Where(c => OccurrencesOf(charCounts, c) > 10)
	191	.OrderBy(c => -OccurrencesOf(charCounts, c))
	192	.DefaultIfEmpty(' ')
	193	.First();
[405]	194	}
	195	}
	196	}
	197
[5013]	198	private int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	199	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	200	}
	201
[2]	202	#region tokenizer
	203	internal enum TokenTypeEnum {
[6740]	204	NewLine, Separator, String, Double, DateTime
[2]	205	}
	206
	207	internal class Token {
	208	public TokenTypeEnum type;
	209	public string stringValue;
	210	public double doubleValue;
[6740]	211	public DateTime dateTimeValue;
[2]	212
	213	public Token(TokenTypeEnum type, string value) {
	214	this.type = type;
	215	stringValue = value;
[6740]	216	dateTimeValue = DateTime.MinValue;
[2]	217	doubleValue = 0.0;
	218	}
	219
	220	public override string ToString() {
	221	return stringValue;
	222	}
	223	}
	224
	225
[3264]	226	internal class Tokenizer {
[2]	227	private StreamReader reader;
	228	private List<Token> tokens;
[405]	229	private NumberFormatInfo numberFormatInfo;
[6740]	230	private DateTimeFormatInfo dateTimeFormatInfo;
[5013]	231	private char separator;
	232	private const string INTERNAL_SEPARATOR = "#";
[2]	233
[3264]	234	private int currentLineNumber = 0;
	235	public int CurrentLineNumber {
	236	get { return currentLineNumber; }
	237	private set { currentLineNumber = value; }
	238	}
	239	private string currentLine;
	240	public string CurrentLine {
	241	get { return currentLine; }
	242	private set { currentLine = value; }
	243	}
[2]	244
[3264]	245	private Token newlineToken;
	246	public Token NewlineToken {
	247	get { return newlineToken; }
	248	private set { newlineToken = value; }
	249	}
	250	private Token separatorToken;
	251	public Token SeparatorToken {
	252	get { return separatorToken; }
	253	private set { separatorToken = value; }
	254	}
[2]	255
[6740]	256	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
[2]	257	this.reader = reader;
[405]	258	this.numberFormatInfo = numberFormatInfo;
[6740]	259	this.dateTimeFormatInfo = dateTimeFormatInfo;
[5013]	260	this.separator = separator;
	261	separatorToken = new Token(TokenTypeEnum.Separator, INTERNAL_SEPARATOR);
[3264]	262	newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
[2]	263	tokens = new List<Token>();
	264	ReadNextTokens();
	265	}
	266
	267	private void ReadNextTokens() {
[1221]	268	if (!reader.EndOfStream) {
[2]	269	CurrentLine = reader.ReadLine();
[2446]	270	var newTokens = from str in Split(CurrentLine)
	271	let trimmedStr = str.Trim()
	272	where !string.IsNullOrEmpty(trimmedStr)
[5013]	273	select MakeToken(trimmedStr);
[2]	274
[2446]	275	tokens.AddRange(newTokens);
[2]	276	tokens.Add(NewlineToken);
	277	CurrentLineNumber++;
	278	}
	279	}
	280
[2446]	281	private IEnumerable<string> Split(string line) {
	282	StringBuilder subStr = new StringBuilder();
	283	foreach (char c in line) {
[5013]	284	if (c == separator) {
[2446]	285	yield return subStr.ToString();
	286	subStr = new StringBuilder();
[5013]	287	// all separator characters are transformed to the internally used separator character
	288	yield return INTERNAL_SEPARATOR;
[2446]	289	} else {
	290	subStr.Append(c);
	291	}
	292	}
	293	yield return subStr.ToString();
	294	}
	295
[2]	296	private Token MakeToken(string strToken) {
[406]	297	Token token = new Token(TokenTypeEnum.String, strToken);
[5013]	298	if (strToken.Equals(INTERNAL_SEPARATOR)) {
[2446]	299	return SeparatorToken;
[1221]	300	} else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
[406]	301	token.type = TokenTypeEnum.Double;
	302	return token;
[6776]	303	} else if (DateTime.TryParse(strToken, dateTimeFormatInfo, DateTimeStyles.None, out token.dateTimeValue)) {
[6740]	304	token.type = TokenTypeEnum.DateTime;
	305	return token;
[2]	306	}
[2446]	307
[6740]	308	// couldn't parse the token as an int or float number or datetime value so return a string token
[406]	309	return token;
[2]	310	}
	311
	312	public Token Peek() {
	313	return tokens[0];
	314	}
	315
	316	public Token Next() {
	317	Token next = tokens[0];
	318	tokens.RemoveAt(0);
[1221]	319	if (tokens.Count == 0) {
[2]	320	ReadNextTokens();
	321	}
	322	return next;
	323	}
	324
	325	public bool HasNext() {
	326	return tokens.Count > 0 \|\| !reader.EndOfStream;
	327	}
	328	}
	329	#endregion
	330
	331	#region parsing
[3264]	332	private void Parse() {
	333	ParseVariableNames();
[1221]	334	if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[3264]	335	ParseValues();
	336	if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[2]	337	}
	338
[3264]	339	private void ParseValues() {
[1221]	340	while (tokenizer.HasNext()) {
[6742]	341	if (tokenizer.Peek() == tokenizer.NewlineToken) {
	342	tokenizer.Next();
	343	} else {
	344	List<object> row = new List<object>();
	345	object value = NextValue(tokenizer);
	346	row.Add(value);
	347	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	348	Expect(tokenizer.SeparatorToken);
	349	row.Add(NextValue(tokenizer));
	350	}
	351	Expect(tokenizer.NewlineToken);
	352	// all rows have to have the same number of values
	353	// the first row defines how many samples are needed
	354	if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
	355	Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
	356	"\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
	357	tokenizer.CurrentLineNumber);
	358	}
	359	rowValues.Add(row);
[2446]	360	}
	361	}
	362	}
	363
[6740]	364	private object NextValue(Tokenizer tokenizer) {
[6742]	365	if (tokenizer.Peek() == tokenizer.SeparatorToken \|\| tokenizer.Peek() == tokenizer.NewlineToken) return string.Empty;
[2446]	366	Token current = tokenizer.Next();
[6740]	367	if (current.type == TokenTypeEnum.Separator) {
[2446]	368	return double.NaN;
[6740]	369	} else if (current.type == TokenTypeEnum.String) {
	370	return current.stringValue;
[2446]	371	} else if (current.type == TokenTypeEnum.Double) {
	372	return current.doubleValue;
[6740]	373	} else if (current.type == TokenTypeEnum.DateTime) {
	374	return current.dateTimeValue;
[2]	375	}
[3264]	376	// found an unexpected token => throw error
	377	Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
	378	// this line is never executed because Error() throws an exception
	379	throw new InvalidOperationException();
[2]	380	}
	381
[3264]	382	private void ParseVariableNames() {
[6740]	383	//if first token is double no variables names are given
	384	if (tokenizer.Peek().type == TokenTypeEnum.Double) return;
[2446]	385
[6740]	386	// the first line must contain variable names
	387	List<Token> tokens = new List<Token>();
	388	Token valueToken;
	389	valueToken = tokenizer.Next();
	390	tokens.Add(valueToken);
	391	while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
	392	Expect(tokenizer.SeparatorToken);
[1221]	393	valueToken = tokenizer.Next();
[3264]	394	if (valueToken != tokenizer.NewlineToken) {
[6740]	395	tokens.Add(valueToken);
[2446]	396	}
[2]	397	}
[6740]	398	if (valueToken != tokenizer.NewlineToken) {
	399	Expect(tokenizer.NewlineToken);
	400	}
	401	variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
[2]	402	}
	403
	404	private void Expect(Token expectedToken) {
	405	Token actualToken = tokenizer.Next();
[1221]	406	if (actualToken != expectedToken) {
[273]	407	Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
[2]	408	}
	409	}
	410
[273]	411	private void Error(string message, string token, int lineNumber) {
	412	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
[2]	413	}
	414	#endregion
[5484]	415
	416	[Serializable]
	417	private class DataFormatException : Exception {
	418	private int line;
	419	public int Line {
	420	get { return line; }
	421	}
	422	private string token;
	423	public string Token {
	424	get { return token; }
	425	}
	426	public DataFormatException(string message, string token, int line)
	427	: base(message + "\nToken: " + token + " (line: " + line + ")") {
	428	this.token = token;
	429	this.line = line;
	430	}
	431
	432	public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
	433	}
[2]	434	}
	435	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences