Context Navigation

TableFileParser.cs @ 17511

Visit:

Last change on this file since 17511 was 17448, checked in by pfleck, 5 years ago
#3040 Replaced own Vector with MathNet.Numerics Vector.
File size: 29.2 KB

Rev	Line
[7849]	1	#region License Information
	2	/* HeuristicLab
[17180]	3	* Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[7849]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections;
	24	using System.Collections.Generic;
[13526]	25	using System.Diagnostics.Contracts;
[7849]	26	using System.Globalization;
	27	using System.IO;
	28	using System.Linq;
[13440]	29	using System.Text;
[17414]	30	using HeuristicLab.Problems.DataAnalysis;
[7849]	31
[17448]	32	using DoubleVector = MathNet.Numerics.LinearAlgebra.Vector<double>;
	33
[7849]	34	namespace HeuristicLab.Problems.Instances.DataAnalysis {
[13414]	35	public class TableFileParser : Progress<long> { // reports the number of bytes read
// number of chars sampled from a file when its format is guessed
private const int BUFFER_SIZE = 65536;
// pseudo separator that stands for "split at any whitespace" (missing values cannot be expressed in that mode)
private const char WHITESPACECHAR = '\0';
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t', WHITESPACECHAR };
private Tokenizer tokenizer;
// initial capacity of the column lists; replaced by an estimate when a file is parsed or by the line limit
private int estimatedNumberOfLines = 200;
[7849]	42
[13584]	43
	44	private Encoding encoding = Encoding.Default;
	45
	46	public Encoding Encoding {
	47	get { return encoding; }
	48	set {
	49	if (value == null) throw new ArgumentNullException("Encoding");
	50	encoding = value;
	51	}
	52	}
	53
	54
private int rows;
/// <summary>Number of data rows; a parse run sets it to the length of the first column.</summary>
public int Rows {
  get => rows;
  set => rows = value;
}

private int columns;
/// <summary>Number of columns; a parse run sets it to the number of column lists.</summary>
public int Columns {
  get => columns;
  set => columns = value;
}

private List<IList> values;
/// <summary>Parsed data stored column-wise (one list per column); null before the first parse run.</summary>
public List<IList> Values => values;
	73
	74	private List<string> variableNames;
	75	public IEnumerable<string> VariableNames {
	76	get {
	77	if (variableNames.Count > 0) return variableNames;
	78	else {
	79	string[] names = new string[columns];
	80	for (int i = 0; i < names.Length; i++) {
	81	names[i] = "X" + i.ToString("000");
	82	}
	83	return names;
	84	}
	85	}
	86	}
	87
	88	public TableFileParser() {
	89	variableNames = new List<string>();
	90	}
	91
/// <summary>
/// Guesses the format of the file and then checks whether its first line contains column names.
/// </summary>
public bool AreColumnNamesInFirstLine(string fileName) {
  // the format is determined first, the file is opened for the check afterwards
  return AreColumnNamesInFirstLine(fileName, DetermineFileFormat(fileName));
}

/// <summary>
/// Checks the first line of the stream using invariant number/date formats and ',' as column separator.
/// </summary>
public bool AreColumnNamesInFirstLine(Stream stream) {
  var invariantOptions = new TableFileFormatOptions();
  invariantOptions.NumberFormat = NumberFormatInfo.InvariantInfo;
  invariantOptions.DateTimeFormat = DateTimeFormatInfo.InvariantInfo;
  invariantOptions.ColumnSeparator = ',';
  return AreColumnNamesInFirstLine(stream, invariantOptions);
}

/// <summary>
/// Opens the file and checks whether its first line contains column names, using the given format options.
/// </summary>
public bool AreColumnNamesInFirstLine(string fileName, TableFileFormatOptions formatOptions) {
  using (var fileStream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    return AreColumnNamesInFirstLine(fileStream, formatOptions);
  }
}

/// <summary>
/// Returns true unless the very first token of the stream is a number.
/// The reader created here is disposed on return, which also closes the given stream.
/// </summary>
public bool AreColumnNamesInFirstLine(Stream stream, TableFileFormatOptions formatOptions) {
  using (var streamReader = new StreamReader(stream, Encoding)) {
    tokenizer = new Tokenizer(streamReader, formatOptions);
    TokenTypeEnum firstTokenType = tokenizer.PeekType();
    return firstTokenType != TokenTypeEnum.Double;
  }
}
	120
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the file holds the column names</param>
/// <param name="lineLimit">limit for the number of lines read by the parsing loop; a negative value disables the limit</param>
public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
  var formatOptions = DetermineFileFormat(fileName);
  // Delegate to the overload taking explicit format options: it estimates the number of lines and
  // owns the FileStream in a using block, so the file handle is released even if parsing throws.
  Parse(fileName, formatOptions, columnNamesInFirstLine, lineLimit);
}
	131
	132	/// <summary>
	133	/// Parses a file with the given formats
	134	/// </summary>
	135	/// <param name="fileName">file which is parsed</param>
	136	/// <param name="numberFormat">Format of numbers</param>
	137	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	138	/// <param name="separator">defines the separator</param>
[9608]	139	/// <param name="columnNamesInFirstLine"></param>
[17414]	140	public void Parse(string fileName, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) {
[13440]	141	EstimateNumberOfLines(fileName);
[9608]	142	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
[17414]	143	Parse(stream, formatOptions, columnNamesInFirstLine, lineLimit);
[9608]	144	}
[7849]	145	}
	146
// Samples the beginning of the file (at most 1M chars), derives the average number of chars per line
// from the newline characters found there and extrapolates the number of lines from the file size.
// The result is stored in estimatedNumberOfLines; if no estimate is possible the current value is kept.
private void EstimateNumberOfLines(string fileName) {
  long fileLength = new FileInfo(fileName).Length;
  var buf = new char[1024 * 1024];
  int charsRead;
  using (var reader = new StreamReader(fileName, Encoding)) {
    charsRead = reader.ReadBlock(buf, 0, buf.Length);
  }

  int numNewLine = 0;
  // the first line (names) and the last line (incomplete) are not representative
  int charsInCurrentLine = 0, charsInFirstLine = 0;
  // only look at the chars that were actually read, not at the unused remainder of the buffer
  for (int i = 0; i < charsRead; i++) {
    charsInCurrentLine++;
    if (buf[i] == '\n') {
      if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
      charsInCurrentLine = 0;
      numNewLine++;
    }
  }

  // fail -> keep the current setting
  if (numNewLine <= 1) return;

  // every counted line ends with '\n', so the divisor and the resulting factor are always positive
  double charsPerLineFactor = (charsRead - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
  // pessimistic allocation of 110% to make sure that the list is very likely large enough
  double estimatedLines = Math.Round(fileLength / charsPerLineFactor * 1.1);
  // an estimate that does not fit into an int cannot be used as list capacity -> keep the current setting
  if (estimatedLines < int.MaxValue) estimatedNumberOfLines = (int)estimatedLines;
}
	173
/// <summary>
/// Parses a stream with the default format: invariant number format, invariant date-time format and ',' as column separator.
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the stream holds the column names</param>
/// <param name="lineLimit">limit for the number of lines read by the parsing loop; a negative value disables the limit</param>
public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
  var invariantOptions = new TableFileFormatOptions();
  invariantOptions.NumberFormat = NumberFormatInfo.InvariantInfo;
  invariantOptions.DateTimeFormat = DateTimeFormatInfo.InvariantInfo;
  invariantOptions.ColumnSeparator = ',';
  Parse(stream, invariantOptions, columnNamesInFirstLine, lineLimit);
}
	187
	188	/// <summary>
	189	/// Parses a stream with the given formats.
	190	/// </summary>
	191	/// <param name="stream">Stream which is parsed</param>
	192	/// <param name="numberFormat">Format of numbers</param>
	193	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	194	/// <param name="separator">defines the separator</param>
[9608]	195	/// <param name="columnNamesInFirstLine"></param>
[17414]	196	public void Parse(Stream stream, TableFileFormatOptions formatOptions, bool columnNamesInFirstLine, int lineLimit = -1) {
[14296]	197	if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
	198
	199	using (var reader = new StreamReader(stream)) {
[17414]	200	tokenizer = new Tokenizer(reader, formatOptions);
[14296]	201	var strValues = new List<List<string>>();
[13440]	202	values = new List<IList>();
[14296]	203	Prepare(columnNamesInFirstLine, strValues);
[13447]	204
	205	int nLinesParsed = 0;
	206	int colIdx = 0;
	207	while (tokenizer.HasNext() && (lineLimit < 0 \|\| nLinesParsed < lineLimit)) {
	208	if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
	209	tokenizer.Skip();
	210
	211	// all rows have to have the same number of values
[14296]	212	// the first row defines how many elements are needed
	213	if (colIdx > 0 && values.Count != colIdx) {
	214	// read at least one value in the row (support for skipping empty lines)
	215	Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
[13447]	216	"Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
[14296]	217	tokenizer.CurrentLineNumber);
[13440]	218	}
[13447]	219	OnReport(tokenizer.BytesRead);
	220
	221	nLinesParsed++;
	222	colIdx = 0;
[13440]	223	} else {
[13447]	224	// read one value
[14296]	225	TokenTypeEnum type;
	226	string strVal;
	227	double dblVal;
	228	DateTime dateTimeVal;
[13447]	229	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	230
[14296]	231	if (colIdx == values.Count) {
	232	Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
[13447]	233	"Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
	234	tokenizer.CurrentLineNumber);
[13440]	235	}
[13526]	236	if (!IsColumnTypeCompatible(values[colIdx], type)) {
[14296]	237	values[colIdx] = strValues[colIdx];
[13526]	238	}
[14296]	239
[13447]	240	// add the value to the column
[14296]	241	AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
	242	if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string>
	243	strValues[colIdx].Add(strVal);
	244	}
	245	colIdx++;
[13440]	246	}
[7849]	247	}
	248	}
	249
[14296]	250	if (!values.Any() \|\| values.First().Count == 0)
	251	Error("Couldn't parse data values. Probably because of incorrect number format " +
	252	"(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
	253
[13441]	254	this.rows = values.First().Count;
[13447]	255	this.columns = values.Count;
[13441]	256
[17414]	257	// see if any string column can be converted to vectors
	258	if (formatOptions.VectorSeparator != null) {
	259	for (int i = 0; i < values.Count; i++) {
	260	if (!(values[i] is List<string> stringList)) continue;
	261
	262	var strings = new string[stringList.Count][];
	263	var doubles = new double[strings.Length][];
	264	bool allDoubles = true;
	265	for (int j = 0; j < strings.Length && allDoubles; j++) {
	266	strings[j] = stringList[j].Split(formatOptions.VectorSeparator.Value);
	267	doubles[j] = new double[strings[j].Length];
	268	for (int k = 0; k < doubles[j].Length && allDoubles; k++) {
	269	allDoubles = double.TryParse(strings[j][k], NumberStyles.Float, formatOptions.NumberFormat, out doubles[j][k]);
	270	}
	271	}
	272
	273	if (allDoubles) {
	274	var vectorList = new List<DoubleVector>(stringList.Count);
	275	for (int j = 0; j < doubles.Length; j++) {
[17448]	276	vectorList.Add(DoubleVector.Build.Dense(doubles[j]));
[17414]	277	}
	278
	279	values[i] = vectorList;
	280	}
	281	}
	282	}
	283
[15513]	284	// replace lists with undefined type (object) with double-lists
	285	for (int i = 0; i < values.Count; i++) {
	286	if (values[i] is List<object>) {
	287	values[i] = Enumerable.Repeat(double.NaN, rows).ToList();
	288	}
	289	}
	290
[13440]	291	// after everything has been parsed make sure the lists are as compact as possible
	292	foreach (var l in values) {
	293	var dblList = l as List<double>;
	294	var byteList = l as List<byte>;
	295	var dateList = l as List<DateTime>;
	296	var stringList = l as List<string>;
	297	var objList = l as List<object>;
[17414]	298	var vecList = l as List<DoubleVector>;
[13440]	299	if (dblList != null) dblList.TrimExcess();
	300	if (byteList != null) byteList.TrimExcess();
	301	if (dateList != null) dateList.TrimExcess();
	302	if (stringList != null) stringList.TrimExcess();
	303	if (objList != null) objList.TrimExcess();
[17414]	304	if (vecList != null) vecList.TrimExcess();
[7849]	305	}
[13442]	306
	307	// for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
	308	GC.Collect(2, GCCollectionMode.Forced);
[7849]	309	}
	310
// Consumes the header line (if requested) and the first data row. The token types found in the
// first data row decide which kind of list is allocated for each column.
private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) {
  if (columnNamesInFirstLine) {
    ParseVariableNames();
    if (!tokenizer.HasNext()) {
      Error(
        "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
        "", tokenizer.CurrentLineNumber);
    }
  }

  // one iteration per value of the first data row
  for (int colIdx = 0; tokenizer.PeekType() != TokenTypeEnum.NewLine; colIdx++) {
    tokenizer.Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal);
    bool isStringColumn = type == TokenTypeEnum.String;

    // initialize column
    values.Add(CreateList(type, estimatedNumberOfLines));
    // optimization: a string column already holds the original text, so its shadow list stays empty
    strValues.Add(new List<string>(isStringColumn ? 0 : estimatedNumberOfLines));

    AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
    if (!isStringColumn) {
      strValues[colIdx].Add(strVal);
    }
  }
  tokenizer.Skip(); // skip newline
}
	341
[13447]	342	#region type-dependent dispatch
// Tells whether a token of the given type can be stored in the given column list without converting the column.
private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
  // unknown lists are compatible to everything (potential conversion), all tokens can be added
  // to a string list and empty entries are allowed in all columns
  if (list is List<object> || list is List<string> || tokenType == TokenTypeEnum.Missing) return true;

  switch (tokenType) {
    case TokenTypeEnum.Double: return list is List<double>;
    case TokenTypeEnum.DateTime: return list is List<DateTime>;
    default: return false;
  }
}
	350
	351	// all columns are converted to string columns when we find an non-empty value that has incorrect type
	352	private IList ConvertToStringColumn(IList list) {
	353	var dblL = list as List<double>;
	354	if (dblL != null) {
	355	var l = new List<string>(dblL.Capacity);
	356	l.AddRange(dblL.Select(dbl => dbl.ToString()));
	357	return l;
[13447]	358	}
[13526]	359
	360	var dtL = list as List<DateTime>;
	361	if (dtL != null) {
	362	var l = new List<string>(dtL.Capacity);
	363	l.AddRange(dtL.Select(dbl => dbl.ToString()));
	364	return l;
	365	}
	366
	367	if (list is List<string>) return list;
	368
	369	throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
[13447]	370	}
	371
// Appends one parsed token to the given column list. For a list with a defined element type the
// matching typed overload is used. A list of undefined type (List<object>) collects nulls for
// missing values until the first real value arrives and is then replaced within 'values' by a typed list.
private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
  switch (list) {
    case List<double> doubleColumn:
      AddValue(type, doubleColumn, dblVal);
      return;
    case List<string> stringColumn:
      AddValue(type, stringColumn, strVal);
      return;
    case List<DateTime> dateTimeColumn:
      AddValue(type, dateTimeColumn, dateTimeVal);
      return;
  }

  // undefined list-type
  if (type != TokenTypeEnum.Missing) {
    // first non-missing value: create a typed list pre-filled with one missing value per entry so far
    IList typedList = ConvertList(type, list, estimatedNumberOfLines);
    values[values.IndexOf(list)] = typedList;
    // the typed list is handled by one of the cases above
    AddValue(type, typedList, strVal, dblVal, dateTimeVal);
  } else {
    // add null to track number of missing values
    list.Add(null);
  }
}
	403
// Appends a double value; a missing token is stored as NaN.
private static void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
  bool isMissing = type == TokenTypeEnum.Missing;
  Contract.Assert(isMissing || type == TokenTypeEnum.Double);
  if (isMissing) list.Add(double.NaN);
  else list.Add(dblVal);
}

// Appends a string value; a missing token is stored as the empty string.
// Assumes that strVal is always set to the original token read from the input file.
private static void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
  if (type == TokenTypeEnum.Missing) list.Add(string.Empty);
  else list.Add(strVal);
}

// Appends a date-time value; a missing token is stored as DateTime.MinValue.
private static void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
  bool isMissing = type == TokenTypeEnum.Missing;
  Contract.Assert(isMissing || type == TokenTypeEnum.DateTime);
  if (isMissing) list.Add(DateTime.MinValue);
  else list.Add(dtVal);
}
	418
// Allocates an empty column list for the given token type with the given initial capacity.
// A Missing token yields a List<object>, which represents a column of yet unknown type.
private static IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
  if (type == TokenTypeEnum.String) return new List<string>(estimatedNumberOfLines);
  if (type == TokenTypeEnum.Double) return new List<double>(estimatedNumberOfLines);
  if (type == TokenTypeEnum.DateTime) return new List<DateTime>(estimatedNumberOfLines);
  if (type == TokenTypeEnum.Missing) return new List<object>(estimatedNumberOfLines);
  // no column can be created for any other token type (e.g. NewLine)
  throw new InvalidOperationException();
}
[15513]	433
	434	private static IList ConvertList(TokenTypeEnum type, IList list, int estimatedNumberOfLines) {
	435	var newList = CreateList(type, estimatedNumberOfLines);
	436	object missingValue = GetMissingValue(type);
	437	for (int i = 0; i < list.Count; i++)
	438	newList.Add(missingValue);
	439	return newList;
	440	}
	441	private static object GetMissingValue(TokenTypeEnum type) {
	442	switch (type) {
	443	case TokenTypeEnum.String: return string.Empty;
	444	case TokenTypeEnum.Double: return double.NaN;
	445	case TokenTypeEnum.DateTime: return DateTime.MinValue;
	446	default: throw new ArgumentOutOfRangeException("type", type, "No missing value defined");
	447	}
	448	}
[13447]	449	#endregion
	450
/// <summary>
/// Opens the file and guesses its format from its content (see the overload taking a stream).
/// </summary>
public static TableFileFormatOptions DetermineFileFormat(string path) {
  Stream fileStream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  return DetermineFileFormat(fileStream);
}

/// <summary>
/// Guesses number format, date-time format and column separator from the frequencies of characters
/// in a block of up to BUFFER_SIZE chars that follows the first line. The reader created here is
/// disposed on return, which also closes the given stream.
/// </summary>
public static TableFileFormatOptions DetermineFileFormat(Stream stream) {
  using (var reader = new StreamReader(stream)) {
    // skip first line
    reader.ReadLine();
    // read a block
    var sample = new char[BUFFER_SIZE];
    int sampleLength = reader.ReadBlock(sample, 0, BUFFER_SIZE);

    // count frequency of all characters in the block
    var charCounts = new Dictionary<char, int>();
    for (int i = 0; i < sampleLength; i++) {
      int seen;
      charCounts.TryGetValue(sample[i], out seen);
      charCounts[sample[i]] = seen + 1;
    }

    // Depending on the characters occurring in the block we distinguish a number of cases:
    //  many points => it must be English number format, the other frequently occurring char is the separator
    //  no points but many commas => this is the problematic case. Either German format (real numbers)
    //     or English format (only integer numbers) with ',' as separator
    //     => check the block in more detail:
    //        English: 0, 0, 0, 0
    //        German:  0,0 0,0 0,0 ...
    //  no points no commas => English format (only integer numbers), use the other frequently occurring char as separator
    // In all cases ' ' is only used as separator if no other separator is possible (spaces can also occur additionally to separators).
    if (OccurrencesOf(charCounts, '.') > 10) {
      return CreateInvariantFormatOptions(MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS));
    }
    if (OccurrencesOf(charCounts, ',') <= 10) {
      // no points and no commas => English format
      return CreateInvariantFormatOptions(MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS));
    }

    // no points and many commas:
    // count the number of tokens (chains of only digits and commas) that contain more than two comma characters
    int tokensWithMultipleCommas = 0;
    int pos = 0;
    while (pos < sampleLength) {
      int nCommas = 0;
      while (pos < sampleLength && (sample[pos] == ',' || char.IsDigit(sample[pos]))) {
        if (sample[pos] == ',') nCommas++;
        pos++;
      }
      if (nCommas > 2) tokensWithMultipleCommas++;
      pos++; // step over the character that ended the chain
    }

    if (tokensWithMultipleCommas > 1) {
      // English format (only integer values) with ',' as separator
      return CreateInvariantFormatOptions(',');
    }

    // German format (real values)
    // n. def. contains a space so ' ' should be disallowed too, however existing unit tests would fail
    char[] disallowedSeparators = { ',' };
    var german = new CultureInfo("de-DE");
    var germanOptions = new TableFileFormatOptions();
    germanOptions.NumberFormat = NumberFormatInfo.GetInstance(german);
    germanOptions.DateTimeFormat = DateTimeFormatInfo.GetInstance(german);
    germanOptions.ColumnSeparator = MostFrequentSeparator(charCounts, POSSIBLE_SEPARATORS.Except(disallowedSeparators));
    return germanOptions;
  }
}

// Builds format options with invariant number and date-time formats and the given column separator.
private static TableFileFormatOptions CreateInvariantFormatOptions(char columnSeparator) {
  var options = new TableFileFormatOptions();
  options.NumberFormat = NumberFormatInfo.InvariantInfo;
  options.DateTimeFormat = DateTimeFormatInfo.InvariantInfo;
  options.ColumnSeparator = columnSeparator;
  return options;
}

// Picks the candidate that occurs most often among those occurring more than 10 times
// (ties keep the order of the candidates); falls back to ' ' if there is no such candidate.
private static char MostFrequentSeparator(Dictionary<char, int> charCounts, IEnumerable<char> candidates) {
  return candidates
    .Where(c => OccurrencesOf(charCounts, c) > 10)
    .OrderByDescending(c => OccurrencesOf(charCounts, c))
    .DefaultIfEmpty(' ')
    .First();
}
	534
	535	private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	536	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	537	}
	538
	539	#region tokenizer
// Kinds of tokens produced by the tokenizer: it reads full lines and returns the separated tokens
// of a line followed by a terminating NewLine token.
// NewLine has the value 0 and is therefore the content of a freshly allocated token-type buffer.
internal enum TokenTypeEnum {
  NewLine = 0,
  String = 1,
  Double = 2,
  DateTime = 3,
  Missing = 4
}
	544
	545	internal class Tokenizer {
	546	private StreamReader reader;
[13411]	547	// we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
	548	private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
	549	private string[] stringVals = new string[1024];
	550	private double[] doubleVals = new double[1024];
	551	private DateTime[] dateTimeVals = new DateTime[1024];
	552	private int tokenPos;
	553	private int numTokens;
[7849]	554	private NumberFormatInfo numberFormatInfo;
	555	private DateTimeFormatInfo dateTimeFormatInfo;
	556	private char separator;
	557
[13447]	558	// arrays for string.Split()
	559	private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default
	560	private readonly char[] separators;
	561
[7849]	562	private int currentLineNumber = 0;
	563	public int CurrentLineNumber {
	564	get { return currentLineNumber; }
	565	private set { currentLineNumber = value; }
	566	}
	567	private string currentLine;
	568	public string CurrentLine {
	569	get { return currentLine; }
	570	private set { currentLine = value; }
	571	}
[13414]	572	public long BytesRead {
	573	get;
	574	private set;
	575	}
[7849]	576
[17414]	577	public Tokenizer(StreamReader reader, TableFileFormatOptions formatOptions) {
[7849]	578	this.reader = reader;
[17414]	579	this.numberFormatInfo = formatOptions.NumberFormat;
	580	this.dateTimeFormatInfo = formatOptions.DateTimeFormat;
	581	this.separator = formatOptions.ColumnSeparator;
[13447]	582	this.separators = new char[] { separator };
[7849]	583	ReadNextTokens();
	584	}
	585
[13447]	586	public bool HasNext() {
	587	return numTokens > tokenPos \|\| !reader.EndOfStream;
	588	}
	589
	590	public TokenTypeEnum PeekType() {
	591	return tokenTypes[tokenPos];
	592	}
	593
	594	public void Skip() {
	595	// simply skips one token without returning the result values
	596	tokenPos++;
	597	if (numTokens == tokenPos) {
	598	ReadNextTokens();
	599	}
	600	}
	601
	602	public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
	603	type = tokenTypes[tokenPos];
	604	strVal = stringVals[tokenPos];
	605	dblVal = doubleVals[tokenPos];
	606	dateTimeVal = dateTimeVals[tokenPos];
	607	Skip();
	608	}
	609
[7849]	610	private void ReadNextTokens() {
	611	if (!reader.EndOfStream) {
	612	CurrentLine = reader.ReadLine();
[13447]	613	CurrentLineNumber++;
[13925]	614	if (reader.BaseStream.CanSeek) {
[13414]	615	BytesRead = reader.BaseStream.Position;
[13925]	616	} else {
[13414]	617	BytesRead += CurrentLine.Length + 2; // guess
[13584]	618	}
[13411]	619	int i = 0;
[13447]	620	if (!string.IsNullOrWhiteSpace(CurrentLine)) {
	621	foreach (var tok in Split(CurrentLine)) {
	622	TokenTypeEnum type;
[13411]	623	double doubleVal;
	624	DateTime dateTimeValue;
[13447]	625	type = TokenTypeEnum.String; // default
	626	stringVals[i] = tok.Trim();
	627	if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
[13411]	628	type = TokenTypeEnum.Double;
	629	doubleVals[i] = doubleVal;
[14296]	630	} else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue)
[14408]	631	&& (dateTimeValue.Year > 1 \|\| dateTimeValue.Month > 1 \|\| dateTimeValue.Day > 1)// if no date is given it is returned as 1.1.0001 -> don't allow this
[14296]	632	) {
[13411]	633	type = TokenTypeEnum.DateTime;
	634	dateTimeVals[i] = dateTimeValue;
[13526]	635	} else if (string.IsNullOrWhiteSpace(tok)) {
	636	type = TokenTypeEnum.Missing;
[13411]	637	}
[7849]	638
[13447]	639	// couldn't parse the token as an int or float number or datetime value so return a string token
[13411]	640
	641	tokenTypes[i] = type;
	642	i++;
	643
	644	if (i >= tokenTypes.Length) {
	645	// increase buffer size if necessary
	646	IncreaseCapacity(ref tokenTypes);
	647	IncreaseCapacity(ref doubleVals);
	648	IncreaseCapacity(ref stringVals);
	649	IncreaseCapacity(ref dateTimeVals);
	650	}
	651	}
	652	}
	653	tokenTypes[i] = TokenTypeEnum.NewLine;
	654	numTokens = i + 1;
	655	tokenPos = 0;
[7849]	656	}
	657	}
	658
[13447]	659	private IEnumerable<string> Split(string line) {
	660	return separator == WHITESPACECHAR ?
	661	line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
	662	line.Split(separators);
	663	}
	664
[13411]	665	private static void IncreaseCapacity<T>(ref T[] arr) {
	666	int n = (int)Math.Floor(arr.Length * 1.7); // guess
	667	T[] arr2 = new T[n];
	668	Array.Copy(arr, arr2, arr.Length);
	669	arr = arr2;
	670	}
[7849]	671	}
	672	#endregion
	673
	674	#region parsing
	675
	676	private void ParseVariableNames() {
	677	// the first line must contain variable names
[13411]	678	List<string> varNames = new List<string>();
	679
	680	TokenTypeEnum type;
	681	string strVal;
	682	double dblVal;
	683	DateTime dateTimeVal;
	684
	685	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	686
	687	// the first token must be a variable name
	688	if (type != TokenTypeEnum.String)
	689	throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
	690	varNames.Add(strVal);
	691
[13447]	692	while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
[13411]	693	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	694	varNames.Add(strVal);
[7849]	695	}
[13411]	696	ExpectType(TokenTypeEnum.NewLine);
	697
	698	variableNames = varNames;
[7849]	699	}
	700
// Consumes the next token if it has the expected type, otherwise throws an ArgumentException.
private void ExpectType(TokenTypeEnum expectedToken) {
  TokenTypeEnum actualToken = tokenizer.PeekType();
  if (actualToken != expectedToken) {
    throw new ArgumentException($"Error: Expected {expectedToken} got {actualToken}");
  }
  tokenizer.Skip();
}
	706
	707	private void Error(string message, string token, int lineNumber) {
[14285]	708	throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber));
[7849]	709	}
	710	#endregion
	711	}
	712	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/3040_VectorBasedGP/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 17511

Download in other formats: