Context Navigation

source: stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 14113

Visit:

Last change on this file since 14113 was 13974, checked in by gkronber, 8 years ago
#2071: merged r13411,r13413,r13414,r13415,r13419,r13440,r13441,r13442,r13445,r13447,r13525,r13526,r13529,r13584,r13901,r13925 from trunk to stable
File size: 26.4 KB

Rev	Line
[7849]	1	#region License Information
	2	/* HeuristicLab
[12009]	3	* Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[7849]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22
	23	using System;
	24	using System.Collections;
	25	using System.Collections.Generic;
[13974]	26	using System.Diagnostics.Contracts;
[7849]	27	using System.Globalization;
	28	using System.IO;
	29	using System.Linq;
	30	using System.Runtime.Serialization;
[13974]	31	using System.Text;
[7849]	32
	33	namespace HeuristicLab.Problems.Instances.DataAnalysis {
[13974]	34	public class TableFileParser : Progress<long> { // reports the number of bytes read
// number of chars sampled by DetermineFileFormat when guessing the file format
private const int BUFFER_SIZE = 65536;
// char used to symbolize whitespaces (no missing values can be handled with whitespaces)
private const char WHITESPACECHAR = '\0';
// separator candidates considered by DetermineFileFormat
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t', WHITESPACECHAR };
// tokenizer of the stream that is currently being parsed
private Tokenizer tokenizer;
// initial capacity for columns, will be set automatically when data is read from a file
private int estimatedNumberOfLines = 200;
[7849]	41
[13974]	42
	43	private Encoding encoding = Encoding.Default;
	44
	45	public Encoding Encoding {
	46	get { return encoding; }
	47	set {
	48	if (value == null) throw new ArgumentNullException("Encoding");
	49	encoding = value;
	50	}
	51	}
	52
	53
// number of values in the first column after the last Parse call
private int rows;
/// <summary>
/// Number of parsed rows (assigned at the end of Parse).
/// </summary>
public int Rows {
  get {
    return rows;
  }
  set {
    rows = value;
  }
}

// number of column lists after the last Parse call
private int columns;
/// <summary>
/// Number of parsed columns (assigned at the end of Parse).
/// </summary>
public int Columns {
  get {
    return columns;
  }
  set {
    columns = value;
  }
}

// one list per column; the element type of each list is chosen while parsing
private List<IList> values;
/// <summary>
/// The parsed data, stored column-wise.
/// </summary>
public List<IList> Values {
  get { return values; }
}
	72
	73	private List<string> variableNames;
	74	public IEnumerable<string> VariableNames {
	75	get {
	76	if (variableNames.Count > 0) return variableNames;
	77	else {
	78	string[] names = new string[columns];
	79	for (int i = 0; i < names.Length; i++) {
	80	names[i] = "X" + i.ToString("000");
	81	}
	82	return names;
	83	}
	84	}
	85	}
	86
	87	public TableFileParser() {
	88	variableNames = new List<string>();
	89	}
	90
/// <summary>
/// Determines the format of the file first and then checks whether the first token of the file is
/// something other than a number (which is taken as an indication of a header line).
/// </summary>
/// <param name="fileName">file which is checked</param>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);

  FileStream input = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite);
  try {
    return AreColumnNamesInFirstLine(input, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
  } finally {
    input.Dispose();
  }
}
	100
	101	public bool AreColumnNamesInFirstLine(Stream stream) {
	102	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
	103	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	104	char separator = ',';
	105	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	106	}
	107
	108	public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
	109	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	110	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
	111	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	112	}
	113	}
	114
	115	public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
	116	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
[13974]	117	using (StreamReader reader = new StreamReader(stream, Encoding)) {
[9651]	118	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[13974]	119	return (tokenizer.PeekType() != TokenTypeEnum.Double);
[9651]	120	}
	121	}
	122
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the file contains the variable names</param>
/// <param name="lineLimit">maximum number of rows to read; a negative value means no limit</param>
public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  EstimateNumberOfLines(fileName);
  // own the stream here (same as the overload with explicit formats) so that it is
  // released even if the stream-based Parse fails before its reader takes ownership
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
  }
}
	136
	137	/// <summary>
	138	/// Parses a file with the given formats
	139	/// </summary>
	140	/// <param name="fileName">file which is parsed</param>
	141	/// <param name="numberFormat">Format of numbers</param>
	142	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	143	/// <param name="separator">defines the separator</param>
[9651]	144	/// <param name="columnNamesInFirstLine"></param>
[13974]	145	public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
	146	EstimateNumberOfLines(fileName);
[9651]	147	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
[13974]	148	Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[9651]	149	}
[7849]	150	}
	151
// Counts the newline characters in the beginning of the file (at most 1M chars) and extrapolates
// the number of rows of the whole file from the average length of the complete lines in that sample.
// The estimate is stored in estimatedNumberOfLines; when the sample contains fewer than two
// newlines the previous value is kept.
private void EstimateNumberOfLines(string fileName) {
  long len = new System.IO.FileInfo(fileName).Length; // file size in bytes
  // no need for a buffer larger than the file itself
  var buf = new char[(int)Math.Min(len, 1024L * 1024L)];
  int charsRead;
  using (var reader = new StreamReader(fileName, Encoding)) {
    // ReadBlock may deliver fewer chars than requested, only the filled part is inspected below
    charsRead = reader.ReadBlock(buf, 0, buf.Length);
  }
  int numNewLine = 0;
  int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
  for (int i = 0; i < charsRead; i++) {
    charsInCurrentLine++;
    if (buf[i] == '\n') {
      if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
      charsInCurrentLine = 0;
      numNewLine++;
    }
  }
  if (numNewLine <= 1) {
    // fail -> keep the default setting
    return;
  }

  // average length of the complete lines between the first line and the trailing partial line
  double charsPerLineFactor = (charsRead - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
  double estimatedLines = len / charsPerLineFactor;
  estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
}
	178
/// <summary>
/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the stream contains the variable names</param>
/// <param name="lineLimit">maximum number of rows to read; a negative value means no limit</param>
public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine, lineLimit);
}
	190
	191	/// <summary>
	192	/// Parses a stream with the given formats.
	193	/// </summary>
	194	/// <param name="stream">Stream which is parsed</param>
	195	/// <param name="numberFormat">Format of numbers</param>
	196	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	197	/// <param name="separator">defines the separator</param>
[9651]	198	/// <param name="columnNamesInFirstLine"></param>
[13974]	199	public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
	200	using (StreamReader reader = new StreamReader(stream, Encoding)) {
[7849]	201	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[13974]	202	values = new List<IList>();
	203	if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
[7849]	204
[13974]	205	if (columnNamesInFirstLine) {
	206	ParseVariableNames();
	207	if (!tokenizer.HasNext())
	208	Error(
	209	"Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
	210	"", tokenizer.CurrentLineNumber);
	211	}
[7849]	212
[13974]	213
	214	// read values... start in first row
	215	int nLinesParsed = 0;
	216	int colIdx = 0;
	217	int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)
	218	while (tokenizer.HasNext() && (lineLimit < 0 \|\| nLinesParsed < lineLimit)) {
	219	if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
	220	tokenizer.Skip();
	221
	222	// all rows have to have the same number of values
	223	// the first row defines how many samples are needed
	224	if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row
	225	else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
	226	Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
	227	"Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
	228	tokenizer.CurrentLineNumber);
	229	}
	230	OnReport(tokenizer.BytesRead);
	231
	232	nLinesParsed++;
	233	colIdx = 0;
	234	} else {
	235	// read one value
	236	TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
	237	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	238
	239	// initialize columns on the first row (fixing data types as presented in the first row...)
	240	if (nLinesParsed == 0) {
	241	values.Add(CreateList(type, estimatedNumberOfLines));
	242	} else if (colIdx == values.Count) {
	243	Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
	244	"Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
	245	tokenizer.CurrentLineNumber);
	246	}
	247	if (!IsColumnTypeCompatible(values[colIdx], type)) {
	248	values[colIdx] = ConvertToStringColumn(values[colIdx]);
	249	}
	250	// add the value to the column
	251	AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
	252	}
[7849]	253	}
	254
[13974]	255	if (!values.Any() \|\| values.First().Count == 0)
	256	Error("Couldn't parse data values. Probably because of incorrect number format " +
	257	"(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[7849]	258	}
	259
[13974]	260	this.rows = values.First().Count;
	261	this.columns = values.Count;
[7849]	262
[13974]	263	// after everything has been parsed make sure the lists are as compact as possible
	264	foreach (var l in values) {
	265	var dblList = l as List<double>;
	266	var byteList = l as List<byte>;
	267	var dateList = l as List<DateTime>;
	268	var stringList = l as List<string>;
	269	var objList = l as List<object>;
	270	if (dblList != null) dblList.TrimExcess();
	271	if (byteList != null) byteList.TrimExcess();
	272	if (dateList != null) dateList.TrimExcess();
	273	if (stringList != null) stringList.TrimExcess();
	274	if (objList != null) objList.TrimExcess();
	275	}
[7849]	276
[13974]	277	// for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
	278	GC.Collect(2, GCCollectionMode.Forced);
	279	}
	280
	281	#region type-dependent dispatch
	282	private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
	283	return (list is List<string>) \|\| // all tokens can be added to a string list
	284	(tokenType == TokenTypeEnum.Missing) \|\| // empty entries are allowed in all columns
	285	(tokenType == TokenTypeEnum.Double && list is List<double>) \|\|
	286	(tokenType == TokenTypeEnum.DateTime && list is List<DateTime>);
	287	}
	288
	289	// all columns are converted to string columns when we find an non-empty value that has incorrect type
	290	private IList ConvertToStringColumn(IList list) {
	291	var dblL = list as List<double>;
	292	if (dblL != null) {
	293	var l = new List<string>(dblL.Capacity);
	294	l.AddRange(dblL.Select(dbl => dbl.ToString()));
	295	return l;
[7849]	296	}
[13974]	297
	298	var dtL = list as List<DateTime>;
	299	if (dtL != null) {
	300	var l = new List<string>(dtL.Capacity);
	301	l.AddRange(dtL.Select(dbl => dbl.ToString()));
	302	return l;
	303	}
	304
	305	if (list is List<string>) return list;
	306
	307	throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
[7849]	308	}
	309
// Appends the value of a token to a column; the representation of the token that is stored
// (double, string or DateTime) is selected by the element type of the column list.
private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
  if (list is List<double>) {
    AddValue(type, (List<double>)list, dblVal);
  } else if (list is List<string>) {
    AddValue(type, (List<string>)list, strVal);
  } else if (list is List<DateTime>) {
    AddValue(type, (List<DateTime>)list, dateTimeVal);
  } else {
    list.Add(strVal); // assumes List<object>
  }
}
	330
	331	private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
	332	Contract.Assert(type == TokenTypeEnum.Missing \|\| type == TokenTypeEnum.Double);
	333	list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal);
	334	}
	335
	336	private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
	337	// assumes that strVal is always set to the original token read from the input file
	338	list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal);
	339	}
	340
	341	private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
	342	Contract.Assert(type == TokenTypeEnum.Missing \|\| type == TokenTypeEnum.DateTime);
	343	list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal);
	344	}
	345
	346	private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
	347	switch (type) {
	348	case TokenTypeEnum.String:
	349	return new List<string>(estimatedNumberOfLines);
	350	case TokenTypeEnum.Double:
	351	case TokenTypeEnum.Missing: // assume double columns
	352	return new List<double>(estimatedNumberOfLines);
	353	case TokenTypeEnum.DateTime:
	354	return new List<DateTime>(estimatedNumberOfLines);
	355	default:
	356	throw new InvalidOperationException();
	357	}
	358	}
	359	#endregion
	360
/// <summary>
/// Opens the file and guesses number format, datetime format and separator from its content.
/// </summary>
/// <param name="path">file which is inspected</param>
/// <param name="numberFormat">the detected format of numbers</param>
/// <param name="dateTimeFormatInfo">the detected format of datetime</param>
/// <param name="separator">the detected separator</param>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // own the stream here (like the other file based entry points) so that it is
  // released even if the stream-based overload fails before its reader takes ownership
  using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    DetermineFileFormat(stream, out numberFormat, out dateTimeFormatInfo, out separator);
  }
}
	364
	365	public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
	366	using (StreamReader reader = new StreamReader(stream)) {
	367	// skip first line
	368	reader.ReadLine();
	369	// read a block
	370	char[] buffer = new char[BUFFER_SIZE];
	371	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	372	// count frequency of special characters
	373	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	374	.GroupBy(c => c)
	375	.ToDictionary(g => g.Key, g => g.Count());
	376
	377	// depending on the characters occuring in the block
	378	// we distinghish a number of different cases based on the the following rules:
	379	// many points => it must be English number format, the other frequently occuring char is the separator
	380	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	381	// => check the line in more detail:
	382	// English: 0, 0, 0, 0
	383	// German: 0,0 0,0 0,0 ...
	384	// => if commas are followed by space => English format
	385	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	386	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	387	if (OccurrencesOf(charCounts, '.') > 10) {
	388	numberFormat = NumberFormatInfo.InvariantInfo;
	389	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	390	separator = POSSIBLE_SEPARATORS
	391	.Where(c => OccurrencesOf(charCounts, c) > 10)
	392	.OrderBy(c => -OccurrencesOf(charCounts, c))
	393	.DefaultIfEmpty(' ')
	394	.First();
	395	} else if (OccurrencesOf(charCounts, ',') > 10) {
	396	// no points and many commas
	397	// count the number of tokens (chains of only digits and commas) that contain multiple comma characters
	398	int tokensWithMultipleCommas = 0;
	399	for (int i = 0; i < charsRead; i++) {
	400	int nCommas = 0;
	401	while (i < charsRead && (buffer[i] == ',' \|\| Char.IsDigit(buffer[i]))) {
	402	if (buffer[i] == ',') nCommas++;
	403	i++;
	404	}
	405	if (nCommas > 2) tokensWithMultipleCommas++;
	406	}
	407	if (tokensWithMultipleCommas > 1) {
	408	// English format (only integer values) with ',' as separator
	409	numberFormat = NumberFormatInfo.InvariantInfo;
	410	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	411	separator = ',';
	412	} else {
[13974]	413	char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail
[7849]	414	// German format (real values)
	415	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
	416	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
	417	separator = POSSIBLE_SEPARATORS
	418	.Except(disallowedSeparators)
	419	.Where(c => OccurrencesOf(charCounts, c) > 10)
	420	.OrderBy(c => -OccurrencesOf(charCounts, c))
	421	.DefaultIfEmpty(' ')
	422	.First();
	423	}
	424	} else {
	425	// no points and no commas => English format
	426	numberFormat = NumberFormatInfo.InvariantInfo;
	427	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	428	separator = POSSIBLE_SEPARATORS
	429	.Where(c => OccurrencesOf(charCounts, c) > 10)
	430	.OrderBy(c => -OccurrencesOf(charCounts, c))
	431	.DefaultIfEmpty(' ')
	432	.First();
	433	}
	434	}
	435	}
	436
	437	private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	438	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	439	}
	440
	441	#region tokenizer
[13974]	442	// the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
// kinds of tokens produced by the tokenizer (NewLine terminates the tokens of each line)
internal enum TokenTypeEnum {
  NewLine,
  String,
  Double,
  DateTime,
  Missing
}
	446
	447	internal class Tokenizer {
	448	private StreamReader reader;
[13974]	449	// we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
	450	private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
	451	private string[] stringVals = new string[1024];
	452	private double[] doubleVals = new double[1024];
	453	private DateTime[] dateTimeVals = new DateTime[1024];
	454	private int tokenPos;
	455	private int numTokens;
[7849]	456	private NumberFormatInfo numberFormatInfo;
	457	private DateTimeFormatInfo dateTimeFormatInfo;
	458	private char separator;
	459
[13974]	460	// arrays for string.Split()
	461	private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default
	462	private readonly char[] separators;
	463
[7849]	464	private int currentLineNumber = 0;
	465	public int CurrentLineNumber {
	466	get { return currentLineNumber; }
	467	private set { currentLineNumber = value; }
	468	}
	469	private string currentLine;
	470	public string CurrentLine {
	471	get { return currentLine; }
	472	private set { currentLine = value; }
	473	}
[13974]	474	public long BytesRead {
	475	get;
	476	private set;
[7849]	477	}
	478
	479	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	480	this.reader = reader;
	481	this.numberFormatInfo = numberFormatInfo;
	482	this.dateTimeFormatInfo = dateTimeFormatInfo;
	483	this.separator = separator;
[13974]	484	this.separators = new char[] { separator };
[7849]	485	ReadNextTokens();
	486	}
	487
[13974]	488	public bool HasNext() {
	489	return numTokens > tokenPos \|\| !reader.EndOfStream;
[7849]	490	}
	491
[13974]	492	public TokenTypeEnum PeekType() {
	493	return tokenTypes[tokenPos];
[7849]	494	}
	495
[13974]	496	public void Skip() {
	497	// simply skips one token without returning the result values
	498	tokenPos++;
	499	if (numTokens == tokenPos) {
	500	ReadNextTokens();
[7849]	501	}
	502	}
	503
[13974]	504	public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
	505	type = tokenTypes[tokenPos];
	506	strVal = stringVals[tokenPos];
	507	dblVal = doubleVals[tokenPos];
	508	dateTimeVal = dateTimeVals[tokenPos];
	509	Skip();
[7849]	510	}
	511
[13974]	512	private void ReadNextTokens() {
	513	if (!reader.EndOfStream) {
	514	CurrentLine = reader.ReadLine();
	515	CurrentLineNumber++;
	516	if (reader.BaseStream.CanSeek) {
	517	BytesRead = reader.BaseStream.Position;
	518	} else {
	519	BytesRead += CurrentLine.Length + 2; // guess
	520	}
	521	int i = 0;
	522	if (!string.IsNullOrWhiteSpace(CurrentLine)) {
	523	foreach (var tok in Split(CurrentLine)) {
	524	TokenTypeEnum type;
	525	double doubleVal;
	526	DateTime dateTimeValue;
	527	type = TokenTypeEnum.String; // default
	528	stringVals[i] = tok.Trim();
	529	if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
	530	type = TokenTypeEnum.Double;
	531	doubleVals[i] = doubleVal;
	532	} else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
	533	type = TokenTypeEnum.DateTime;
	534	dateTimeVals[i] = dateTimeValue;
	535	} else if (string.IsNullOrWhiteSpace(tok)) {
	536	type = TokenTypeEnum.Missing;
	537	}
[7849]	538
[13974]	539	// couldn't parse the token as an int or float number or datetime value so return a string token
[7849]	540
[13974]	541	tokenTypes[i] = type;
	542	i++;
[7849]	543
[13974]	544	if (i >= tokenTypes.Length) {
	545	// increase buffer size if necessary
	546	IncreaseCapacity(ref tokenTypes);
	547	IncreaseCapacity(ref doubleVals);
	548	IncreaseCapacity(ref stringVals);
	549	IncreaseCapacity(ref dateTimeVals);
	550	}
	551	}
[7849]	552	}
[13974]	553	tokenTypes[i] = TokenTypeEnum.NewLine;
	554	numTokens = i + 1;
	555	tokenPos = 0;
[7849]	556	}
	557	}
	558
[13974]	559	private IEnumerable<string> Split(string line) {
	560	return separator == WHITESPACECHAR ?
	561	line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
	562	line.Split(separators);
[7849]	563	}
[13974]	564
	565	private static void IncreaseCapacity<T>(ref T[] arr) {
	566	int n = (int)Math.Floor(arr.Length * 1.7); // guess
	567	T[] arr2 = new T[n];
	568	Array.Copy(arr, arr2, arr.Length);
	569	arr = arr2;
	570	}
[7849]	571	}
[13974]	572	#endregion
[7849]	573
[13974]	574	#region parsing
	575
// Reads the tokens of the current line as variable names and stores them in variableNames.
// Throws an ArgumentException when the first token is not a string token.
private void ParseVariableNames() {
  TokenTypeEnum type;
  string strVal;
  double dblVal;
  DateTime dateTimeVal;

  // the first line must contain variable names, so the first token must be a variable name
  tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
  if (type != TokenTypeEnum.String)
    throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);

  var names = new List<string> { strVal };

  // all remaining tokens of the line are accepted as names regardless of their type
  for (; ; ) {
    if (!tokenizer.HasNext() || tokenizer.PeekType() == TokenTypeEnum.NewLine) break;
    tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
    names.Add(strVal);
  }
  ExpectType(TokenTypeEnum.NewLine);

  variableNames = names;
}
	600
// Consumes the next token if it has the expected type; throws an ArgumentException otherwise.
private void ExpectType(TokenTypeEnum expectedToken) {
  TokenTypeEnum actual = tokenizer.PeekType();
  if (actual != expectedToken)
    throw new ArgumentException("Error: Expected " + expectedToken + " got " + actual);
  tokenizer.Skip();
}
	606
	607	private void Error(string message, string token, int lineNumber) {
	608	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
	609	}
	610	#endregion
	611
	612	[Serializable]
[9449]	613	public class DataFormatException : Exception {
[7849]	614	private int line;
	615	public int Line {
	616	get { return line; }
	617	}
	618	private string token;
	619	public string Token {
	620	get { return token; }
	621	}
	622	public DataFormatException(string message, string token, int line)
	623	: base(message + "\nToken: " + token + " (line: " + line + ")") {
	624	this.token = token;
	625	this.line = line;
	626	}
	627
	628	public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
	629	}
	630	}
	631	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences