Context Navigation

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 13445

Visit:

Last change on this file since 13445 was 13445, checked in by gkronber, 9 years ago
#2071: corrected disposal of StreamReader
File size: 25.2 KB

Rev	Line
[7849]	1	#region License Information
	2	/* HeuristicLab
[12012]	3	* Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[7849]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22
	23	using System;
	24	using System.Collections;
	25	using System.Collections.Generic;
	26	using System.Globalization;
	27	using System.IO;
	28	using System.Linq;
[13442]	29	using System.Runtime;
[7849]	30	using System.Runtime.Serialization;
[13440]	31	using System.Text;
[7849]	32
	33	namespace HeuristicLab.Problems.Instances.DataAnalysis {
[13414]	34	public class TableFileParser : Progress<long> { // reports the number of bytes read
[8564]	35	private const int BUFFER_SIZE = 65536;
[9652]	36	// char used to symbolize whitespaces (no missing values can be handled with whitespaces)
	37	private const char WHITESPACECHAR = (char)0;
	38	private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
[7849]	39	private Tokenizer tokenizer;
[13440]	40	private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
[7849]	41
	42	private int rows;
	43	public int Rows {
	44	get { return rows; }
	45	set { rows = value; }
	46	}
	47
	48	private int columns;
	49	public int Columns {
	50	get { return columns; }
	51	set { columns = value; }
	52	}
	53
	54	private List<IList> values;
	55	public List<IList> Values {
	56	get {
	57	return values;
	58	}
	59	}
	60
	61	private List<string> variableNames;
	62	public IEnumerable<string> VariableNames {
	63	get {
	64	if (variableNames.Count > 0) return variableNames;
	65	else {
	66	string[] names = new string[columns];
	67	for (int i = 0; i < names.Length; i++) {
	68	names[i] = "X" + i.ToString("000");
	69	}
	70	return names;
	71	}
	72	}
	73	}
	74
	75	public TableFileParser() {
	76	variableNames = new List<string>();
	77	}
	78
[9608]	79	public bool AreColumnNamesInFirstLine(string fileName) {
	80	NumberFormatInfo numberFormat;
	81	DateTimeFormatInfo dateTimeFormatInfo;
	82	char separator;
	83	DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
	84	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
	85	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	86	}
	87	}
	88
	89	public bool AreColumnNamesInFirstLine(Stream stream) {
	90	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
	91	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	92	char separator = ',';
	93	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	94	}
	95
	96	public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
	97	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	98	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
	99	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	100	}
	101	}
	102
	103	public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
	104	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	105	using (StreamReader reader = new StreamReader(stream)) {
	106	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[13440]	107	return (tokenizer.PeekType() != TokenTypeEnum.Double);
[9608]	108	}
	109	}
	110
[7851]	111	/// <summary>
	112	/// Parses a file and determines the format first
	113	/// </summary>
	114	/// <param name="fileName">file which is parsed</param>
[9608]	115	/// <param name="columnNamesInFirstLine"></param>
[13413]	116	public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
[7851]	117	NumberFormatInfo numberFormat;
	118	DateTimeFormatInfo dateTimeFormatInfo;
	119	char separator;
[9608]	120	DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
[13440]	121	EstimateNumberOfLines(fileName);
[13413]	122	Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[7851]	123	}
	124
	125	/// <summary>
	126	/// Parses a file with the given formats
	127	/// </summary>
	128	/// <param name="fileName">file which is parsed</param>
	129	/// <param name="numberFormat">Format of numbers</param>
	130	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	131	/// <param name="separator">defines the separator</param>
[9608]	132	/// <param name="columnNamesInFirstLine"></param>
[13413]	133	public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
[13440]	134	EstimateNumberOfLines(fileName);
[9608]	135	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
[13413]	136	Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[9608]	137	}
[7849]	138	}
	139
[13440]	140	// determines the number of newline characters in the first 64KB to guess the number of rows for a file
	141	private void EstimateNumberOfLines(string fileName) {
	142	var len = new System.IO.FileInfo(fileName).Length;
	143	var buf = new char[64 * 1024];
[13445]	144	using(var reader = new StreamReader(fileName)) {
	145	reader.ReadBlock(buf, 0, buf.Length);
	146	}
[13440]	147	int numNewLine = 0;
[13442]	148	int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
	149	foreach (var ch in buf) {
	150	charsInCurrentLine++;
	151	if (ch == '\n') {
	152	if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
	153	charsInCurrentLine = 0;
	154	numNewLine++;
	155	}
	156	}
	157	if (numNewLine <= 1) {
[13440]	158	// fail -> keep the default setting
	159	return;
	160	} else {
[13442]	161	double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
[13440]	162	double estimatedLines = len / charsPerLineFactor;
	163	estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
	164	}
	165	}
	166
[7851]	167	/// <summary>
	168	/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
	169	/// </summary>
	170	/// <param name="stream">stream which is parsed</param>
[9608]	171	/// <param name="columnNamesInFirstLine"></param>
[13413]	172	public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
[7851]	173	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
	174	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	175	char separator = ',';
[13413]	176	Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[7851]	177	}
	178
	179	/// <summary>
	180	/// Parses a stream with the given formats.
	181	/// </summary>
	182	/// <param name="stream">Stream which is parsed</param>
	183	/// <param name="numberFormat">Format of numbers</param>
	184	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	185	/// <param name="separator">defines the separator</param>
[9608]	186	/// <param name="columnNamesInFirstLine"></param>
[13413]	187	public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
[7849]	188	using (StreamReader reader = new StreamReader(stream)) {
	189	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[13440]	190	// parse the file line by line
	191	values = new List<IList>();
	192	if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
	193	foreach (var row in Parse(columnNamesInFirstLine, lineLimit)) {
	194	columns = row.Count;
	195	// on the first row we create our lists for column-oriented storage
	196	if (!values.Any()) {
	197	foreach (var obj in row) {
	198	// create a list type matching the object type and add first element
	199	if (obj == null) {
	200	var l = new List<object>(estimatedNumberOfLines);
	201	values.Add(l);
	202	l.Add(obj);
	203	} else if (obj is double) {
	204	var l = new List<double>(estimatedNumberOfLines);
	205	values.Add(l);
	206	l.Add((double)obj);
	207	} else if (obj is DateTime) {
	208	var l = new List<DateTime>(estimatedNumberOfLines);
	209	values.Add(l);
	210	l.Add((DateTime)obj);
	211	} else if (obj is string) {
	212	var l = new List<string>(estimatedNumberOfLines);
	213	values.Add(l);
	214	l.Add((string)obj);
	215	} else throw new InvalidOperationException();
	216	}
	217	// fill with initial value
	218	} else {
	219	// the columns are already there -> try to add values
	220	int columnIndex = 0;
	221	foreach (object element in row) {
	222	if (values[columnIndex] is List<double> && !(element is double))
	223	values[columnIndex].Add(double.NaN);
	224	else if (values[columnIndex] is List<DateTime> && !(element is DateTime))
	225	values[columnIndex].Add(DateTime.MinValue);
	226	else if (values[columnIndex] is List<string> && !(element is string))
	227	values[columnIndex].Add(element.ToString());
	228	else
	229	values[columnIndex].Add(element);
	230	columnIndex++;
	231	}
	232	}
[7849]	233	}
	234
[13440]	235	if (!values.Any() \|\| values.First().Count == 0)
	236	Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
[7849]	237	}
	238
[13441]	239	this.rows = values.First().Count;
	240
[13440]	241	// after everything has been parsed make sure the lists are as compact as possible
	242	foreach (var l in values) {
	243	var dblList = l as List<double>;
	244	var byteList = l as List<byte>;
	245	var dateList = l as List<DateTime>;
	246	var stringList = l as List<string>;
	247	var objList = l as List<object>;
	248	if (dblList != null) dblList.TrimExcess();
	249	if (byteList != null) byteList.TrimExcess();
	250	if (dateList != null) dateList.TrimExcess();
	251	if (stringList != null) stringList.TrimExcess();
	252	if (objList != null) objList.TrimExcess();
[7849]	253	}
[13442]	254
	255	// for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
	256	GC.Collect(2, GCCollectionMode.Forced);
[7849]	257	}
	258
	259	public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
[8885]	260	DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
[7849]	261	}
	262
	263	public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
	264	using (StreamReader reader = new StreamReader(stream)) {
	265	// skip first line
	266	reader.ReadLine();
	267	// read a block
	268	char[] buffer = new char[BUFFER_SIZE];
	269	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	270	// count frequency of special characters
	271	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	272	.GroupBy(c => c)
	273	.ToDictionary(g => g.Key, g => g.Count());
	274
	275	// depending on the characters occuring in the block
	276	// we distinghish a number of different cases based on the the following rules:
	277	// many points => it must be English number format, the other frequently occuring char is the separator
	278	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	279	// => check the line in more detail:
	280	// English: 0, 0, 0, 0
	281	// German: 0,0 0,0 0,0 ...
	282	// => if commas are followed by space => English format
	283	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	284	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	285	if (OccurrencesOf(charCounts, '.') > 10) {
	286	numberFormat = NumberFormatInfo.InvariantInfo;
	287	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	288	separator = POSSIBLE_SEPARATORS
	289	.Where(c => OccurrencesOf(charCounts, c) > 10)
	290	.OrderBy(c => -OccurrencesOf(charCounts, c))
	291	.DefaultIfEmpty(' ')
	292	.First();
	293	} else if (OccurrencesOf(charCounts, ',') > 10) {
	294	// no points and many commas
	295	// count the number of tokens (chains of only digits and commas) that contain multiple comma characters
	296	int tokensWithMultipleCommas = 0;
	297	for (int i = 0; i < charsRead; i++) {
	298	int nCommas = 0;
	299	while (i < charsRead && (buffer[i] == ',' \|\| Char.IsDigit(buffer[i]))) {
	300	if (buffer[i] == ',') nCommas++;
	301	i++;
	302	}
	303	if (nCommas > 2) tokensWithMultipleCommas++;
	304	}
	305	if (tokensWithMultipleCommas > 1) {
	306	// English format (only integer values) with ',' as separator
	307	numberFormat = NumberFormatInfo.InvariantInfo;
	308	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	309	separator = ',';
	310	} else {
	311	char[] disallowedSeparators = new char[] { ',' };
	312	// German format (real values)
	313	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
	314	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
	315	separator = POSSIBLE_SEPARATORS
	316	.Except(disallowedSeparators)
	317	.Where(c => OccurrencesOf(charCounts, c) > 10)
	318	.OrderBy(c => -OccurrencesOf(charCounts, c))
	319	.DefaultIfEmpty(' ')
	320	.First();
	321	}
	322	} else {
	323	// no points and no commas => English format
	324	numberFormat = NumberFormatInfo.InvariantInfo;
	325	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	326	separator = POSSIBLE_SEPARATORS
	327	.Where(c => OccurrencesOf(charCounts, c) > 10)
	328	.OrderBy(c => -OccurrencesOf(charCounts, c))
	329	.DefaultIfEmpty(' ')
	330	.First();
	331	}
	332	}
	333	}
	334
	335	private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	336	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	337	}
	338
	339	#region tokenizer
	340	internal enum TokenTypeEnum {
	341	NewLine, Separator, String, Double, DateTime
	342	}
	343
	344	internal class Tokenizer {
	345	private StreamReader reader;
[13411]	346	// we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
	347	private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
	348	private string[] stringVals = new string[1024];
	349	private double[] doubleVals = new double[1024];
	350	private DateTime[] dateTimeVals = new DateTime[1024];
	351	private int tokenPos;
	352	private int numTokens;
[7849]	353	private NumberFormatInfo numberFormatInfo;
	354	private DateTimeFormatInfo dateTimeFormatInfo;
	355	private char separator;
	356	private const string INTERNAL_SEPARATOR = "#";
	357
	358	private int currentLineNumber = 0;
	359	public int CurrentLineNumber {
	360	get { return currentLineNumber; }
	361	private set { currentLineNumber = value; }
	362	}
	363	private string currentLine;
	364	public string CurrentLine {
	365	get { return currentLine; }
	366	private set { currentLine = value; }
	367	}
[13414]	368	public long BytesRead {
	369	get;
	370	private set;
	371	}
[7849]	372
	373	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	374	this.reader = reader;
	375	this.numberFormatInfo = numberFormatInfo;
	376	this.dateTimeFormatInfo = dateTimeFormatInfo;
	377	this.separator = separator;
	378	ReadNextTokens();
	379	}
	380
	381	private void ReadNextTokens() {
	382	if (!reader.EndOfStream) {
	383	CurrentLine = reader.ReadLine();
[13414]	384	try {
	385	BytesRead = reader.BaseStream.Position;
[13441]	386	} catch (IOException) {
[13414]	387	BytesRead += CurrentLine.Length + 2; // guess
[13441]	388	} catch (NotSupportedException) {
[13414]	389	BytesRead += CurrentLine.Length + 2;
	390	}
[13411]	391	int i = 0;
	392	foreach (var tok in Split(CurrentLine)) {
	393	var trimmedStr = tok.Trim();
	394	if (!string.IsNullOrEmpty(trimmedStr)) {
	395	TokenTypeEnum type = TokenTypeEnum.String; // default
	396	stringVals[i] = trimmedStr;
	397	double doubleVal;
	398	DateTime dateTimeValue;
	399	if (trimmedStr.Equals(INTERNAL_SEPARATOR)) {
	400	type = TokenTypeEnum.Separator;
	401	} else if (double.TryParse(trimmedStr, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
	402	type = TokenTypeEnum.Double;
	403	doubleVals[i] = doubleVal;
	404	} else if (DateTime.TryParse(trimmedStr, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
	405	type = TokenTypeEnum.DateTime;
	406	dateTimeVals[i] = dateTimeValue;
	407	}
[7849]	408
[13411]	409	// couldn't parse the token as an int or float number or datetime value so return a string token
	410
	411	tokenTypes[i] = type;
	412	i++;
	413
	414	if (i >= tokenTypes.Length) {
	415	// increase buffer size if necessary
	416	IncreaseCapacity(ref tokenTypes);
	417	IncreaseCapacity(ref doubleVals);
	418	IncreaseCapacity(ref stringVals);
	419	IncreaseCapacity(ref dateTimeVals);
	420	}
	421	}
	422	}
	423	tokenTypes[i] = TokenTypeEnum.NewLine;
	424	numTokens = i + 1;
	425	tokenPos = 0;
[7849]	426	}
	427	}
	428
[13411]	429	private static void IncreaseCapacity<T>(ref T[] arr) {
	430	int n = (int)Math.Floor(arr.Length * 1.7); // guess
	431	T[] arr2 = new T[n];
	432	Array.Copy(arr, arr2, arr.Length);
	433	arr = arr2;
	434	}
	435
[7849]	436	private IEnumerable<string> Split(string line) {
[13411]	437	string[] splitString;
[9652]	438	if (separator == WHITESPACECHAR) {
	439	//separate whitespaces
	440	splitString = line.Split(new char[0], StringSplitOptions.RemoveEmptyEntries);
	441	} else {
	442	splitString = line.Split(separator);
	443	}
[7849]	444
[13411]	445	for (int i = 0; i < splitString.Length - 1; i++) {
	446	yield return splitString[i];
	447	yield return INTERNAL_SEPARATOR;
[7849]	448	}
[13411]	449	// do not return the INTERNAL_SEPARATOR after the last string
	450	yield return splitString[splitString.Length - 1];
[7849]	451	}
	452
[13411]	453	public TokenTypeEnum PeekType() {
	454	return tokenTypes[tokenPos];
[7849]	455	}
	456
[13411]	457	public void Skip() {
	458	// simply skips one token without returning the result values
	459	tokenPos++;
	460	if (numTokens == tokenPos) {
[7849]	461	ReadNextTokens();
	462	}
	463	}
	464
[13411]	465	public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
	466	type = tokenTypes[tokenPos];
	467	strVal = stringVals[tokenPos];
	468	dblVal = doubleVals[tokenPos];
	469	dateTimeVal = dateTimeVals[tokenPos];
	470	Skip();
	471	}
	472
[7849]	473	public bool HasNext() {
[13411]	474	return numTokens > tokenPos \|\| !reader.EndOfStream;
[7849]	475	}
	476	}
	477	#endregion
	478
	479	#region parsing
[13440]	480	private IEnumerable<List<object>> Parse(bool columnNamesInFirstLine, int lineLimit = -1) { // lineLimit = -1 means no limit
[9608]	481	if (columnNamesInFirstLine) {
	482	ParseVariableNames();
	483	if (!tokenizer.HasNext())
	484	Error(
	485	"Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
	486	"", tokenizer.CurrentLineNumber);
	487	}
[13440]	488	return ParseValues(lineLimit);
[7849]	489	}
	490
[13440]	491	private IEnumerable<List<object>> ParseValues(int lineLimit = -1) {
[13413]	492	int nLinesParsed = 0;
[13440]	493	int numValuesInFirstRow = -1;
[13413]	494	while (tokenizer.HasNext() && (lineLimit < 0 \|\| nLinesParsed < lineLimit)) {
[13411]	495	if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
	496	tokenizer.Skip();
[13413]	497	nLinesParsed++;
[7849]	498	} else {
	499	List<object> row = new List<object>();
	500	object value = NextValue(tokenizer);
	501	row.Add(value);
[13411]	502	while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
	503	ExpectType(TokenTypeEnum.Separator);
[7849]	504	row.Add(NextValue(tokenizer));
	505	}
[13411]	506	ExpectType(TokenTypeEnum.NewLine);
[13413]	507	nLinesParsed++;
[7849]	508	// all rows have to have the same number of values
	509	// the first row defines how many samples are needed
[13440]	510	if (numValuesInFirstRow < 0) numValuesInFirstRow = row.Count;
	511	else if (numValuesInFirstRow != row.Count) {
	512	Error("The first row of the dataset has " + numValuesInFirstRow + " columns." +
[7849]	513	"\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "",
	514	tokenizer.CurrentLineNumber);
	515	}
[13440]	516	yield return row;
[7849]	517	}
[13414]	518
	519	OnReport(tokenizer.BytesRead);
[7849]	520	}
	521	}
	522
	523	private object NextValue(Tokenizer tokenizer) {
[13411]	524	if (tokenizer.PeekType() == TokenTypeEnum.Separator \|\| tokenizer.PeekType() == TokenTypeEnum.NewLine) return string.Empty;
	525	TokenTypeEnum type;
	526	string strVal;
	527	double dblVal;
	528	DateTime dateTimeVal;
	529
	530	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	531	switch (type) {
	532	case TokenTypeEnum.Separator: return double.NaN;
	533	case TokenTypeEnum.String: return strVal;
	534	case TokenTypeEnum.Double: return dblVal;
	535	case TokenTypeEnum.DateTime: return dateTimeVal;
[7849]	536	}
	537	// found an unexpected token => throw error
[13411]	538	Error("Unexpected token.", strVal, tokenizer.CurrentLineNumber);
[7849]	539	// this line is never executed because Error() throws an exception
	540	throw new InvalidOperationException();
	541	}
	542
	543	private void ParseVariableNames() {
	544	// the first line must contain variable names
[13411]	545	List<string> varNames = new List<string>();
	546
	547	TokenTypeEnum type;
	548	string strVal;
	549	double dblVal;
	550	DateTime dateTimeVal;
	551
	552	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	553
	554	// the first token must be a variable name
	555	if (type != TokenTypeEnum.String)
	556	throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
	557	varNames.Add(strVal);
	558
	559	while (tokenizer.HasNext() && tokenizer.PeekType() == TokenTypeEnum.Separator) {
	560	ExpectType(TokenTypeEnum.Separator);
	561	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	562	varNames.Add(strVal);
[7849]	563	}
[13411]	564	ExpectType(TokenTypeEnum.NewLine);
	565
	566	variableNames = varNames;
[7849]	567	}
	568
[13411]	569	private void ExpectType(TokenTypeEnum expectedToken) {
	570	if (tokenizer.PeekType() != expectedToken)
	571	throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
	572	tokenizer.Skip();
[7849]	573	}
	574
	575	private void Error(string message, string token, int lineNumber) {
	576	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
	577	}
	578	#endregion
	579
	580	[Serializable]
[9449]	581	public class DataFormatException : Exception {
[7849]	582	private int line;
	583	public int Line {
	584	get { return line; }
	585	}
	586	private string token;
	587	public string Token {
	588	get { return token; }
	589	}
	590	public DataFormatException(string message, string token, int line)
	591	: base(message + "\nToken: " + token + " (line: " + line + ")") {
	592	this.token = token;
	593	this.line = line;
	594	}
	595
	596	public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
	597	}
	598	}
	599	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences