Context Navigation

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 15377

Visit:

Last change on this file since 15377 was 14408, checked in by mkommend, 8 years ago
#2661: Fixed a bug in the TableFileParser that prevented the correct detection of DateTime columns.
File size: 26.9 KB

Rev	Line
[7849]	1	#region License Information
	2	/* HeuristicLab
[14185]	3	* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[7849]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22
	23	using System;
	24	using System.Collections;
	25	using System.Collections.Generic;
[13526]	26	using System.Diagnostics.Contracts;
[7849]	27	using System.Globalization;
	28	using System.IO;
	29	using System.Linq;
[13440]	30	using System.Text;
[7849]	31
	32	namespace HeuristicLab.Problems.Instances.DataAnalysis {
[13414]	33	public class TableFileParser : Progress<long> { // reports the number of bytes read
// number of chars sampled by DetermineFileFormat
private const int BUFFER_SIZE = 65536;
// placeholder separator meaning "split on whitespace"; empty tokens are dropped in that mode,
// so missing values cannot be represented in whitespace-separated files
private const char WHITESPACECHAR = (char)0;
// separator candidates that DetermineFileFormat chooses from
private static readonly char[] POSSIBLE_SEPARATORS = { ',', ';', '\t', WHITESPACECHAR };
// tokenizer of the most recent parse operation
private Tokenizer tokenizer;
// initial capacity for columns, will be set automatically when data is read from a file
private int estimatedNumberOfLines = 200;
[7849]	40
[13584]	41
	42	private Encoding encoding = Encoding.Default;
	43
	44	public Encoding Encoding {
	45	get { return encoding; }
	46	set {
	47	if (value == null) throw new ArgumentNullException("Encoding");
	48	encoding = value;
	49	}
	50	}
	51
	52
// number of data rows; assigned at the end of Parse
private int rows;
/// <summary>
/// Number of rows of the parsed table.
/// </summary>
public int Rows {
  get {
    return this.rows;
  }
  set {
    this.rows = value;
  }
}
	58
	59	private int columns;
	60	public int Columns {
	61	get { return columns; }
	62	set { columns = value; }
	63	}
	64
	65	private List<IList> values;
	66	public List<IList> Values {
	67	get {
	68	return values;
	69	}
	70	}
	71
	72	private List<string> variableNames;
	73	public IEnumerable<string> VariableNames {
	74	get {
	75	if (variableNames.Count > 0) return variableNames;
	76	else {
	77	string[] names = new string[columns];
	78	for (int i = 0; i < names.Length; i++) {
	79	names[i] = "X" + i.ToString("000");
	80	}
	81	return names;
	82	}
	83	}
	84	}
	85
	86	public TableFileParser() {
	87	variableNames = new List<string>();
	88	}
	89
/// <summary>
/// Determines the file format first and then checks whether the first token of the file is something other than a number.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <returns>true if the first token is not a number</returns>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);
  // the overload with explicit formats opens the file in exactly the same way
  return AreColumnNamesInFirstLine(fileName, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
}
	99
	100	public bool AreColumnNamesInFirstLine(Stream stream) {
	101	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
	102	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	103	char separator = ',';
	104	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	105	}
	106
	107	public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
	108	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	109	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
	110	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	111	}
	112	}
	113
	114	public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
	115	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
[13584]	116	using (StreamReader reader = new StreamReader(stream, Encoding)) {
[9608]	117	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[13440]	118	return (tokenizer.PeekType() != TokenTypeEnum.Double);
[9608]	119	}
	120	}
	121
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the file contains the variable names</param>
/// <param name="lineLimit">upper bound for the number of lines read after the first data row; a negative value reads the whole file</param>
public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  // delegate to the overload with explicit formats: it estimates the number of lines and
  // owns the file stream in a using block, so the file handle is always released
  Parse(fileName, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
}
	135
	136	/// <summary>
	137	/// Parses a file with the given formats
	138	/// </summary>
	139	/// <param name="fileName">file which is parsed</param>
	140	/// <param name="numberFormat">Format of numbers</param>
	141	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	142	/// <param name="separator">defines the separator</param>
[9608]	143	/// <param name="columnNamesInFirstLine"></param>
[13413]	144	public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
[13440]	145	EstimateNumberOfLines(fileName);
[9608]	146	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
[13413]	147	Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[9608]	148	}
[7849]	149	}
	150
// Guesses the number of rows of a file from the newline density in its first 1M characters
// and stores the guess (plus 10%) in estimatedNumberOfLines.
// When fewer than two newlines are found the current setting is kept.
private void EstimateNumberOfLines(string fileName) {
  var len = new System.IO.FileInfo(fileName).Length;
  var buf = new char[1024 * 1024];
  int charsRead;
  // open the file with the same sharing mode as all other reads in this class,
  // otherwise the estimate fails for files that another process has opened for writing
  using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
  using (var reader = new StreamReader(stream, Encoding)) {
    charsRead = reader.ReadBlock(buf, 0, buf.Length);
  }

  int numNewLine = 0;
  int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
  // only look at the chars that were actually read (the file may be smaller than the buffer)
  for (int i = 0; i < charsRead; i++) {
    charsInCurrentLine++;
    if (buf[i] == '\n') {
      if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
      charsInCurrentLine = 0;
      numNewLine++;
    }
  }

  if (numNewLine <= 1) {
    // fail -> keep the default setting
    return;
  }

  // average length of the complete lines, excluding the first one
  double charsPerLineFactor = (charsRead - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
  // len is measured in bytes, charsPerLineFactor in chars; good enough for an estimate
  double estimatedLines = len / charsPerLineFactor;
  estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
}
	177
/// <summary>
/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the stream contains the variable names</param>
/// <param name="lineLimit">upper bound for the number of lines read after the first data row; a negative value reads the whole stream</param>
public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine, lineLimit);
}
	189
	190	/// <summary>
	191	/// Parses a stream with the given formats.
	192	/// </summary>
	193	/// <param name="stream">Stream which is parsed</param>
	194	/// <param name="numberFormat">Format of numbers</param>
	195	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	196	/// <param name="separator">defines the separator</param>
[9608]	197	/// <param name="columnNamesInFirstLine"></param>
[13413]	198	public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
[14296]	199	if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
	200
	201	using (var reader = new StreamReader(stream)) {
[7849]	202	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[14296]	203	var strValues = new List<List<string>>();
[13440]	204	values = new List<IList>();
[14296]	205	Prepare(columnNamesInFirstLine, strValues);
[13447]	206
	207	int nLinesParsed = 0;
	208	int colIdx = 0;
	209	while (tokenizer.HasNext() && (lineLimit < 0 \|\| nLinesParsed < lineLimit)) {
	210	if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
	211	tokenizer.Skip();
	212
	213	// all rows have to have the same number of values
[14296]	214	// the first row defines how many elements are needed
	215	if (colIdx > 0 && values.Count != colIdx) {
	216	// read at least one value in the row (support for skipping empty lines)
	217	Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
[13447]	218	"Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
[14296]	219	tokenizer.CurrentLineNumber);
[13440]	220	}
[13447]	221	OnReport(tokenizer.BytesRead);
	222
	223	nLinesParsed++;
	224	colIdx = 0;
[13440]	225	} else {
[13447]	226	// read one value
[14296]	227	TokenTypeEnum type;
	228	string strVal;
	229	double dblVal;
	230	DateTime dateTimeVal;
[13447]	231	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	232
[14296]	233	if (colIdx == values.Count) {
	234	Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
[13447]	235	"Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
	236	tokenizer.CurrentLineNumber);
[13440]	237	}
[13526]	238	if (!IsColumnTypeCompatible(values[colIdx], type)) {
[14296]	239	values[colIdx] = strValues[colIdx];
[13526]	240	}
[14296]	241
[13447]	242	// add the value to the column
[14296]	243	AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
	244	if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string>
	245	strValues[colIdx].Add(strVal);
	246	}
	247	colIdx++;
[13440]	248	}
[7849]	249	}
	250	}
	251
[14296]	252	if (!values.Any() \|\| values.First().Count == 0)
	253	Error("Couldn't parse data values. Probably because of incorrect number format " +
	254	"(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
	255
[13441]	256	this.rows = values.First().Count;
[13447]	257	this.columns = values.Count;
[13441]	258
[13440]	259	// after everything has been parsed make sure the lists are as compact as possible
	260	foreach (var l in values) {
	261	var dblList = l as List<double>;
	262	var byteList = l as List<byte>;
	263	var dateList = l as List<DateTime>;
	264	var stringList = l as List<string>;
	265	var objList = l as List<object>;
	266	if (dblList != null) dblList.TrimExcess();
	267	if (byteList != null) byteList.TrimExcess();
	268	if (dateList != null) dateList.TrimExcess();
	269	if (stringList != null) stringList.TrimExcess();
	270	if (objList != null) objList.TrimExcess();
[7849]	271	}
[13442]	272
	273	// for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
	274	GC.Collect(2, GCCollectionMode.Forced);
[7849]	275	}
	276
// Consumes the optional header line and the first data row.
// The token types of the first data row decide which kind of list is allocated for each column.
private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) {
  if (columnNamesInFirstLine) {
    ParseVariableNames();
    if (!tokenizer.HasNext())
      Error(
        "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
        "", tokenizer.CurrentLineNumber);
  }

  // walk over the tokens of the first data row, one column per token
  for (int column = 0; tokenizer.PeekType() != TokenTypeEnum.NewLine; column++) {
    TokenTypeEnum tokenType;
    string text;
    double number;
    DateTime timestamp;
    tokenizer.Next(out tokenType, out text, out number, out timestamp);

    bool isStringToken = tokenType == TokenTypeEnum.String;

    // allocate the typed column and its list of original tokens
    values.Add(CreateList(tokenType, estimatedNumberOfLines));
    // optimization: don't store the string values in another list if the column is list<string>
    int backupCapacity = isStringToken ? 0 : estimatedNumberOfLines;
    strValues.Add(new List<string>(backupCapacity));

    AddValue(tokenType, values[column], text, number, timestamp);
    if (!isStringToken)
      strValues[column].Add(text);
  }
  tokenizer.Skip(); // skip newline
}
	307
[13447]	308	#region type-dependent dispatch
// Tells whether a token of the given type can be appended to the column without changing the column type.
private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
  // all tokens can be added to a string list
  if (list is List<string>) return true;
  // empty entries are allowed in all columns
  if (tokenType == TokenTypeEnum.Missing) return true;
  // typed tokens need a column of the matching type
  if (tokenType == TokenTypeEnum.Double) return list is List<double>;
  if (tokenType == TokenTypeEnum.DateTime) return list is List<DateTime>;
  return false;
}
	315
	316	// all columns are converted to string columns when we find an non-empty value that has incorrect type
	317	private IList ConvertToStringColumn(IList list) {
	318	var dblL = list as List<double>;
	319	if (dblL != null) {
	320	var l = new List<string>(dblL.Capacity);
	321	l.AddRange(dblL.Select(dbl => dbl.ToString()));
	322	return l;
[13447]	323	}
[13526]	324
	325	var dtL = list as List<DateTime>;
	326	if (dtL != null) {
	327	var l = new List<string>(dtL.Capacity);
	328	l.AddRange(dtL.Select(dbl => dbl.ToString()));
	329	return l;
	330	}
	331
	332	if (list is List<string>) return list;
	333
	334	throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
[13447]	335	}
	336
// Appends one token to a column; the element type of the column selects which of the parsed values is stored.
private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
  if (list is List<double>) {
    AddValue(type, (List<double>)list, dblVal);
  } else if (list is List<string>) {
    AddValue(type, (List<string>)list, strVal);
  } else if (list is List<DateTime>) {
    AddValue(type, (List<DateTime>)list, dateTimeVal);
  } else {
    list.Add(strVal); // assumes List<object>
  }
}
	357
// Appends to a double column; missing entries are stored as NaN.
private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
  Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double);
  if (type == TokenTypeEnum.Missing) {
    list.Add(double.NaN);
  } else {
    list.Add(dblVal);
  }
}
	362
// Appends to a string column; missing entries are stored as the empty string.
private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
  // assumes that strVal is always set to the original token read from the input file
  if (type == TokenTypeEnum.Missing) {
    list.Add(string.Empty);
  } else {
    list.Add(strVal);
  }
}
	367
	368	private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
	369	Contract.Assert(type == TokenTypeEnum.Missing \|\| type == TokenTypeEnum.DateTime);
	370	list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal);
	371	}
	372
// Allocates an empty column of the element type that matches the token type, using the given initial capacity.
private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
  if (type == TokenTypeEnum.String)
    return new List<string>(estimatedNumberOfLines);

  // a missing first value gives no type information -> assume double columns
  if (type == TokenTypeEnum.Double || type == TokenTypeEnum.Missing)
    return new List<double>(estimatedNumberOfLines);

  if (type == TokenTypeEnum.DateTime)
    return new List<DateTime>(estimatedNumberOfLines);

  throw new InvalidOperationException();
}
	386	#endregion
	387
/// <summary>
/// Opens the file (shared for reading and writing) and guesses number format, date format and separator from its content.
/// </summary>
/// <param name="path">file which is inspected</param>
/// <param name="numberFormat">detected format of numbers</param>
/// <param name="dateTimeFormatInfo">detected format of datetime</param>
/// <param name="separator">detected separator</param>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // own the stream here so that the file handle is released no matter how the detection ends
  using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    DetermineFileFormat(stream, out numberFormat, out dateTimeFormatInfo, out separator);
  }
}
	391
	392	public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
	393	using (StreamReader reader = new StreamReader(stream)) {
	394	// skip first line
	395	reader.ReadLine();
	396	// read a block
	397	char[] buffer = new char[BUFFER_SIZE];
	398	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	399	// count frequency of special characters
	400	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	401	.GroupBy(c => c)
	402	.ToDictionary(g => g.Key, g => g.Count());
	403
	404	// depending on the characters occuring in the block
	405	// we distinghish a number of different cases based on the the following rules:
	406	// many points => it must be English number format, the other frequently occuring char is the separator
	407	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	408	// => check the line in more detail:
	409	// English: 0, 0, 0, 0
	410	// German: 0,0 0,0 0,0 ...
	411	// => if commas are followed by space => English format
	412	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	413	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	414	if (OccurrencesOf(charCounts, '.') > 10) {
	415	numberFormat = NumberFormatInfo.InvariantInfo;
	416	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	417	separator = POSSIBLE_SEPARATORS
	418	.Where(c => OccurrencesOf(charCounts, c) > 10)
	419	.OrderBy(c => -OccurrencesOf(charCounts, c))
	420	.DefaultIfEmpty(' ')
	421	.First();
	422	} else if (OccurrencesOf(charCounts, ',') > 10) {
	423	// no points and many commas
	424	// count the number of tokens (chains of only digits and commas) that contain multiple comma characters
	425	int tokensWithMultipleCommas = 0;
	426	for (int i = 0; i < charsRead; i++) {
	427	int nCommas = 0;
	428	while (i < charsRead && (buffer[i] == ',' \|\| Char.IsDigit(buffer[i]))) {
	429	if (buffer[i] == ',') nCommas++;
	430	i++;
	431	}
	432	if (nCommas > 2) tokensWithMultipleCommas++;
	433	}
	434	if (tokensWithMultipleCommas > 1) {
	435	// English format (only integer values) with ',' as separator
	436	numberFormat = NumberFormatInfo.InvariantInfo;
	437	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	438	separator = ',';
	439	} else {
[13526]	440	char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail
[7849]	441	// German format (real values)
	442	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
	443	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
	444	separator = POSSIBLE_SEPARATORS
	445	.Except(disallowedSeparators)
	446	.Where(c => OccurrencesOf(charCounts, c) > 10)
	447	.OrderBy(c => -OccurrencesOf(charCounts, c))
[13584]	448	.DefaultIfEmpty(' ')
[7849]	449	.First();
	450	}
	451	} else {
	452	// no points and no commas => English format
	453	numberFormat = NumberFormatInfo.InvariantInfo;
	454	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	455	separator = POSSIBLE_SEPARATORS
	456	.Where(c => OccurrencesOf(charCounts, c) > 10)
	457	.OrderBy(c => -OccurrencesOf(charCounts, c))
	458	.DefaultIfEmpty(' ')
	459	.First();
	460	}
	461	}
	462	}
	463
	464	private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	465	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	466	}
	467
	468	#region tokenizer
[13447]	469	// the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
// Kinds of tokens produced by the Tokenizer. The order of the members is kept as is,
// NewLine has the value 0 and is therefore the default value of the type.
internal enum TokenTypeEnum {
  NewLine,   // end-of-line marker appended after the tokens of each line
  String,    // any token that is neither a number nor a date nor empty
  Double,    // token that was parsed as a floating point number
  DateTime,  // token that was parsed as a date
  Missing    // empty or whitespace-only token
}
	473
	474	internal class Tokenizer {
	475	private StreamReader reader;
[13411]	476	// we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
	477	private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
	478	private string[] stringVals = new string[1024];
	479	private double[] doubleVals = new double[1024];
	480	private DateTime[] dateTimeVals = new DateTime[1024];
	481	private int tokenPos;
	482	private int numTokens;
[7849]	483	private NumberFormatInfo numberFormatInfo;
	484	private DateTimeFormatInfo dateTimeFormatInfo;
	485	private char separator;
	486
[13447]	487	// arrays for string.Split()
	488	private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default
	489	private readonly char[] separators;
	490
[7849]	491	private int currentLineNumber = 0;
	492	public int CurrentLineNumber {
	493	get { return currentLineNumber; }
	494	private set { currentLineNumber = value; }
	495	}
	496	private string currentLine;
	497	public string CurrentLine {
	498	get { return currentLine; }
	499	private set { currentLine = value; }
	500	}
[13414]	501	public long BytesRead {
	502	get;
	503	private set;
	504	}
[7849]	505
	506	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	507	this.reader = reader;
	508	this.numberFormatInfo = numberFormatInfo;
	509	this.dateTimeFormatInfo = dateTimeFormatInfo;
	510	this.separator = separator;
[13447]	511	this.separators = new char[] { separator };
[7849]	512	ReadNextTokens();
	513	}
	514
[13447]	515	public bool HasNext() {
	516	return numTokens > tokenPos \|\| !reader.EndOfStream;
	517	}
	518
	519	public TokenTypeEnum PeekType() {
	520	return tokenTypes[tokenPos];
	521	}
	522
	523	public void Skip() {
	524	// simply skips one token without returning the result values
	525	tokenPos++;
	526	if (numTokens == tokenPos) {
	527	ReadNextTokens();
	528	}
	529	}
	530
	531	public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
	532	type = tokenTypes[tokenPos];
	533	strVal = stringVals[tokenPos];
	534	dblVal = doubleVals[tokenPos];
	535	dateTimeVal = dateTimeVals[tokenPos];
	536	Skip();
	537	}
	538
[7849]	539	private void ReadNextTokens() {
	540	if (!reader.EndOfStream) {
	541	CurrentLine = reader.ReadLine();
[13447]	542	CurrentLineNumber++;
[13925]	543	if (reader.BaseStream.CanSeek) {
[13414]	544	BytesRead = reader.BaseStream.Position;
[13925]	545	} else {
[13414]	546	BytesRead += CurrentLine.Length + 2; // guess
[13584]	547	}
[13411]	548	int i = 0;
[13447]	549	if (!string.IsNullOrWhiteSpace(CurrentLine)) {
	550	foreach (var tok in Split(CurrentLine)) {
	551	TokenTypeEnum type;
[13411]	552	double doubleVal;
	553	DateTime dateTimeValue;
[13447]	554	type = TokenTypeEnum.String; // default
	555	stringVals[i] = tok.Trim();
	556	if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
[13411]	557	type = TokenTypeEnum.Double;
	558	doubleVals[i] = doubleVal;
[14296]	559	} else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue)
[14408]	560	&& (dateTimeValue.Year > 1 \|\| dateTimeValue.Month > 1 \|\| dateTimeValue.Day > 1)// if no date is given it is returned as 1.1.0001 -> don't allow this
[14296]	561	) {
[13411]	562	type = TokenTypeEnum.DateTime;
	563	dateTimeVals[i] = dateTimeValue;
[13526]	564	} else if (string.IsNullOrWhiteSpace(tok)) {
	565	type = TokenTypeEnum.Missing;
[13411]	566	}
[7849]	567
[13447]	568	// couldn't parse the token as an int or float number or datetime value so return a string token
[13411]	569
	570	tokenTypes[i] = type;
	571	i++;
	572
	573	if (i >= tokenTypes.Length) {
	574	// increase buffer size if necessary
	575	IncreaseCapacity(ref tokenTypes);
	576	IncreaseCapacity(ref doubleVals);
	577	IncreaseCapacity(ref stringVals);
	578	IncreaseCapacity(ref dateTimeVals);
	579	}
	580	}
	581	}
	582	tokenTypes[i] = TokenTypeEnum.NewLine;
	583	numTokens = i + 1;
	584	tokenPos = 0;
[7849]	585	}
	586	}
	587
[13447]	588	private IEnumerable<string> Split(string line) {
	589	return separator == WHITESPACECHAR ?
	590	line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
	591	line.Split(separators);
	592	}
	593
[13411]	594	private static void IncreaseCapacity<T>(ref T[] arr) {
	595	int n = (int)Math.Floor(arr.Length * 1.7); // guess
	596	T[] arr2 = new T[n];
	597	Array.Copy(arr, arr2, arr.Length);
	598	arr = arr2;
	599	}
[7849]	600	}
	601	#endregion
	602
	603	#region parsing
	604
	605	private void ParseVariableNames() {
	606	// the first line must contain variable names
[13411]	607	List<string> varNames = new List<string>();
	608
	609	TokenTypeEnum type;
	610	string strVal;
	611	double dblVal;
	612	DateTime dateTimeVal;
	613
	614	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	615
	616	// the first token must be a variable name
	617	if (type != TokenTypeEnum.String)
	618	throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
	619	varNames.Add(strVal);
	620
[13447]	621	while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
[13411]	622	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	623	varNames.Add(strVal);
[7849]	624	}
[13411]	625	ExpectType(TokenTypeEnum.NewLine);
	626
	627	variableNames = varNames;
[7849]	628	}
	629
// Consumes the next token if it has the expected type, otherwise throws an ArgumentException.
private void ExpectType(TokenTypeEnum expectedToken) {
  TokenTypeEnum actualToken = tokenizer.PeekType();
  if (actualToken != expectedToken)
    throw new ArgumentException("Error: Expected " + expectedToken + " got " + actualToken);
  tokenizer.Skip();
}
	635
	636	private void Error(string message, string token, int lineNumber) {
[14285]	637	throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber));
[7849]	638	}
	639	#endregion
	640	}
	641	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences