Context Navigation

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 14296

Visit:

Last change on this file since 14296 was 14296, checked in by gkronber, 8 years ago
#2661: implemented fixes for several problems in the TableFileParser. We now also store the original string representation of all tokens and use those when we detect that a column cannot be read as DateTime / double.
File size: 26.9 KB

Rev	Line
[7849]	1	#region License Information
	2	/* HeuristicLab
[14185]	3	* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[7849]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22
	23	using System;
	24	using System.Collections;
	25	using System.Collections.Generic;
[13526]	26	using System.Diagnostics.Contracts;
[7849]	27	using System.Globalization;
	28	using System.IO;
	29	using System.Linq;
	30	using System.Runtime.Serialization;
[13440]	31	using System.Text;
[7849]	32
	33	namespace HeuristicLab.Problems.Instances.DataAnalysis {
[13414]	34	public class TableFileParser : Progress<long> { // reports the number of bytes read
// number of characters sampled by DetermineFileFormat when guessing the file format
private const int BUFFER_SIZE = 65536;
// placeholder separator that makes the tokenizer split on whitespace;
// empty (missing) entries cannot be represented with this separator
private const char WHITESPACECHAR = (char)0;
// separator candidates considered by DetermineFileFormat
private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
// tokenizer for the stream that is currently processed
private Tokenizer tokenizer;
// initial capacity for the column lists; replaced by an estimate when data is read from a file
private int estimatedNumberOfLines = 200;
[7849]	41
[13584]	42
	43	private Encoding encoding = Encoding.Default;
	44
	45	public Encoding Encoding {
	46	get { return encoding; }
	47	set {
	48	if (value == null) throw new ArgumentNullException("Encoding");
	49	encoding = value;
	50	}
	51	}
	52
	53
private int rows;
/// <summary>Number of rows; set by Parse to the length of the first column.</summary>
public int Rows {
  get { return rows; }
  set { rows = value; }
}

private int columns;
/// <summary>Number of columns; set by Parse to the number of column lists.</summary>
public int Columns {
  get { return columns; }
  set { columns = value; }
}

private List<IList> values;
/// <summary>The parsed columns, one list per column.</summary>
public List<IList> Values {
  get { return values; }
}
	72
	73	private List<string> variableNames;
	74	public IEnumerable<string> VariableNames {
	75	get {
	76	if (variableNames.Count > 0) return variableNames;
	77	else {
	78	string[] names = new string[columns];
	79	for (int i = 0; i < names.Length; i++) {
	80	names[i] = "X" + i.ToString("000");
	81	}
	82	return names;
	83	}
	84	}
	85	}
	86
	87	public TableFileParser() {
	88	variableNames = new List<string>();
	89	}
	90
/// <summary>
/// Detects the format of the file and then checks whether its first line holds column names.
/// </summary>
/// <param name="fileName">file which is inspected</param>
/// <returns>true if the first token of the file is not a number</returns>
public bool AreColumnNamesInFirstLine(string fileName) {
  NumberFormatInfo detectedNumberFormat;
  DateTimeFormatInfo detectedDateTimeFormat;
  char detectedSeparator;
  DetermineFileFormat(fileName, out detectedNumberFormat, out detectedDateTimeFormat, out detectedSeparator);
  // the overload taking a file name opens the file in the same way and performs the check
  return AreColumnNamesInFirstLine(fileName, detectedNumberFormat, detectedDateTimeFormat, detectedSeparator);
}
	100
	101	public bool AreColumnNamesInFirstLine(Stream stream) {
	102	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
	103	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	104	char separator = ',';
	105	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	106	}
	107
	108	public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
	109	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	110	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
	111	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
	112	}
	113	}
	114
	115	public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
	116	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
[13584]	117	using (StreamReader reader = new StreamReader(stream, Encoding)) {
[9608]	118	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[13440]	119	return (tokenizer.PeekType() != TokenTypeEnum.Double);
[9608]	120	}
	121	}
	122
/// <summary>
/// Parses a file and determines the format first
/// </summary>
/// <param name="fileName">file which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the file contains the variable names</param>
/// <param name="lineLimit">upper bound for the number of lines read after the first data row; a negative value means no limit</param>
public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
  NumberFormatInfo numberFormat;
  DateTimeFormatInfo dateTimeFormatInfo;
  char separator;
  DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
  // the overload with explicit formats estimates the number of lines and opens the file
  // inside a using block, so the file handle is released even when parsing fails
  Parse(fileName, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
}
	136
	137	/// <summary>
	138	/// Parses a file with the given formats
	139	/// </summary>
	140	/// <param name="fileName">file which is parsed</param>
	141	/// <param name="numberFormat">Format of numbers</param>
	142	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	143	/// <param name="separator">defines the separator</param>
[9608]	144	/// <param name="columnNamesInFirstLine"></param>
[13413]	145	public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
[13440]	146	EstimateNumberOfLines(fileName);
[9608]	147	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
[13413]	148	Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
[9608]	149	}
[7849]	150	}
	151
// Guesses the number of rows of a file: reads a sample from the start of the file (up to 1M characters),
// determines the average number of characters per line in the sample and extrapolates to the file size.
// The estimate is stored in estimatedNumberOfLines; when the sample has fewer than two newline
// characters the current value is kept.
private void EstimateNumberOfLines(string fileName) {
  long fileLength = new FileInfo(fileName).Length;
  var buf = new char[1024 * 1024];
  int charsRead;
  using (var reader = new StreamReader(fileName, Encoding)) {
    // only the chars that were actually read are part of the sample
    charsRead = reader.ReadBlock(buf, 0, buf.Length);
  }

  int numNewLine = 0;
  int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
  for (int i = 0; i < charsRead; i++) {
    charsInCurrentLine++;
    if (buf[i] == '\n') {
      if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
      charsInCurrentLine = 0;
      numNewLine++;
    }
  }

  if (numNewLine <= 1) {
    // fail -> keep the default setting
    return;
  }

  // average over all complete lines except the first one
  double charsPerLineFactor = (charsRead - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
  double estimatedLines = fileLength / charsPerLineFactor;
  // pessimistic allocation of 110% to make sure that the list is very likely large enough;
  // clamp so that the conversion to int is always well defined
  estimatedNumberOfLines = (int)Math.Min(int.MaxValue, Math.Round(estimatedLines * 1.1));
}
	178
/// <summary>
/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
/// </summary>
/// <param name="stream">stream which is parsed</param>
/// <param name="columnNamesInFirstLine">true if the first line of the stream contains the variable names</param>
/// <param name="lineLimit">passed on unchanged to the overload with explicit formats</param>
public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
  Parse(stream, NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo, ',', columnNamesInFirstLine, lineLimit);
}
	190
	191	/// <summary>
	192	/// Parses a stream with the given formats.
	193	/// </summary>
	194	/// <param name="stream">Stream which is parsed</param>
	195	/// <param name="numberFormat">Format of numbers</param>
	196	/// <param name="dateTimeFormatInfo">Format of datetime</param>
	197	/// <param name="separator">defines the separator</param>
[9608]	198	/// <param name="columnNamesInFirstLine"></param>
[13413]	199	public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
[14296]	200	if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
	201
	202	using (var reader = new StreamReader(stream)) {
[7849]	203	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
[14296]	204	var strValues = new List<List<string>>();
[13440]	205	values = new List<IList>();
[14296]	206	Prepare(columnNamesInFirstLine, strValues);
[13447]	207
	208	int nLinesParsed = 0;
	209	int colIdx = 0;
	210	while (tokenizer.HasNext() && (lineLimit < 0 \|\| nLinesParsed < lineLimit)) {
	211	if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
	212	tokenizer.Skip();
	213
	214	// all rows have to have the same number of values
[14296]	215	// the first row defines how many elements are needed
	216	if (colIdx > 0 && values.Count != colIdx) {
	217	// read at least one value in the row (support for skipping empty lines)
	218	Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
[13447]	219	"Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
[14296]	220	tokenizer.CurrentLineNumber);
[13440]	221	}
[13447]	222	OnReport(tokenizer.BytesRead);
	223
	224	nLinesParsed++;
	225	colIdx = 0;
[13440]	226	} else {
[13447]	227	// read one value
[14296]	228	TokenTypeEnum type;
	229	string strVal;
	230	double dblVal;
	231	DateTime dateTimeVal;
[13447]	232	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	233
[14296]	234	if (colIdx == values.Count) {
	235	Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
[13447]	236	"Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
	237	tokenizer.CurrentLineNumber);
[13440]	238	}
[13526]	239	if (!IsColumnTypeCompatible(values[colIdx], type)) {
[14296]	240	values[colIdx] = strValues[colIdx];
[13526]	241	}
[14296]	242
[13447]	243	// add the value to the column
[14296]	244	AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
	245	if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string>
	246	strValues[colIdx].Add(strVal);
	247	}
	248	colIdx++;
[13440]	249	}
[7849]	250	}
	251	}
	252
[14296]	253	if (!values.Any() \|\| values.First().Count == 0)
	254	Error("Couldn't parse data values. Probably because of incorrect number format " +
	255	"(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
	256
[13441]	257	this.rows = values.First().Count;
[13447]	258	this.columns = values.Count;
[13441]	259
[13440]	260	// after everything has been parsed make sure the lists are as compact as possible
	261	foreach (var l in values) {
	262	var dblList = l as List<double>;
	263	var byteList = l as List<byte>;
	264	var dateList = l as List<DateTime>;
	265	var stringList = l as List<string>;
	266	var objList = l as List<object>;
	267	if (dblList != null) dblList.TrimExcess();
	268	if (byteList != null) byteList.TrimExcess();
	269	if (dateList != null) dateList.TrimExcess();
	270	if (stringList != null) stringList.TrimExcess();
	271	if (objList != null) objList.TrimExcess();
[7849]	272	}
[13442]	273
	274	// for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
	275	GC.Collect(2, GCCollectionMode.Forced);
[7849]	276	}
	277
// Reads the (optional) line with the variable names and the first data row.
// For each token of the first data row a typed column list is added to values and a list for the
// original string tokens is added to strValues.
private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) {
  if (columnNamesInFirstLine) {
    ParseVariableNames();
    if (!tokenizer.HasNext())
      Error(
        "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
        "", tokenizer.CurrentLineNumber);
  }

  // the first row determines the column types; one column per token until the end of the line
  for (int column = 0; tokenizer.PeekType() != TokenTypeEnum.NewLine; column++) {
    TokenTypeEnum tokenType; string text; double number; DateTime date;
    tokenizer.Next(out tokenType, out text, out number, out date);
    bool isStringColumn = tokenType == TokenTypeEnum.String;

    // initialize column
    values.Add(CreateList(tokenType, estimatedNumberOfLines));
    // optimization: don't store the string values in another list if the column is list<string>
    strValues.Add(new List<string>(isStringColumn ? 0 : estimatedNumberOfLines));

    AddValue(tokenType, values[column], text, number, date);
    if (!isStringColumn)
      strValues[column].Add(text);
  }
  tokenizer.Skip(); // skip newline
}
	308
[13447]	309	#region type-dependent dispatch
// Checks whether a token of the given type can be added to the column without changing the column type.
private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
  // all tokens can be added to a string list
  if (list is List<string>) return true;

  switch (tokenType) {
    case TokenTypeEnum.Missing:
      return true; // empty entries are allowed in all columns
    case TokenTypeEnum.Double:
      return list is List<double>;
    case TokenTypeEnum.DateTime:
      return list is List<DateTime>;
    default:
      return false;
  }
}
	316
	317	// all columns are converted to string columns when we find an non-empty value that has incorrect type
	318	private IList ConvertToStringColumn(IList list) {
	319	var dblL = list as List<double>;
	320	if (dblL != null) {
	321	var l = new List<string>(dblL.Capacity);
	322	l.AddRange(dblL.Select(dbl => dbl.ToString()));
	323	return l;
[13447]	324	}
[13526]	325
	326	var dtL = list as List<DateTime>;
	327	if (dtL != null) {
	328	var l = new List<string>(dtL.Capacity);
	329	l.AddRange(dtL.Select(dbl => dbl.ToString()));
	330	return l;
	331	}
	332
	333	if (list is List<string>) return list;
	334
	335	throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
[13447]	336	}
	337
// Appends one token to a column; the element type of the list decides which of the parsed values is stored.
private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
  List<double> doubleColumn;
  List<string> stringColumn;
  List<DateTime> dateColumn;

  if ((doubleColumn = list as List<double>) != null) {
    AddValue(type, doubleColumn, dblVal);
  } else if ((stringColumn = list as List<string>) != null) {
    AddValue(type, stringColumn, strVal);
  } else if ((dateColumn = list as List<DateTime>) != null) {
    AddValue(type, dateColumn, dateTimeVal);
  } else {
    list.Add(strVal); // assumes List<object>
  }
}
	358
// Appends to a double column; a missing value is stored as NaN.
private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
  Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.Double);
  if (type == TokenTypeEnum.Missing) {
    list.Add(double.NaN);
  } else {
    list.Add(dblVal);
  }
}

// Appends to a string column; a missing value is stored as the empty string.
private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
  // assumes that strVal is always set to the original token read from the input file
  if (type == TokenTypeEnum.Missing) {
    list.Add(string.Empty);
  } else {
    list.Add(strVal);
  }
}

// Appends to a DateTime column; a missing value is stored as DateTime.MinValue.
private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
  Contract.Assert(type == TokenTypeEnum.Missing || type == TokenTypeEnum.DateTime);
  if (type == TokenTypeEnum.Missing) {
    list.Add(DateTime.MinValue);
  } else {
    list.Add(dtVal);
  }
}
	373
// Creates an empty column list with the given capacity for the type of the first token of the column.
private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
  if (type == TokenTypeEnum.String)
    return new List<string>(estimatedNumberOfLines);

  // a missing first value => assume double columns
  if (type == TokenTypeEnum.Double || type == TokenTypeEnum.Missing)
    return new List<double>(estimatedNumberOfLines);

  if (type == TokenTypeEnum.DateTime)
    return new List<DateTime>(estimatedNumberOfLines);

  throw new InvalidOperationException();
}
	387	#endregion
	388
/// <summary>
/// Opens the file for shared reading and guesses number format, datetime format and separator from its content.
/// </summary>
/// <param name="path">file which is inspected</param>
/// <param name="numberFormat">receives the detected number format</param>
/// <param name="dateTimeFormatInfo">receives the detected datetime format</param>
/// <param name="separator">receives the detected separator</param>
public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
  // using block (as in the other methods that open files) so that the file handle is always released
  using (var stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
    DetermineFileFormat(stream, out numberFormat, out dateTimeFormatInfo, out separator);
  }
}
	392
	393	public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
	394	using (StreamReader reader = new StreamReader(stream)) {
	395	// skip first line
	396	reader.ReadLine();
	397	// read a block
	398	char[] buffer = new char[BUFFER_SIZE];
	399	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
	400	// count frequency of special characters
	401	Dictionary<char, int> charCounts = buffer.Take(charsRead)
	402	.GroupBy(c => c)
	403	.ToDictionary(g => g.Key, g => g.Count());
	404
	405	// depending on the characters occuring in the block
	406	// we distinghish a number of different cases based on the the following rules:
	407	// many points => it must be English number format, the other frequently occuring char is the separator
	408	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
	409	// => check the line in more detail:
	410	// English: 0, 0, 0, 0
	411	// German: 0,0 0,0 0,0 ...
	412	// => if commas are followed by space => English format
	413	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
	414	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
	415	if (OccurrencesOf(charCounts, '.') > 10) {
	416	numberFormat = NumberFormatInfo.InvariantInfo;
	417	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	418	separator = POSSIBLE_SEPARATORS
	419	.Where(c => OccurrencesOf(charCounts, c) > 10)
	420	.OrderBy(c => -OccurrencesOf(charCounts, c))
	421	.DefaultIfEmpty(' ')
	422	.First();
	423	} else if (OccurrencesOf(charCounts, ',') > 10) {
	424	// no points and many commas
	425	// count the number of tokens (chains of only digits and commas) that contain multiple comma characters
	426	int tokensWithMultipleCommas = 0;
	427	for (int i = 0; i < charsRead; i++) {
	428	int nCommas = 0;
	429	while (i < charsRead && (buffer[i] == ',' \|\| Char.IsDigit(buffer[i]))) {
	430	if (buffer[i] == ',') nCommas++;
	431	i++;
	432	}
	433	if (nCommas > 2) tokensWithMultipleCommas++;
	434	}
	435	if (tokensWithMultipleCommas > 1) {
	436	// English format (only integer values) with ',' as separator
	437	numberFormat = NumberFormatInfo.InvariantInfo;
	438	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	439	separator = ',';
	440	} else {
[13526]	441	char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail
[7849]	442	// German format (real values)
	443	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
	444	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
	445	separator = POSSIBLE_SEPARATORS
	446	.Except(disallowedSeparators)
	447	.Where(c => OccurrencesOf(charCounts, c) > 10)
	448	.OrderBy(c => -OccurrencesOf(charCounts, c))
[13584]	449	.DefaultIfEmpty(' ')
[7849]	450	.First();
	451	}
	452	} else {
	453	// no points and no commas => English format
	454	numberFormat = NumberFormatInfo.InvariantInfo;
	455	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
	456	separator = POSSIBLE_SEPARATORS
	457	.Where(c => OccurrencesOf(charCounts, c) > 10)
	458	.OrderBy(c => -OccurrencesOf(charCounts, c))
	459	.DefaultIfEmpty(' ')
	460	.First();
	461	}
	462	}
	463	}
	464
	465	private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
	466	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
	467	}
	468
	469	#region tokenizer
[13447]	470	// the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
// Kinds of tokens produced by the tokenizer. The numeric values are the ones the compiler
// assigns implicitly; NewLine is 0 and therefore the default value of an unfilled slot.
internal enum TokenTypeEnum {
  NewLine = 0,
  String = 1,
  Double = 2,
  DateTime = 3,
  Missing = 4
}
	474
	475	internal class Tokenizer {
	476	private StreamReader reader;
[13411]	477	// we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
	478	private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
	479	private string[] stringVals = new string[1024];
	480	private double[] doubleVals = new double[1024];
	481	private DateTime[] dateTimeVals = new DateTime[1024];
	482	private int tokenPos;
	483	private int numTokens;
[7849]	484	private NumberFormatInfo numberFormatInfo;
	485	private DateTimeFormatInfo dateTimeFormatInfo;
	486	private char separator;
	487
[13447]	488	// arrays for string.Split()
	489	private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default
	490	private readonly char[] separators;
	491
[7849]	492	private int currentLineNumber = 0;
	493	public int CurrentLineNumber {
	494	get { return currentLineNumber; }
	495	private set { currentLineNumber = value; }
	496	}
	497	private string currentLine;
	498	public string CurrentLine {
	499	get { return currentLine; }
	500	private set { currentLine = value; }
	501	}
[13414]	502	public long BytesRead {
	503	get;
	504	private set;
	505	}
[7849]	506
	507	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
	508	this.reader = reader;
	509	this.numberFormatInfo = numberFormatInfo;
	510	this.dateTimeFormatInfo = dateTimeFormatInfo;
	511	this.separator = separator;
[13447]	512	this.separators = new char[] { separator };
[7849]	513	ReadNextTokens();
	514	}
	515
[13447]	516	public bool HasNext() {
	517	return numTokens > tokenPos \|\| !reader.EndOfStream;
	518	}
	519
	520	public TokenTypeEnum PeekType() {
	521	return tokenTypes[tokenPos];
	522	}
	523
	524	public void Skip() {
	525	// simply skips one token without returning the result values
	526	tokenPos++;
	527	if (numTokens == tokenPos) {
	528	ReadNextTokens();
	529	}
	530	}
	531
	532	public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
	533	type = tokenTypes[tokenPos];
	534	strVal = stringVals[tokenPos];
	535	dblVal = doubleVals[tokenPos];
	536	dateTimeVal = dateTimeVals[tokenPos];
	537	Skip();
	538	}
	539
[7849]	540	private void ReadNextTokens() {
	541	if (!reader.EndOfStream) {
	542	CurrentLine = reader.ReadLine();
[13447]	543	CurrentLineNumber++;
[13925]	544	if (reader.BaseStream.CanSeek) {
[13414]	545	BytesRead = reader.BaseStream.Position;
[13925]	546	} else {
[13414]	547	BytesRead += CurrentLine.Length + 2; // guess
[13584]	548	}
[13411]	549	int i = 0;
[13447]	550	if (!string.IsNullOrWhiteSpace(CurrentLine)) {
	551	foreach (var tok in Split(CurrentLine)) {
	552	TokenTypeEnum type;
[13411]	553	double doubleVal;
	554	DateTime dateTimeValue;
[13447]	555	type = TokenTypeEnum.String; // default
	556	stringVals[i] = tok.Trim();
	557	if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
[13411]	558	type = TokenTypeEnum.Double;
	559	doubleVals[i] = doubleVal;
[14296]	560	} else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue)
	561	&& dateTimeValue.Year > 1 && dateTimeValue.Month > 1 && dateTimeValue.Day > 1 // if no date is given it is returned as 1.1.0001 -> don't allow this
	562	) {
[13411]	563	type = TokenTypeEnum.DateTime;
	564	dateTimeVals[i] = dateTimeValue;
[13526]	565	} else if (string.IsNullOrWhiteSpace(tok)) {
	566	type = TokenTypeEnum.Missing;
[13411]	567	}
[7849]	568
[13447]	569	// couldn't parse the token as an int or float number or datetime value so return a string token
[13411]	570
	571	tokenTypes[i] = type;
	572	i++;
	573
	574	if (i >= tokenTypes.Length) {
	575	// increase buffer size if necessary
	576	IncreaseCapacity(ref tokenTypes);
	577	IncreaseCapacity(ref doubleVals);
	578	IncreaseCapacity(ref stringVals);
	579	IncreaseCapacity(ref dateTimeVals);
	580	}
	581	}
	582	}
	583	tokenTypes[i] = TokenTypeEnum.NewLine;
	584	numTokens = i + 1;
	585	tokenPos = 0;
[7849]	586	}
	587	}
	588
[13447]	589	private IEnumerable<string> Split(string line) {
	590	return separator == WHITESPACECHAR ?
	591	line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
	592	line.Split(separators);
	593	}
	594
[13411]	595	private static void IncreaseCapacity<T>(ref T[] arr) {
	596	int n = (int)Math.Floor(arr.Length * 1.7); // guess
	597	T[] arr2 = new T[n];
	598	Array.Copy(arr, arr2, arr.Length);
	599	arr = arr2;
	600	}
[7849]	601	}
	602	#endregion
	603
	604	#region parsing
	605
	606	private void ParseVariableNames() {
	607	// the first line must contain variable names
[13411]	608	List<string> varNames = new List<string>();
	609
	610	TokenTypeEnum type;
	611	string strVal;
	612	double dblVal;
	613	DateTime dateTimeVal;
	614
	615	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	616
	617	// the first token must be a variable name
	618	if (type != TokenTypeEnum.String)
	619	throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
	620	varNames.Add(strVal);
	621
[13447]	622	while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
[13411]	623	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
	624	varNames.Add(strVal);
[7849]	625	}
[13411]	626	ExpectType(TokenTypeEnum.NewLine);
	627
	628	variableNames = varNames;
[7849]	629	}
	630
// Consumes the next token if it has the expected type; otherwise throws an ArgumentException.
private void ExpectType(TokenTypeEnum expectedToken) {
  TokenTypeEnum actual = tokenizer.PeekType();
  if (actual == expectedToken) {
    tokenizer.Skip();
    return;
  }
  throw new ArgumentException("Error: Expected " + expectedToken + " got " + actual);
}
	636
	637	private void Error(string message, string token, int lineNumber) {
[14285]	638	throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber));
[7849]	639	}
	640	#endregion
	641	}
	642	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences