Context Navigation

source: stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs @ 14113

Visit:

Last change on this file since 14113 was 13974, checked in by gkronber, 8 years ago
#2071: merged r13411,r13413,r13414,r13415,r13419,r13440,r13441,r13442,r13445,r13447,r13525,r13526,r13529,r13584,r13901,r13925 from trunk to stable
File size: 26.4 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22
23	using System;
24	using System.Collections;
25	using System.Collections.Generic;
26	using System.Diagnostics.Contracts;
27	using System.Globalization;
28	using System.IO;
29	using System.Linq;
30	using System.Runtime.Serialization;
31	using System.Text;
32
33	namespace HeuristicLab.Problems.Instances.DataAnalysis {
34	public class TableFileParser : Progress<long> { // reports the number of bytes read
35	private const int BUFFER_SIZE = 65536;
36	// char used to symbolize whitespaces (no missing values can be handled with whitespaces)
37	private const char WHITESPACECHAR = (char)0;
38	private static readonly char[] POSSIBLE_SEPARATORS = new char[] { ',', ';', '\t', WHITESPACECHAR };
39	private Tokenizer tokenizer;
40	private int estimatedNumberOfLines = 200; // initial capacity for columns, will be set automatically when data is read from a file
41
42
43	private Encoding encoding = Encoding.Default;
44
45	public Encoding Encoding {
46	get { return encoding; }
47	set {
48	if (value == null) throw new ArgumentNullException("Encoding");
49	encoding = value;
50	}
51	}
52
53
54	private int rows;
55	public int Rows {
56	get { return rows; }
57	set { rows = value; }
58	}
59
60	private int columns;
61	public int Columns {
62	get { return columns; }
63	set { columns = value; }
64	}
65
66	private List<IList> values;
67	public List<IList> Values {
68	get {
69	return values;
70	}
71	}
72
73	private List<string> variableNames;
74	public IEnumerable<string> VariableNames {
75	get {
76	if (variableNames.Count > 0) return variableNames;
77	else {
78	string[] names = new string[columns];
79	for (int i = 0; i < names.Length; i++) {
80	names[i] = "X" + i.ToString("000");
81	}
82	return names;
83	}
84	}
85	}
86
87	public TableFileParser() {
88	variableNames = new List<string>();
89	}
90
91	public bool AreColumnNamesInFirstLine(string fileName) {
92	NumberFormatInfo numberFormat;
93	DateTimeFormatInfo dateTimeFormatInfo;
94	char separator;
95	DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
96	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
97	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
98	}
99	}
100
101	public bool AreColumnNamesInFirstLine(Stream stream) {
102	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
103	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
104	char separator = ',';
105	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
106	}
107
108	public bool AreColumnNamesInFirstLine(string fileName, NumberFormatInfo numberFormat,
109	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
110	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
111	return AreColumnNamesInFirstLine(stream, numberFormat, dateTimeFormatInfo, separator);
112	}
113	}
114
115	public bool AreColumnNamesInFirstLine(Stream stream, NumberFormatInfo numberFormat,
116	DateTimeFormatInfo dateTimeFormatInfo, char separator) {
117	using (StreamReader reader = new StreamReader(stream, Encoding)) {
118	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
119	return (tokenizer.PeekType() != TokenTypeEnum.Double);
120	}
121	}
122
123	/// <summary>
124	/// Parses a file and determines the format first
125	/// </summary>
126	/// <param name="fileName">file which is parsed</param>
127	/// <param name="columnNamesInFirstLine"></param>
128	public void Parse(string fileName, bool columnNamesInFirstLine, int lineLimit = -1) {
129	NumberFormatInfo numberFormat;
130	DateTimeFormatInfo dateTimeFormatInfo;
131	char separator;
132	DetermineFileFormat(fileName, out numberFormat, out dateTimeFormatInfo, out separator);
133	EstimateNumberOfLines(fileName);
134	Parse(new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
135	}
136
137	/// <summary>
138	/// Parses a file with the given formats
139	/// </summary>
140	/// <param name="fileName">file which is parsed</param>
141	/// <param name="numberFormat">Format of numbers</param>
142	/// <param name="dateTimeFormatInfo">Format of datetime</param>
143	/// <param name="separator">defines the separator</param>
144	/// <param name="columnNamesInFirstLine"></param>
145	public void Parse(string fileName, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
146	EstimateNumberOfLines(fileName);
147	using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)) {
148	Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
149	}
150	}
151
152	// determines the number of newline characters in the first 64KB to guess the number of rows for a file
153	private void EstimateNumberOfLines(string fileName) {
154	var len = new System.IO.FileInfo(fileName).Length;
155	var buf = new char[1024 * 1024];
156	using (var reader = new StreamReader(fileName, Encoding)) {
157	reader.ReadBlock(buf, 0, buf.Length);
158	}
159	int numNewLine = 0;
160	int charsInCurrentLine = 0, charsInFirstLine = 0; // the first line (names) and the last line (incomplete) are not representative
161	foreach (var ch in buf) {
162	charsInCurrentLine++;
163	if (ch == '\n') {
164	if (numNewLine == 0) charsInFirstLine = charsInCurrentLine; // store the number of chars in the first line
165	charsInCurrentLine = 0;
166	numNewLine++;
167	}
168	}
169	if (numNewLine <= 1) {
170	// fail -> keep the default setting
171	return;
172	} else {
173	double charsPerLineFactor = (buf.Length - charsInFirstLine - charsInCurrentLine) / ((double)numNewLine - 1);
174	double estimatedLines = len / charsPerLineFactor;
175	estimatedNumberOfLines = (int)Math.Round(estimatedLines * 1.1); // pessimistic allocation of 110% to make sure that the list is very likely large enough
176	}
177	}
178
179	/// <summary>
180	/// Takes a Stream and parses it with default format. NumberFormatInfo.InvariantInfo, DateTimeFormatInfo.InvariantInfo and separator = ','
181	/// </summary>
182	/// <param name="stream">stream which is parsed</param>
183	/// <param name="columnNamesInFirstLine"></param>
184	public void Parse(Stream stream, bool columnNamesInFirstLine, int lineLimit = -1) {
185	NumberFormatInfo numberFormat = NumberFormatInfo.InvariantInfo;
186	DateTimeFormatInfo dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
187	char separator = ',';
188	Parse(stream, numberFormat, dateTimeFormatInfo, separator, columnNamesInFirstLine, lineLimit);
189	}
190
191	/// <summary>
192	/// Parses a stream with the given formats.
193	/// </summary>
194	/// <param name="stream">Stream which is parsed</param>
195	/// <param name="numberFormat">Format of numbers</param>
196	/// <param name="dateTimeFormatInfo">Format of datetime</param>
197	/// <param name="separator">defines the separator</param>
198	/// <param name="columnNamesInFirstLine"></param>
199	public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
200	using (StreamReader reader = new StreamReader(stream, Encoding)) {
201	tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
202	values = new List<IList>();
203	if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
204
205	if (columnNamesInFirstLine) {
206	ParseVariableNames();
207	if (!tokenizer.HasNext())
208	Error(
209	"Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
210	"", tokenizer.CurrentLineNumber);
211	}
212
213
214	// read values... start in first row
215	int nLinesParsed = 0;
216	int colIdx = 0;
217	int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)
218	while (tokenizer.HasNext() && (lineLimit < 0 \|\| nLinesParsed < lineLimit)) {
219	if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {
220	tokenizer.Skip();
221
222	// all rows have to have the same number of values
223	// the first row defines how many samples are needed
224	if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row
225	else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
226	Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
227	"Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
228	tokenizer.CurrentLineNumber);
229	}
230	OnReport(tokenizer.BytesRead);
231
232	nLinesParsed++;
233	colIdx = 0;
234	} else {
235	// read one value
236	TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
237	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
238
239	// initialize columns on the first row (fixing data types as presented in the first row...)
240	if (nLinesParsed == 0) {
241	values.Add(CreateList(type, estimatedNumberOfLines));
242	} else if (colIdx == values.Count) {
243	Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
244	"Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
245	tokenizer.CurrentLineNumber);
246	}
247	if (!IsColumnTypeCompatible(values[colIdx], type)) {
248	values[colIdx] = ConvertToStringColumn(values[colIdx]);
249	}
250	// add the value to the column
251	AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
252	}
253	}
254
255	if (!values.Any() \|\| values.First().Count == 0)
256	Error("Couldn't parse data values. Probably because of incorrect number format " +
257	"(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
258	}
259
260	this.rows = values.First().Count;
261	this.columns = values.Count;
262
263	// after everything has been parsed make sure the lists are as compact as possible
264	foreach (var l in values) {
265	var dblList = l as List<double>;
266	var byteList = l as List<byte>;
267	var dateList = l as List<DateTime>;
268	var stringList = l as List<string>;
269	var objList = l as List<object>;
270	if (dblList != null) dblList.TrimExcess();
271	if (byteList != null) byteList.TrimExcess();
272	if (dateList != null) dateList.TrimExcess();
273	if (stringList != null) stringList.TrimExcess();
274	if (objList != null) objList.TrimExcess();
275	}
276
277	// for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
278	GC.Collect(2, GCCollectionMode.Forced);
279	}
280
281	#region type-dependent dispatch
282	private bool IsColumnTypeCompatible(IList list, TokenTypeEnum tokenType) {
283	return (list is List<string>) \|\| // all tokens can be added to a string list
284	(tokenType == TokenTypeEnum.Missing) \|\| // empty entries are allowed in all columns
285	(tokenType == TokenTypeEnum.Double && list is List<double>) \|\|
286	(tokenType == TokenTypeEnum.DateTime && list is List<DateTime>);
287	}
288
289	// all columns are converted to string columns when we find an non-empty value that has incorrect type
290	private IList ConvertToStringColumn(IList list) {
291	var dblL = list as List<double>;
292	if (dblL != null) {
293	var l = new List<string>(dblL.Capacity);
294	l.AddRange(dblL.Select(dbl => dbl.ToString()));
295	return l;
296	}
297
298	var dtL = list as List<DateTime>;
299	if (dtL != null) {
300	var l = new List<string>(dtL.Capacity);
301	l.AddRange(dtL.Select(dbl => dbl.ToString()));
302	return l;
303	}
304
305	if (list is List<string>) return list;
306
307	throw new InvalidProgramException(string.Format("Cannot convert column of type {0} to string column", list.GetType()));
308	}
309
310	private void AddValue(TokenTypeEnum type, IList list, string strVal, double dblVal, DateTime dateTimeVal) {
311	var dblList = list as List<double>;
312	if (dblList != null) {
313	AddValue(type, dblList, dblVal);
314	return;
315	}
316
317	var strList = list as List<string>;
318	if (strList != null) {
319	AddValue(type, strList, strVal);
320	return;
321	}
322	var dtList = list as List<DateTime>;
323	if (dtList != null) {
324	AddValue(type, dtList, dateTimeVal);
325	return;
326	}
327
328	list.Add(strVal); // assumes List<object>
329	}
330
331	private void AddValue(TokenTypeEnum type, List<double> list, double dblVal) {
332	Contract.Assert(type == TokenTypeEnum.Missing \|\| type == TokenTypeEnum.Double);
333	list.Add(type == TokenTypeEnum.Missing ? double.NaN : dblVal);
334	}
335
336	private void AddValue(TokenTypeEnum type, List<string> list, string strVal) {
337	// assumes that strVal is always set to the original token read from the input file
338	list.Add(type == TokenTypeEnum.Missing ? string.Empty : strVal);
339	}
340
341	private void AddValue(TokenTypeEnum type, List<DateTime> list, DateTime dtVal) {
342	Contract.Assert(type == TokenTypeEnum.Missing \|\| type == TokenTypeEnum.DateTime);
343	list.Add(type == TokenTypeEnum.Missing ? DateTime.MinValue : dtVal);
344	}
345
346	private IList CreateList(TokenTypeEnum type, int estimatedNumberOfLines) {
347	switch (type) {
348	case TokenTypeEnum.String:
349	return new List<string>(estimatedNumberOfLines);
350	case TokenTypeEnum.Double:
351	case TokenTypeEnum.Missing: // assume double columns
352	return new List<double>(estimatedNumberOfLines);
353	case TokenTypeEnum.DateTime:
354	return new List<DateTime>(estimatedNumberOfLines);
355	default:
356	throw new InvalidOperationException();
357	}
358	}
359	#endregion
360
361	public static void DetermineFileFormat(string path, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
362	DetermineFileFormat(new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite), out numberFormat, out dateTimeFormatInfo, out separator);
363	}
364
365	public static void DetermineFileFormat(Stream stream, out NumberFormatInfo numberFormat, out DateTimeFormatInfo dateTimeFormatInfo, out char separator) {
366	using (StreamReader reader = new StreamReader(stream)) {
367	// skip first line
368	reader.ReadLine();
369	// read a block
370	char[] buffer = new char[BUFFER_SIZE];
371	int charsRead = reader.ReadBlock(buffer, 0, BUFFER_SIZE);
372	// count frequency of special characters
373	Dictionary<char, int> charCounts = buffer.Take(charsRead)
374	.GroupBy(c => c)
375	.ToDictionary(g => g.Key, g => g.Count());
376
377	// depending on the characters occuring in the block
378	// we distinghish a number of different cases based on the the following rules:
379	// many points => it must be English number format, the other frequently occuring char is the separator
380	// no points but many commas => this is the problematic case. Either German format (real numbers) or English format (only integer numbers) with ',' as separator
381	// => check the line in more detail:
382	// English: 0, 0, 0, 0
383	// German: 0,0 0,0 0,0 ...
384	// => if commas are followed by space => English format
385	// no points no commas => English format (only integer numbers) use the other frequently occuring char as separator
386	// in all cases only treat ' ' as separator if no other separator is possible (spaces can also occur additionally to separators)
387	if (OccurrencesOf(charCounts, '.') > 10) {
388	numberFormat = NumberFormatInfo.InvariantInfo;
389	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
390	separator = POSSIBLE_SEPARATORS
391	.Where(c => OccurrencesOf(charCounts, c) > 10)
392	.OrderBy(c => -OccurrencesOf(charCounts, c))
393	.DefaultIfEmpty(' ')
394	.First();
395	} else if (OccurrencesOf(charCounts, ',') > 10) {
396	// no points and many commas
397	// count the number of tokens (chains of only digits and commas) that contain multiple comma characters
398	int tokensWithMultipleCommas = 0;
399	for (int i = 0; i < charsRead; i++) {
400	int nCommas = 0;
401	while (i < charsRead && (buffer[i] == ',' \|\| Char.IsDigit(buffer[i]))) {
402	if (buffer[i] == ',') nCommas++;
403	i++;
404	}
405	if (nCommas > 2) tokensWithMultipleCommas++;
406	}
407	if (tokensWithMultipleCommas > 1) {
408	// English format (only integer values) with ',' as separator
409	numberFormat = NumberFormatInfo.InvariantInfo;
410	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
411	separator = ',';
412	} else {
413	char[] disallowedSeparators = new char[] { ',' }; // n. def. contains a space so ' ' should be disallowed to, however existing unit tests would fail
414	// German format (real values)
415	numberFormat = NumberFormatInfo.GetInstance(new CultureInfo("de-DE"));
416	dateTimeFormatInfo = DateTimeFormatInfo.GetInstance(new CultureInfo("de-DE"));
417	separator = POSSIBLE_SEPARATORS
418	.Except(disallowedSeparators)
419	.Where(c => OccurrencesOf(charCounts, c) > 10)
420	.OrderBy(c => -OccurrencesOf(charCounts, c))
421	.DefaultIfEmpty(' ')
422	.First();
423	}
424	} else {
425	// no points and no commas => English format
426	numberFormat = NumberFormatInfo.InvariantInfo;
427	dateTimeFormatInfo = DateTimeFormatInfo.InvariantInfo;
428	separator = POSSIBLE_SEPARATORS
429	.Where(c => OccurrencesOf(charCounts, c) > 10)
430	.OrderBy(c => -OccurrencesOf(charCounts, c))
431	.DefaultIfEmpty(' ')
432	.First();
433	}
434	}
435	}
436
437	private static int OccurrencesOf(Dictionary<char, int> charCounts, char c) {
438	return charCounts.ContainsKey(c) ? charCounts[c] : 0;
439	}
440
441	#region tokenizer
442	// the tokenizer reads full lines and returns separated tokens in the line as well as a terminating end-of-line character
443	internal enum TokenTypeEnum {
444	NewLine, String, Double, DateTime, Missing
445	}
446
447	internal class Tokenizer {
448	private StreamReader reader;
449	// we assume that a buffer of 1024 tokens for a line is sufficient most of the time (the buffer is increased below if necessary)
450	private TokenTypeEnum[] tokenTypes = new TokenTypeEnum[1024];
451	private string[] stringVals = new string[1024];
452	private double[] doubleVals = new double[1024];
453	private DateTime[] dateTimeVals = new DateTime[1024];
454	private int tokenPos;
455	private int numTokens;
456	private NumberFormatInfo numberFormatInfo;
457	private DateTimeFormatInfo dateTimeFormatInfo;
458	private char separator;
459
460	// arrays for string.Split()
461	private readonly char[] whiteSpaceSeparators = new char[0]; // string split uses separators as default
462	private readonly char[] separators;
463
464	private int currentLineNumber = 0;
465	public int CurrentLineNumber {
466	get { return currentLineNumber; }
467	private set { currentLineNumber = value; }
468	}
469	private string currentLine;
470	public string CurrentLine {
471	get { return currentLine; }
472	private set { currentLine = value; }
473	}
474	public long BytesRead {
475	get;
476	private set;
477	}
478
479	public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, DateTimeFormatInfo dateTimeFormatInfo, char separator) {
480	this.reader = reader;
481	this.numberFormatInfo = numberFormatInfo;
482	this.dateTimeFormatInfo = dateTimeFormatInfo;
483	this.separator = separator;
484	this.separators = new char[] { separator };
485	ReadNextTokens();
486	}
487
488	public bool HasNext() {
489	return numTokens > tokenPos \|\| !reader.EndOfStream;
490	}
491
492	public TokenTypeEnum PeekType() {
493	return tokenTypes[tokenPos];
494	}
495
496	public void Skip() {
497	// simply skips one token without returning the result values
498	tokenPos++;
499	if (numTokens == tokenPos) {
500	ReadNextTokens();
501	}
502	}
503
504	public void Next(out TokenTypeEnum type, out string strVal, out double dblVal, out DateTime dateTimeVal) {
505	type = tokenTypes[tokenPos];
506	strVal = stringVals[tokenPos];
507	dblVal = doubleVals[tokenPos];
508	dateTimeVal = dateTimeVals[tokenPos];
509	Skip();
510	}
511
512	private void ReadNextTokens() {
513	if (!reader.EndOfStream) {
514	CurrentLine = reader.ReadLine();
515	CurrentLineNumber++;
516	if (reader.BaseStream.CanSeek) {
517	BytesRead = reader.BaseStream.Position;
518	} else {
519	BytesRead += CurrentLine.Length + 2; // guess
520	}
521	int i = 0;
522	if (!string.IsNullOrWhiteSpace(CurrentLine)) {
523	foreach (var tok in Split(CurrentLine)) {
524	TokenTypeEnum type;
525	double doubleVal;
526	DateTime dateTimeValue;
527	type = TokenTypeEnum.String; // default
528	stringVals[i] = tok.Trim();
529	if (double.TryParse(tok, NumberStyles.Float, numberFormatInfo, out doubleVal)) {
530	type = TokenTypeEnum.Double;
531	doubleVals[i] = doubleVal;
532	} else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
533	type = TokenTypeEnum.DateTime;
534	dateTimeVals[i] = dateTimeValue;
535	} else if (string.IsNullOrWhiteSpace(tok)) {
536	type = TokenTypeEnum.Missing;
537	}
538
539	// couldn't parse the token as an int or float number or datetime value so return a string token
540
541	tokenTypes[i] = type;
542	i++;
543
544	if (i >= tokenTypes.Length) {
545	// increase buffer size if necessary
546	IncreaseCapacity(ref tokenTypes);
547	IncreaseCapacity(ref doubleVals);
548	IncreaseCapacity(ref stringVals);
549	IncreaseCapacity(ref dateTimeVals);
550	}
551	}
552	}
553	tokenTypes[i] = TokenTypeEnum.NewLine;
554	numTokens = i + 1;
555	tokenPos = 0;
556	}
557	}
558
559	private IEnumerable<string> Split(string line) {
560	return separator == WHITESPACECHAR ?
561	line.Split(whiteSpaceSeparators, StringSplitOptions.RemoveEmptyEntries) :
562	line.Split(separators);
563	}
564
565	private static void IncreaseCapacity<T>(ref T[] arr) {
566	int n = (int)Math.Floor(arr.Length * 1.7); // guess
567	T[] arr2 = new T[n];
568	Array.Copy(arr, arr2, arr.Length);
569	arr = arr2;
570	}
571	}
572	#endregion
573
574	#region parsing
575
576	private void ParseVariableNames() {
577	// the first line must contain variable names
578	List<string> varNames = new List<string>();
579
580	TokenTypeEnum type;
581	string strVal;
582	double dblVal;
583	DateTime dateTimeVal;
584
585	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
586
587	// the first token must be a variable name
588	if (type != TokenTypeEnum.String)
589	throw new ArgumentException("Error: Expected " + TokenTypeEnum.String + " got " + type);
590	varNames.Add(strVal);
591
592	while (tokenizer.HasNext() && tokenizer.PeekType() != TokenTypeEnum.NewLine) {
593	tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
594	varNames.Add(strVal);
595	}
596	ExpectType(TokenTypeEnum.NewLine);
597
598	variableNames = varNames;
599	}
600
601	private void ExpectType(TokenTypeEnum expectedToken) {
602	if (tokenizer.PeekType() != expectedToken)
603	throw new ArgumentException("Error: Expected " + expectedToken + " got " + tokenizer.PeekType());
604	tokenizer.Skip();
605	}
606
607	private void Error(string message, string token, int lineNumber) {
608	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
609	}
610	#endregion
611
612	[Serializable]
613	public class DataFormatException : Exception {
614	private int line;
615	public int Line {
616	get { return line; }
617	}
618	private string token;
619	public string Token {
620	get { return token; }
621	}
622	public DataFormatException(string message, string token, int line)
623	: base(message + "\nToken: " + token + " (line: " + line + ")") {
624	this.token = token;
625	this.line = line;
626	}
627
628	public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
629	}
630	}
631	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences