Context Navigation

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs @ 4745

Visit:

Last change on this file since 4745 was 4239, checked in by gkronber, 14 years ago
Merged improvements of symbolic simplifier (revisions: r4220, r4226, r4235:4238) back into trunk. #1026
File size: 10.5 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
[3264]	3	* Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections.Generic;
	24	using System.Globalization;
	25	using System.IO;
[2446]	26	using System.Linq;
	27	using System.Text;
[2]	28
[3373]	29	namespace HeuristicLab.Problems.DataAnalysis {
/// <summary>
/// Parses a text file of separator-delimited numeric values into a rectangular
/// <c>double[,]</c> matrix. The file is read with the invariant culture's number format
/// and the tokenizer's default separator (';'). If the first line does not start with a
/// number it is interpreted as a header line containing the variable names.
/// </summary>
public class CsvFileParser {
  private Tokenizer tokenizer;
  private List<string> variableNames;
  private List<List<double>> rowValues;

  /// <summary>Number of data rows found by the last successful <see cref="Parse(string)"/>.</summary>
  public int Rows { get; set; }

  /// <summary>Number of columns found by the last successful <see cref="Parse(string)"/>.</summary>
  public int Columns { get; set; }

  /// <summary>
  /// The parsed values, indexed as [row, column]. Empty cells and cells that are not
  /// numbers are stored as <see cref="double.NaN"/>.
  /// </summary>
  public double[,] Values { get; private set; }

  /// <summary>
  /// The variable names read from the header line. When no header line was found,
  /// one generated name per column is returned instead ("X000", "X001", ...).
  /// </summary>
  public IEnumerable<string> VariableNames {
    get {
      if (variableNames.Count > 0) return variableNames;

      string[] names = new string[Columns];
      for (int i = 0; i < names.Length; i++) {
        names[i] = "X" + i.ToString("000");
      }
      return names;
    }
  }

  /// <summary>Creates a parser without any parsed data.</summary>
  public CsvFileParser() {
    Reset();
  }

  /// <summary>
  /// Discards the intermediate results of a previous parse. Fresh lists are allocated
  /// (instead of clearing the old ones) so that a list that was handed out through
  /// <see cref="VariableNames"/> earlier is not modified behind the caller's back.
  /// </summary>
  private void Reset() {
    variableNames = new List<string>();
    rowValues = new List<List<double>>();
  }

  /// <summary>
  /// Parses the given file and makes the result available through <see cref="Rows"/>,
  /// <see cref="Columns"/>, <see cref="Values"/> and <see cref="VariableNames"/>.
  /// Results of an earlier call on the same instance are replaced, not appended to.
  /// </summary>
  /// <param name="fileName">Path of the file to read.</param>
  /// <exception cref="DataFormatException">
  /// The file contains no data rows, a row has a different number of columns than the
  /// first row, or an unexpected token was encountered.
  /// </exception>
  public void Parse(string fileName) {
    TryParse(fileName);

    // TryParse only returns normally when at least one row was read and all rows
    // have the same number of values, so rowValues[0] is safe here.
    Rows = rowValues.Count;
    Columns = rowValues[0].Count;

    // copy the list of rows into a dense matrix
    double[,] matrix = new double[Rows, Columns];
    for (int r = 0; r < Rows; r++) {
      List<double> row = rowValues[r];
      for (int c = 0; c < Columns; c++) {
        matrix[r, c] = row[c];
      }
    }
    Values = matrix;
  }

  /// <summary>
  /// Tries to parse the file with every supported number format (currently only the
  /// invariant culture) and returns after the first attempt that succeeds.
  /// </summary>
  private void TryParse(string fileName) {
    Exception lastEx = null;
    NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.InvariantCulture.NumberFormat };
    foreach (NumberFormatInfo numberFormat in possibleFormats) {
      // start every attempt from a clean state so that neither an earlier Parse call
      // nor a failed attempt leaves rows or names behind
      Reset();
      using (StreamReader reader = new StreamReader(fileName)) {
        tokenizer = new Tokenizer(reader, numberFormat);
        try {
          // parse the file
          Parse();
          return; // parsed without errors -> return;
        }
        catch (DataFormatException ex) {
          lastEx = ex;
        }
      }
    }
    // all number formats threw an exception -> rethrow the last exception
    throw lastEx;
  }

  #region tokenizer
  internal enum TokenTypeEnum {
    NewLine, Separator, String, Double
  }

  /// <summary>A single lexical element of the input together with its parsed value.</summary>
  internal class Token {
    public TokenTypeEnum type;
    public string stringValue;
    public double doubleValue;

    public Token(TokenTypeEnum type, string value) {
      this.type = type;
      stringValue = value;
      doubleValue = 0.0;
    }

    public override string ToString() {
      return stringValue;
    }
  }


  /// <summary>
  /// Splits the input line by line into value, separator and newline tokens.
  /// Separators and line ends are always represented by the shared
  /// <see cref="SeparatorToken"/> and <see cref="NewlineToken"/> instances, so callers
  /// can compare them by reference.
  /// </summary>
  internal class Tokenizer {
    private readonly StreamReader reader;
    // tokens of the current line that have not been consumed yet; a queue gives O(1)
    // removal at the front
    private readonly Queue<Token> tokens;
    private readonly NumberFormatInfo numberFormatInfo;
    private readonly char separator;

    /// <summary>Number of lines that have been read from the reader so far.</summary>
    public int CurrentLineNumber { get; private set; }

    /// <summary>Text of the line that was read most recently.</summary>
    public string CurrentLine { get; private set; }

    public Token NewlineToken { get; private set; }

    public Token SeparatorToken { get; private set; }

    /// <summary>
    /// Creates a tokenizer and immediately reads the first line of the input.
    /// </summary>
    /// <param name="reader">Source of the text; it is not disposed by the tokenizer.</param>
    /// <param name="numberFormatInfo">Number format used to recognize numeric tokens.</param>
    /// <param name="separator">Character that separates the values within a line.</param>
    public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
      this.reader = reader;
      this.numberFormatInfo = numberFormatInfo;
      this.separator = separator;
      SeparatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
      NewlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
      tokens = new Queue<Token>();
      ReadNextTokens();
    }

    /// <summary>Creates a tokenizer that uses ';' as separator.</summary>
    public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
      : this(reader, numberFormatInfo, ';') {
    }

    /// <summary>
    /// Reads the next line (if there is one) and appends its tokens followed by a
    /// newline token to the queue.
    /// </summary>
    private void ReadNextTokens() {
      if (reader.EndOfStream) return;

      CurrentLine = reader.ReadLine();
      StringBuilder field = new StringBuilder();
      foreach (char c in CurrentLine) {
        if (c == separator) {
          EnqueueValue(field.ToString());
          field = new StringBuilder();
          // the separator is emitted directly (not via trimming/filtering) so that
          // whitespace separators such as '\t' are not lost
          tokens.Enqueue(SeparatorToken);
        } else {
          field.Append(c);
        }
      }
      EnqueueValue(field.ToString());
      tokens.Enqueue(NewlineToken);
      CurrentLineNumber++;
    }

    /// <summary>Adds a token for the given field text; blank fields produce no token.</summary>
    private void EnqueueValue(string text) {
      string trimmed = text.Trim();
      if (trimmed.Length > 0) {
        tokens.Enqueue(MakeToken(trimmed));
      }
    }

    /// <summary>
    /// Creates a Double token if the text can be parsed as a floating point number
    /// in the configured number format and a String token otherwise.
    /// </summary>
    private Token MakeToken(string strToken) {
      Token token = new Token(TokenTypeEnum.String, strToken);
      if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
        token.type = TokenTypeEnum.Double;
      }
      return token;
    }

    /// <summary>Returns the next token without consuming it. Check <see cref="HasNext"/> first.</summary>
    public Token Peek() {
      return tokens.Peek();
    }

    /// <summary>
    /// Consumes and returns the next token. When the last token of a line has been
    /// consumed the following line is read right away, which also advances
    /// <see cref="CurrentLineNumber"/>.
    /// </summary>
    public Token Next() {
      Token next = tokens.Dequeue();
      if (tokens.Count == 0) {
        ReadNextTokens();
      }
      return next;
    }

    public bool HasNext() {
      return tokens.Count > 0 || !reader.EndOfStream;
    }
  }
  #endregion

  #region parsing
  /// <summary>Parses the optional header line followed by all data rows.</summary>
  private void Parse() {
    ParseVariableNames();
    if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    ParseValues();
    if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
  }

  /// <summary>
  /// Reads all remaining lines as data rows and checks that every row has as many
  /// values as the first one.
  /// </summary>
  private void ParseValues() {
    while (tokenizer.HasNext()) {
      // remember the line of this row now: consuming the newline token below makes
      // the tokenizer read ahead, so CurrentLineNumber would point to the next line
      int lineNumber = tokenizer.CurrentLineNumber;

      List<double> row = new List<double>();
      row.Add(NextValue(tokenizer));
      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        row.Add(NextValue(tokenizer));
      }
      Expect(tokenizer.NewlineToken);
      // all rows have to have the same number of values
      // the first row defines how many samples are needed
      if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
        Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
              "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
      }
      rowValues.Add(row);
    }
  }

  /// <summary>
  /// Reads the value of the next cell. An empty cell (the next token is a separator
  /// or a newline, which is left in place) and a cell holding text that is not a
  /// number both yield <see cref="double.NaN"/>.
  /// </summary>
  private double NextValue(Tokenizer tokenizer) {
    if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
    Token current = tokenizer.Next();
    if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
      return double.NaN;
    } else if (current.type == TokenTypeEnum.Double) {
      // just take the value
      return current.doubleValue;
    }
    // found an unexpected token => throw error
    Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
    // this line is never executed because Error() throws an exception
    throw new InvalidOperationException();
  }

  /// <summary>
  /// Reads the first line as variable names, unless it starts with a number, in which
  /// case nothing is consumed and the variable names stay empty.
  /// </summary>
  private void ParseVariableNames() {
    // if the first line doesn't start with a double value then we assume that the
    // first line contains variable names
    if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

    List<Token> nameTokens = new List<Token>();
    Token first = tokenizer.Next();
    nameTokens.Add(first);
    // if the first token already was the newline the header line is finished;
    // tokens of the following line must not be touched
    if (first != tokenizer.NewlineToken) {
      while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        // a trailing separator ends the header; stop before the newline so that a
        // following line that starts with a separator is not read as part of it
        if (tokenizer.Peek() == tokenizer.NewlineToken) break;
        nameTokens.Add(tokenizer.Next());
      }
      Expect(tokenizer.NewlineToken);
    }
    variableNames = nameTokens.Select(x => x.stringValue.Trim()).ToList();
  }

  /// <summary>Consumes the next token and raises a parse error if it is not the expected one.</summary>
  private void Expect(Token expectedToken) {
    Token actualToken = tokenizer.Next();
    if (actualToken != expectedToken) {
      Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
    }
  }

  /// <summary>Always throws a <see cref="DataFormatException"/> describing the problem.</summary>
  private void Error(string message, string token, int lineNumber) {
    throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
  }
  #endregion
}
	316	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences