Context Navigation

source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.3/CsvFileParser.cs @ 3746

Visit:

Last change on this file since 3746 was 3544, checked in by gkronber, 15 years ago
Changed `CsvFileParser` to parse double values with current culture instead of invariant culture. #938 (Data types and operators for regression problems)
File size: 10.5 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
[3264]	3	* Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[2]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections.Generic;
	24	using System.Globalization;
	25	using System.IO;
[2446]	26	using System.Linq;
[2]	27	using HeuristicLab.Data;
[2446]	28	using System.Text;
[2]	29
namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Parses a text file of separator-delimited numeric values into a rectangular
  /// <c>double[,]</c> matrix plus an optional list of variable (column) names.
  /// </summary>
  /// <remarks>
  /// Values are separated by ';' and numbers are parsed with the number format of
  /// <see cref="CultureInfo.CurrentCulture"/>. If the first line does not start with a
  /// number it is treated as a header line containing the variable names. Cells that are
  /// empty or cannot be parsed as a number are stored as <see cref="double.NaN"/>.
  /// </remarks>
  public class CsvFileParser {
    private Tokenizer tokenizer;
    private List<string> variableNames;
    private List<List<double>> rowValues;

    private int rows;
    /// <summary>Number of data rows found by the last successful <see cref="Parse(string)"/>.</summary>
    public int Rows {
      get { return rows; }
      set { rows = value; }
    }

    private int columns;
    /// <summary>Number of columns found by the last successful <see cref="Parse(string)"/>.</summary>
    public int Columns {
      get { return columns; }
      set { columns = value; }
    }

    private double[,] values;
    /// <summary>
    /// The parsed values indexed as [row, column]; <c>null</c> until a file has been parsed.
    /// </summary>
    public double[,] Values {
      get { return values; }
    }

    /// <summary>
    /// The variable names read from the header line. If the file had no header line,
    /// names of the form "X000", "X001", ... are generated, one per column.
    /// </summary>
    public IEnumerable<string> VariableNames {
      get {
        if (variableNames.Count > 0) return variableNames;

        string[] names = new string[columns];
        for (int i = 0; i < names.Length; i++) {
          names[i] = "X" + i.ToString("000");
        }
        return names;
      }
    }

    /// <summary>Creates a parser without any parsed data.</summary>
    public CsvFileParser() {
      rowValues = new List<List<double>>();
      variableNames = new List<string>();
    }

    /// <summary>
    /// Discards the intermediate results of a previous parse attempt.
    /// </summary>
    private void Reset() {
      // Fresh lists instead of Clear(): a list handed out earlier through VariableNames
      // must not be emptied behind the caller's back.
      variableNames = new List<string>();
      rowValues = new List<List<double>>();
    }

    /// <summary>
    /// Parses the given file and stores the result in <see cref="Values"/>,
    /// <see cref="Rows"/>, <see cref="Columns"/> and <see cref="VariableNames"/>.
    /// </summary>
    /// <param name="fileName">Path of the file to parse.</param>
    /// <exception cref="DataFormatException">
    /// The file contains no data values or its rows have differing numbers of columns.
    /// </exception>
    public void Parse(string fileName) {
      TryParse(fileName);

      // Copy the list of rows into the rectangular result matrix. ParseValues() guarantees
      // at least one row and the same number of columns in every row.
      rows = rowValues.Count;
      columns = rowValues[0].Count;
      values = new double[rows, columns];
      for (int r = 0; r < rows; r++) {
        List<double> row = rowValues[r];
        for (int c = 0; c < columns; c++) {
          values[r, c] = row[c];
        }
      }
    }

    /// <summary>
    /// Tries to parse the file with each candidate number format in turn and returns after
    /// the first attempt that succeeds. If every attempt fails, the exception of the last
    /// attempt propagates to the caller.
    /// </summary>
    private void TryParse(string fileName) {
      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { CultureInfo.CurrentCulture.NumberFormat };
      for (int i = 0; i < possibleFormats.Length; i++) {
        bool isLastAttempt = i == possibleFormats.Length - 1;
        // Start every attempt from a clean state so that neither an earlier Parse() call
        // nor a failed attempt leaks rows or names into this one.
        Reset();
        using (StreamReader reader = new StreamReader(fileName)) {
          tokenizer = new Tokenizer(reader, possibleFormats[i]);
          try {
            Parse();
            return; // parsed without errors
          }
          catch (DataFormatException) {
            // 'throw;' keeps the original stack trace of the parse error.
            if (isLastAttempt) throw;
          }
        }
      }
    }

    #region tokenizer
    /// <summary>Kinds of tokens produced by the <see cref="Tokenizer"/>.</summary>
    internal enum TokenTypeEnum {
      NewLine, Separator, String, Double
    }

    /// <summary>
    /// A single token: its kind, its text and, for <see cref="TokenTypeEnum.Double"/>
    /// tokens, the parsed numeric value.
    /// </summary>
    internal class Token {
      public TokenTypeEnum type;
      public string stringValue;
      public double doubleValue;

      public Token(TokenTypeEnum type, string value) {
        this.type = type;
        stringValue = value;
        doubleValue = 0.0;
      }

      public override string ToString() {
        return stringValue;
      }
    }


    /// <summary>
    /// Splits the lines of a text stream into value, separator and newline tokens.
    /// Tokens are produced one line at a time.
    /// </summary>
    internal class Tokenizer {
      private readonly StreamReader reader;
      // FIFO queue: tokens are only ever consumed from the front.
      private readonly Queue<Token> tokens;
      private readonly NumberFormatInfo numberFormatInfo;
      private readonly char separator;

      /// <summary>Number of lines read from the stream so far.</summary>
      public int CurrentLineNumber { get; private set; }

      /// <summary>Text of the line that was read most recently.</summary>
      public string CurrentLine { get; private set; }

      /// <summary>The single shared token instance that marks the end of a line.</summary>
      public Token NewlineToken { get; private set; }

      /// <summary>The single shared token instance that marks a separator between two values.</summary>
      public Token SeparatorToken { get; private set; }

      /// <summary>
      /// Creates a tokenizer and immediately reads the first line of the stream.
      /// </summary>
      /// <param name="reader">Stream to read from; it is not disposed by the tokenizer.</param>
      /// <param name="numberFormatInfo">Number format used to recognize numeric values.</param>
      /// <param name="separator">Character that separates two values within a line.</param>
      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
        this.reader = reader;
        this.numberFormatInfo = numberFormatInfo;
        this.separator = separator;
        SeparatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
        NewlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
        tokens = new Queue<Token>();
        ReadNextTokens();
      }

      /// <summary>Creates a tokenizer that uses ';' as value separator.</summary>
      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
        : this(reader, numberFormatInfo, ';') {
      }

      /// <summary>
      /// Reads the next line (if there is one) and appends its tokens followed by a
      /// newline token to the queue.
      /// </summary>
      private void ReadNextTokens() {
        if (reader.EndOfStream) return;

        CurrentLine = reader.ReadLine();
        StringBuilder field = new StringBuilder();
        foreach (char c in CurrentLine) {
          // Split at the configured separator (not at a hard-coded ';').
          if (c == separator) {
            EnqueueValueToken(field);
            tokens.Enqueue(SeparatorToken);
          } else {
            field.Append(c);
          }
        }
        EnqueueValueToken(field);
        tokens.Enqueue(NewlineToken);
        CurrentLineNumber++;
      }

      /// <summary>
      /// Turns the collected field text into a token and empties the buffer.
      /// Fields that are empty after trimming produce no token.
      /// </summary>
      private void EnqueueValueToken(StringBuilder field) {
        string text = field.ToString().Trim();
        field.Length = 0;
        if (text.Length > 0) tokens.Enqueue(MakeToken(text));
      }

      /// <summary>
      /// Creates a <see cref="TokenTypeEnum.Double"/> token if the text is a number in the
      /// configured number format, otherwise a <see cref="TokenTypeEnum.String"/> token.
      /// </summary>
      private Token MakeToken(string strToken) {
        Token token = new Token(TokenTypeEnum.String, strToken);
        if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
          token.type = TokenTypeEnum.Double;
        }
        return token;
      }

      /// <summary>Returns the next token without consuming it.</summary>
      public Token Peek() {
        return tokens.Peek();
      }

      /// <summary>
      /// Consumes and returns the next token. When the last token of a line has been
      /// consumed, the following line is read right away.
      /// </summary>
      public Token Next() {
        Token next = tokens.Dequeue();
        if (tokens.Count == 0) {
          ReadNextTokens();
        }
        return next;
      }

      /// <summary>True while there are buffered tokens or unread lines.</summary>
      public bool HasNext() {
        return tokens.Count > 0 || !reader.EndOfStream;
      }
    }
    #endregion

    #region parsing
    /// <summary>
    /// Message used when a file yields no data rows. Numbers are parsed with the current
    /// culture, so the message names that culture's decimal separator.
    /// </summary>
    private static string NoValuesMessage() {
      return "Couldn't parse data values. Probably because of incorrect number format " +
             "(the parser expects numbers in the format of the current culture, decimal separator '" +
             CultureInfo.CurrentCulture.NumberFormat.NumberDecimalSeparator + "').";
    }

    /// <summary>
    /// Parses the token stream of the current tokenizer: an optional header line followed
    /// by at least one line of values.
    /// </summary>
    private void Parse() {
      ParseVariableNames();
      if (!tokenizer.HasNext()) Error(NoValuesMessage(), "", tokenizer.CurrentLineNumber);
      ParseValues();
      if (rowValues.Count == 0) Error(NoValuesMessage(), "", tokenizer.CurrentLineNumber);
    }

    /// <summary>
    /// Reads all remaining lines as rows of values. Every row must have as many values as
    /// the first row.
    /// </summary>
    private void ParseValues() {
      while (tokenizer.HasNext()) {
        // Remember the line of this row now: consuming its newline token below makes the
        // tokenizer read (and count) the following line already.
        int lineNumber = tokenizer.CurrentLineNumber;

        List<double> row = new List<double>();
        row.Add(NextValue(tokenizer));
        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
          Expect(tokenizer.SeparatorToken);
          row.Add(NextValue(tokenizer));
        }
        Expect(tokenizer.NewlineToken);

        // all rows have to have the same number of values
        // the first row defines how many values are needed
        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
                "\nLine " + lineNumber + " has " + row.Count + " columns.", "", lineNumber);
        }
        rowValues.Add(row);
      }
    }

    /// <summary>
    /// Reads the value at the current position. A missing value (the next token is a
    /// separator or newline, which is left unconsumed) and a non-numeric value both
    /// yield <see cref="double.NaN"/>.
    /// </summary>
    private double NextValue(Tokenizer tokenizer) {
      Token upcoming = tokenizer.Peek();
      if (upcoming == tokenizer.SeparatorToken || upcoming == tokenizer.NewlineToken) return double.NaN;

      Token current = tokenizer.Next();
      if (current.type == TokenTypeEnum.Double) {
        return current.doubleValue;
      }
      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
        return double.NaN;
      }
      // found an unexpected token => throw error
      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
      // this line is never executed because Error() throws an exception
      throw new InvalidOperationException();
    }

    /// <summary>
    /// Reads the header line if there is one. The first line is assumed to contain the
    /// variable names if it does not start with a numeric value.
    /// </summary>
    private void ParseVariableNames() {
      if (!tokenizer.HasNext() || tokenizer.Peek().type == TokenTypeEnum.Double) return;

      List<string> names = new List<string>();
      Token valueToken = tokenizer.Next();
      names.Add(valueToken.stringValue.Trim());
      // Stop as soon as the newline of the header line has been consumed; otherwise a
      // separator at the start of the first data line would be taken for a continuation
      // of the header.
      while (valueToken != tokenizer.NewlineToken
             && tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
        Expect(tokenizer.SeparatorToken);
        valueToken = tokenizer.Next();
        if (valueToken != tokenizer.NewlineToken) {
          names.Add(valueToken.stringValue.Trim());
        }
      }
      if (valueToken != tokenizer.NewlineToken) {
        Expect(tokenizer.NewlineToken);
      }
      variableNames = names;
    }

    /// <summary>
    /// Consumes the next token and raises a parse error if it is not the expected one.
    /// </summary>
    private void Expect(Token expectedToken) {
      Token actualToken = tokenizer.Next();
      if (actualToken != expectedToken) {
        Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
      }
    }

    /// <summary>Throws a <see cref="DataFormatException"/> describing a parse error.</summary>
    private void Error(string message, string token, int lineNumber) {
      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    }
    #endregion
  }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences