Context Navigation

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 219

Visit:

Last change on this file since 219 was 173, checked in by gkronber, 17 years ago
Fixed a bug: non-matching string constant.
File size: 10.9 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
	3	* Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections.Generic;
	24	using System.Globalization;
	25	using System.IO;
	26	using HeuristicLab.Data;
	27
	28	namespace HeuristicLab.DataAnalysis {
	/// <summary>
	/// Parses a text dataset file that consists of optional metadata lines of the form
	/// "@ NAME = value ..." followed by rows of numeric samples. Values on a line are
	/// separated by a space, a semicolon or a tab.
	/// </summary>
	public class DatasetParser {
		// keys under which the known metadata entries are stored
		private const string ProblemNameKey = "PROBLEMNAME";
		private const string VariableNamesKey = "VARIABLENAMES";
		private const string TargetVariableKey = "TARGETVARIABLE";
		private const string MaxTreeHeightKey = "MAXIMUMTREEHEIGHT";
		private const string MaxTreeSizeKey = "MAXIMUMTREESIZE";
		private const string TrainingSamplesStartKey = "TRAININGSAMPLESSTART";
		private const string TrainingSamplesEndKey = "TRAININGSAMPLESEND";

		private Tokenizer tokenizer;
		private Dictionary<string, List<Token>> metadata;
		private List<List<double>> samplesList;

		private int rows;
		/// <summary>
		/// Number of sample rows found by the last import. The setter only overwrites the
		/// stored number; it does not touch <see cref="Samples"/>.
		/// </summary>
		public int Rows {
			get { return rows; }
			set { rows = value; }
		}

		private int columns;
		/// <summary>
		/// Number of values per row (the length of the first sample row of the last import).
		/// The setter only overwrites the stored number; it does not touch <see cref="Samples"/>.
		/// </summary>
		public int Columns {
			get { return columns; }
			set { columns = value; }
		}

		private double[] samples;
		/// <summary>
		/// All sample values in row-major order: the value of row r and column c is stored at
		/// index r * Columns + c. Null until <see cref="Import"/> has completed once.
		/// </summary>
		public double[] Samples {
			get { return samples; }
		}

		/// <summary>
		/// Text of the first value of the PROBLEMNAME metadata entry.
		/// Throws a KeyNotFoundException when the entry is missing.
		/// </summary>
		public string ProblemName {
			get { return metadata[ProblemNameKey][0].stringValue; }
		}

		/// <summary>
		/// Texts of all values of the VARIABLENAMES metadata entry, in file order.
		/// Throws a KeyNotFoundException when the entry is missing.
		/// </summary>
		public string[] VariableNames {
			get {
				List<Token> nameTokens = metadata[VariableNamesKey];
				string[] names = new string[nameTokens.Count];
				for (int i = 0; i < names.Length; i++) {
					names[i] = nameTokens[i].stringValue;
				}
				return names;
			}
		}

		/// <summary>
		/// Integer value of the TARGETVARIABLE metadata entry.
		/// Throws a KeyNotFoundException when the entry is missing.
		/// </summary>
		public int TargetVariable {
			get { return GetIntMetadata(TargetVariableKey); }
		}

		/// <summary>
		/// Integer value of the MAXIMUMTREEHEIGHT metadata entry.
		/// Throws a KeyNotFoundException when the entry is missing.
		/// </summary>
		public int MaxTreeHeight {
			get { return GetIntMetadata(MaxTreeHeightKey); }
		}

		/// <summary>
		/// Integer value of the MAXIMUMTREESIZE metadata entry.
		/// Throws a KeyNotFoundException when the entry is missing.
		/// </summary>
		public int MaxTreeSize {
			get { return GetIntMetadata(MaxTreeSizeKey); }
		}

		/// <summary>
		/// Integer value of the TRAININGSAMPLESSTART metadata entry, or 0 when the entry is missing.
		/// </summary>
		public int TrainingSamplesStart {
			get {
				List<Token> values;
				if (!metadata.TryGetValue(TrainingSamplesStartKey, out values)) return 0;
				return values[0].intValue;
			}
		}

		/// <summary>
		/// Integer value of the TRAININGSAMPLESEND metadata entry, or the number of rows when
		/// the entry is missing.
		/// </summary>
		public int TrainingSamplesEnd {
			get {
				List<Token> values;
				if (!metadata.TryGetValue(TrainingSamplesEndKey, out values)) return rows;
				return values[0].intValue;
			}
		}

		/// <summary>
		/// Creates a parser with empty metadata and no samples.
		/// </summary>
		public DatasetParser() {
			this.metadata = new Dictionary<string, List<Token>>();
			samplesList = new List<List<double>>();
		}

		/// <summary>
		/// Reads the given file and makes its content available through the properties of this parser.
		/// </summary>
		/// <param name="importFileName">Path of the file to read.</param>
		/// <param name="strict">
		/// When true, a row whose length differs from the first row or a value that is not a number
		/// raises an exception. When false, short rows are padded with NaN, surplus trailing values
		/// are dropped and unreadable values are replaced by NaN.
		/// </param>
		public void Import(string importFileName, bool strict) {
			// forget the results of a previous import, otherwise its rows and metadata
			// would be mixed with the content of the new file
			metadata.Clear();
			samplesList.Clear();

			// the using block closes the file even when parsing fails
			using (StreamReader reader = new StreamReader(importFileName)) {
				this.tokenizer = new Tokenizer(reader);
				tokenizer.Separators = new string[] { " ", ";", "\t" };

				// parse the file
				Parse(strict);
			}

			// flatten the list of rows into one row-major array;
			// a file without any sample row yields an empty 0 x 0 dataset
			rows = samplesList.Count;
			columns = rows > 0 ? samplesList[0].Count : 0;
			samples = new double[rows * columns];

			int offset = 0;
			foreach (List<double> row in samplesList) {
				row.CopyTo(samples, offset);
				offset += columns;
			}
		}

		// Returns the integer value of the first token stored under the given metadata key.
		private int GetIntMetadata(string key) {
			return metadata[key][0].intValue;
		}

		#region tokenizer
		/// <summary>
		/// Kinds of tokens produced by the tokenizer.
		/// </summary>
		internal enum TokenTypeEnum {
			At, Assign, NewLine, String, Double, Int
		}

		/// <summary>
		/// A single token: its kind, its original text and, for numeric tokens, the parsed number.
		/// </summary>
		internal class Token {
			public TokenTypeEnum type;
			public string stringValue;
			public double doubleValue;
			public int intValue;

			/// <summary>
			/// Creates a token of the given kind; both numeric values start at zero.
			/// </summary>
			public Token(TokenTypeEnum type, string value) {
				this.type = type;
				stringValue = value;
				doubleValue = 0.0;
				intValue = 0;
			}

			/// <summary>
			/// Returns the original text of the token.
			/// </summary>
			public override string ToString() {
				return stringValue;
			}
		}


		/// <summary>
		/// Splits the lines of a reader into tokens. One line is read at a time and only when a
		/// token of that line is actually requested, so that <see cref="CurrentLine"/> and
		/// <see cref="CurrentLineNumber"/> always describe the line of the token handed out last
		/// and the separators assigned after construction already apply to the first line.
		/// </summary>
		class Tokenizer {
			private StreamReader reader;
			private Queue<Token> tokens;
			private string[] separators;

			public int CurrentLineNumber = 0;
			public string CurrentLine;

			// shared marker tokens; the parser compares against them by reference
			public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
			public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
			public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

			/// <summary>
			/// Strings that separate two values on a line.
			/// </summary>
			public string[] Separators {
				get { return separators; }
				set { separators = value; }
			}


			/// <summary>
			/// Creates a tokenizer on top of the given reader. Nothing is read yet.
			/// </summary>
			public Tokenizer(StreamReader reader) {
				this.reader = reader;
				tokens = new Queue<Token>();
			}

			// Reads and tokenizes the next line when no buffered token is left and the
			// reader still has data. Every line contributes a trailing NewlineToken.
			private void FillBuffer() {
				if (tokens.Count > 0 || reader.EndOfStream) return;

				CurrentLine = reader.ReadLine();
				string[] parts = CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);
				foreach (string part in parts) {
					tokens.Enqueue(MakeToken(part));
				}
				tokens.Enqueue(NewlineToken);
				CurrentLineNumber++;
			}

			// Turns one piece of text into a token: "@" and "=" map to the shared marker tokens,
			// text that parses as a number becomes an Int or Double token, everything else
			// stays a String token.
			private Token MakeToken(string strToken) {
				if (strToken == "@") return AtToken;
				if (strToken == "=") return AssignmentToken;

				Token token = new Token(TokenTypeEnum.String, strToken);

				// the cultures are tried one after the other and a culture is only looked up
				// when all previous ones failed: invariant, then german, then current culture
				if (TryParseNumber(token, CultureInfo.InvariantCulture.NumberFormat)) return token;
				if (TryParseNumber(token, CultureInfo.GetCultureInfo("de-DE").NumberFormat)) return token;
				TryParseNumber(token, CultureInfo.CurrentCulture.NumberFormat);

				// when nothing worked the token is still a String token
				return token;
			}

			// Tries to read the text of the token as an integer and then as a floating point
			// number using the given format. On success the value and the type of the token
			// are updated and true is returned.
			private static bool TryParseNumber(Token token, NumberFormatInfo format) {
				if (int.TryParse(token.stringValue, NumberStyles.Integer, format, out token.intValue)) {
					token.type = TokenTypeEnum.Int;
					return true;
				}
				if (double.TryParse(token.stringValue, NumberStyles.Float, format, out token.doubleValue)) {
					token.type = TokenTypeEnum.Double;
					return true;
				}
				return false;
			}

			/// <summary>
			/// Returns the next token without consuming it.
			/// Throws an InvalidOperationException when no token is left.
			/// </summary>
			public Token Peek() {
				FillBuffer();
				return tokens.Peek();
			}

			/// <summary>
			/// Returns the next token and consumes it.
			/// Throws an InvalidOperationException when no token is left.
			/// </summary>
			public Token Next() {
				FillBuffer();
				return tokens.Dequeue();
			}

			/// <summary>
			/// True when a buffered token is left or the reader has more data to offer.
			/// </summary>
			public bool HasNext() {
				return tokens.Count > 0 || !reader.EndOfStream;
			}
		}
		#endregion

		#region parsing
		// Parses the whole input: first the metadata lines, then the sample rows.
		private void Parse(bool strict) {
			ParseMetaData(strict);
			ParseSampleData(strict);
		}

		// Reads all remaining tokens as sample rows into samplesList. The first row defines
		// the number of columns that the following rows are checked or adjusted against.
		private void ParseSampleData(bool strict) {
			List<double> row = new List<double>();
			while (tokenizer.HasNext()) {
				Token current = tokenizer.Next();
				if (current.type == TokenTypeEnum.Double) {
					// just take the value
					row.Add(current.doubleValue);
				} else if (current.type == TokenTypeEnum.Int) {
					// translate the int value to double
					row.Add((double)current.intValue);
				} else if (current == Tokenizer.NewlineToken) {
					if (samplesList.Count > 0) {
						int expectedCount = samplesList[0].Count;
						if (strict) {
							// when parsing strictly all rows have to have the same number of values
							if (expectedCount != row.Count) {
								Error("The first row of the dataset has " + expectedCount + " columns." +
									"\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.");
							}
						} else if (expectedCount > row.Count) {
							// when we are not strict short rows are filled with NaN
							for (int i = row.Count; i < expectedCount; i++) {
								row.Add(double.NaN);
							}
						} else if (expectedCount < row.Count) {
							// drop the surplus values at the end of the row; removal has to start
							// right behind the last value that is kept
							row.RemoveRange(expectedCount, row.Count - expectedCount);
						}
					}

					// add the current row to the collection of rows and start a new row
					samplesList.Add(row);
					row = new List<double>();
				} else {
					// found an unexpected token => fail when parsing strictly,
					// otherwise insert NaN for the unreadable value
					if (strict) {
						Error("Unkown value " + current + " in line " + tokenizer.CurrentLineNumber +
							"\n" + tokenizer.CurrentLine);
					} else {
						row.Add(double.NaN);
					}
				}
			}
		}

		// Reads the leading lines of the form "@ NAME = value ..." and stores the value tokens
		// of each line under its name. Stops at the first line that does not start with "@"
		// or at the end of the input. The strict flag is not used here.
		private void ParseMetaData(bool strict) {
			while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
				Expect(Tokenizer.AtToken);

				Token nameToken = tokenizer.Next();
				if (nameToken.type != TokenTypeEnum.String)
					throw new Exception("Expected a variable name; got " + nameToken +
						"\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);

				Expect(Tokenizer.AssignmentToken);

				// everything up to the end of the line belongs to this entry
				List<Token> values = new List<Token>();
				Token valueToken = tokenizer.Next();
				while (valueToken != Tokenizer.NewlineToken) {
					values.Add(valueToken);
					valueToken = tokenizer.Next();
				}

				// a later entry with the same name replaces an earlier one
				metadata[nameToken.stringValue] = values;
			}
		}

		// Consumes the next token and fails when it is not the expected marker token.
		private void Expect(Token expectedToken) {
			Token actualToken = tokenizer.Next();
			if (actualToken != expectedToken) {
				Error("Expected: " + expectedToken + " got: " + actualToken +
					"\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
			}
		}

		// Aborts parsing by throwing an exception that carries the given message.
		private void Error(string message) {
			throw new Exception("Error while parsing.\n" + message);
		}
		#endregion
	}
	343	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences