#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
|
---|
| 21 |
|
---|
| 22 | using System;
|
---|
| 23 | using System.Collections.Generic;
|
---|
| 24 | using System.Globalization;
|
---|
| 25 | using System.IO;
|
---|
| 26 | using HeuristicLab.Data;
|
---|
| 27 |
|
---|
| 28 | namespace HeuristicLab.DataAnalysis {
|
---|
| 29 | public class DatasetParser {
|
---|
[273] | 30 | private const string PROBLEMNAME = "PROBLEMNAME";
|
---|
| 31 | private const string VARIABLENAMES = "VARIABLENAMES";
|
---|
| 32 | private const string TARGETVARIABLE = "TARGETVARIABLE";
|
---|
| 33 | private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
|
---|
| 34 | private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
|
---|
| 35 | private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
|
---|
| 36 | private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
|
---|
[363] | 37 | private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
|
---|
| 38 | private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
|
---|
[2] | 39 | private Tokenizer tokenizer;
|
---|
| 40 | private Dictionary<string, List<Token>> metadata;
|
---|
| 41 | private List<List<double>> samplesList;
|
---|
| 42 |
|
---|
| 43 | private int rows;
|
---|
| 44 | public int Rows {
|
---|
| 45 | get { return rows; }
|
---|
| 46 | set { rows = value; }
|
---|
| 47 | }
|
---|
| 48 |
|
---|
| 49 | private int columns;
|
---|
| 50 | public int Columns {
|
---|
| 51 | get { return columns; }
|
---|
| 52 | set { columns = value; }
|
---|
| 53 | }
|
---|
| 54 |
|
---|
| 55 | private double[] samples;
|
---|
| 56 | public double[] Samples {
|
---|
| 57 | get {
|
---|
| 58 | return samples;
|
---|
| 59 | }
|
---|
| 60 | }
|
---|
| 61 |
|
---|
| 62 | public string ProblemName {
|
---|
| 63 | get {
|
---|
[273] | 64 | if(metadata.ContainsKey(PROBLEMNAME)) {
|
---|
| 65 | return metadata[PROBLEMNAME][0].stringValue;
|
---|
| 66 | } else return "-";
|
---|
[2] | 67 | }
|
---|
| 68 | }
|
---|
| 69 |
|
---|
| 70 | public string[] VariableNames {
|
---|
| 71 | get {
|
---|
[273] | 72 | if(metadata.ContainsKey(VARIABLENAMES)) {
|
---|
| 73 | List<Token> nameList = metadata[VARIABLENAMES];
|
---|
| 74 | string[] names = new string[nameList.Count];
|
---|
| 75 | for(int i = 0; i < names.Length; i++) {
|
---|
| 76 | names[i] = nameList[i].stringValue;
|
---|
| 77 | }
|
---|
| 78 | return names;
|
---|
| 79 | } else {
|
---|
| 80 | string[] names = new string[columns];
|
---|
| 81 | for(int i = 0; i < names.Length; i++) {
|
---|
| 82 | names[i] = "X" + i.ToString("000");
|
---|
| 83 | }
|
---|
| 84 | return names;
|
---|
[2] | 85 | }
|
---|
| 86 | }
|
---|
| 87 | }
|
---|
| 88 |
|
---|
| 89 | public int TargetVariable {
|
---|
| 90 | get {
|
---|
[273] | 91 | if(metadata.ContainsKey(TARGETVARIABLE)) {
|
---|
| 92 | return metadata[TARGETVARIABLE][0].intValue;
|
---|
| 93 | } else return 0; // default is the first column
|
---|
[2] | 94 | }
|
---|
| 95 | }
|
---|
| 96 |
|
---|
| 97 | public int MaxTreeHeight {
|
---|
| 98 | get {
|
---|
[273] | 99 | if(metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {
|
---|
| 100 | return metadata[MAXIMUMTREEHEIGHT][0].intValue;
|
---|
| 101 | } else return 0;
|
---|
[2] | 102 | }
|
---|
| 103 | }
|
---|
| 104 |
|
---|
| 105 | public int MaxTreeSize {
|
---|
| 106 | get {
|
---|
[273] | 107 | if(metadata.ContainsKey(MAXIMUMTREESIZE)) {
|
---|
| 108 | return metadata[MAXIMUMTREESIZE][0].intValue;
|
---|
| 109 | } else return 0;
|
---|
[2] | 110 | }
|
---|
| 111 | }
|
---|
| 112 |
|
---|
| 113 | public int TrainingSamplesStart {
|
---|
| 114 | get {
|
---|
[273] | 115 | if(metadata.ContainsKey(TRAININGSAMPLESSTART)) {
|
---|
| 116 | return metadata[TRAININGSAMPLESSTART][0].intValue;
|
---|
| 117 | } else return 0;
|
---|
[2] | 118 | }
|
---|
| 119 | }
|
---|
| 120 |
|
---|
| 121 | public int TrainingSamplesEnd {
|
---|
| 122 | get {
|
---|
[273] | 123 | if(metadata.ContainsKey(TRAININGSAMPLESEND)) {
|
---|
| 124 | return metadata[TRAININGSAMPLESEND][0].intValue;
|
---|
| 125 | } else return rows;
|
---|
[2] | 126 | }
|
---|
| 127 | }
|
---|
[363] | 128 | public int ValidationSamplesStart {
|
---|
| 129 | get {
|
---|
| 130 | if(metadata.ContainsKey(VALIDATIONSAMPLESSTART)) {
|
---|
| 131 | return metadata[VALIDATIONSAMPLESSTART][0].intValue;
|
---|
| 132 | } else return 0;
|
---|
| 133 | }
|
---|
| 134 | }
|
---|
[2] | 135 |
|
---|
[363] | 136 | public int ValidationSamplesEnd {
|
---|
| 137 | get {
|
---|
| 138 | if(metadata.ContainsKey(VALIDATIONSAMPLESEND)) {
|
---|
| 139 | return metadata[VALIDATIONSAMPLESEND][0].intValue;
|
---|
| 140 | } else return rows;
|
---|
| 141 | }
|
---|
| 142 | }
|
---|
| 143 |
|
---|
[2] | 144 | public DatasetParser() {
|
---|
| 145 | this.metadata = new Dictionary<string, List<Token>>();
|
---|
| 146 | samplesList = new List<List<double>>();
|
---|
| 147 | }
|
---|
| 148 |
|
---|
| 149 | public void Import(string importFileName, bool strict) {
|
---|
| 150 | StreamReader reader = new StreamReader(importFileName);
|
---|
| 151 | this.tokenizer = new Tokenizer(reader);
|
---|
| 152 | tokenizer.Separators = new string[] { " ", ";", "\t" };
|
---|
| 153 |
|
---|
[272] | 154 | try {
|
---|
| 155 | // parse the file
|
---|
| 156 | Parse(strict);
|
---|
| 157 | } finally {
|
---|
| 158 | reader.Close();
|
---|
| 159 | }
|
---|
[2] | 160 |
|
---|
| 161 | // translate the list of samples into a DoubleMatrixData item
|
---|
| 162 | samples = new double[samplesList.Count * samplesList[0].Count];
|
---|
| 163 | rows = samplesList.Count;
|
---|
| 164 | columns = samplesList[0].Count;
|
---|
| 165 |
|
---|
| 166 | int i = 0;
|
---|
| 167 | int j = 0;
|
---|
[272] | 168 | foreach(List<double> row in samplesList) {
|
---|
[2] | 169 | j = 0;
|
---|
[272] | 170 | foreach(double element in row) {
|
---|
[2] | 171 | samples[i * columns + j] = element;
|
---|
| 172 | j++;
|
---|
| 173 | }
|
---|
| 174 | i++;
|
---|
| 175 | }
|
---|
| 176 | }
|
---|
| 177 |
|
---|
| 178 | #region tokenizer
|
---|
| 179 | internal enum TokenTypeEnum {
|
---|
| 180 | At, Assign, NewLine, String, Double, Int
|
---|
| 181 | }
|
---|
| 182 |
|
---|
| 183 | internal class Token {
|
---|
| 184 | public TokenTypeEnum type;
|
---|
| 185 | public string stringValue;
|
---|
| 186 | public double doubleValue;
|
---|
| 187 | public int intValue;
|
---|
| 188 |
|
---|
| 189 | public Token(TokenTypeEnum type, string value) {
|
---|
| 190 | this.type = type;
|
---|
| 191 | stringValue = value;
|
---|
| 192 | doubleValue = 0.0;
|
---|
| 193 | intValue = 0;
|
---|
| 194 | }
|
---|
| 195 |
|
---|
| 196 | public override string ToString() {
|
---|
| 197 | return stringValue;
|
---|
| 198 | }
|
---|
| 199 | }
|
---|
| 200 |
|
---|
| 201 |
|
---|
| 202 | class Tokenizer {
|
---|
| 203 | private StreamReader reader;
|
---|
| 204 | private List<Token> tokens;
|
---|
| 205 | private string[] separators;
|
---|
| 206 |
|
---|
| 207 | public int CurrentLineNumber = 0;
|
---|
| 208 | public string CurrentLine;
|
---|
| 209 |
|
---|
| 210 | public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
|
---|
| 211 | public static Token AtToken = new Token(TokenTypeEnum.At, "@");
|
---|
| 212 | public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
|
---|
| 213 |
|
---|
| 214 | public string[] Separators {
|
---|
| 215 | get { return separators; }
|
---|
| 216 | set { separators = value; }
|
---|
| 217 | }
|
---|
| 218 |
|
---|
| 219 |
|
---|
| 220 | public Tokenizer(StreamReader reader) {
|
---|
| 221 | this.reader = reader;
|
---|
| 222 | tokens = new List<Token>();
|
---|
| 223 | ReadNextTokens();
|
---|
| 224 | }
|
---|
| 225 |
|
---|
| 226 | private void ReadNextTokens() {
|
---|
[272] | 227 | if(!reader.EndOfStream) {
|
---|
[2] | 228 | CurrentLine = reader.ReadLine();
|
---|
| 229 | Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
|
---|
| 230 | return MakeToken(str);
|
---|
| 231 | });
|
---|
| 232 |
|
---|
| 233 | tokens.AddRange(newTokens);
|
---|
| 234 | tokens.Add(NewlineToken);
|
---|
| 235 | CurrentLineNumber++;
|
---|
| 236 | }
|
---|
| 237 | }
|
---|
| 238 |
|
---|
| 239 | private Token MakeToken(string strToken) {
|
---|
[272] | 240 | if(strToken == "@")
|
---|
[2] | 241 | return AtToken;
|
---|
[272] | 242 | else if(strToken == "=")
|
---|
[2] | 243 | return AssignmentToken;
|
---|
| 244 | else {
|
---|
| 245 | Token token = new Token(TokenTypeEnum.String, strToken);
|
---|
| 246 |
|
---|
| 247 | // try invariant culture
|
---|
| 248 | NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat;
|
---|
[272] | 249 | if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
|
---|
[2] | 250 | token.type = TokenTypeEnum.Int;
|
---|
| 251 | return token;
|
---|
[272] | 252 | } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
|
---|
[2] | 253 | token.type = TokenTypeEnum.Double;
|
---|
| 254 | return token;
|
---|
| 255 | }
|
---|
| 256 | // try german culture
|
---|
| 257 | currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat;
|
---|
[272] | 258 | if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
|
---|
[2] | 259 | token.type = TokenTypeEnum.Int;
|
---|
| 260 | return token;
|
---|
[272] | 261 | } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
|
---|
[2] | 262 | token.type = TokenTypeEnum.Double;
|
---|
| 263 | return token;
|
---|
| 264 | }
|
---|
| 265 |
|
---|
| 266 | // try current culture
|
---|
| 267 | currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat;
|
---|
[272] | 268 | if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
|
---|
[2] | 269 | token.type = TokenTypeEnum.Int;
|
---|
| 270 | return token;
|
---|
[272] | 271 | } else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
|
---|
[2] | 272 | token.type = TokenTypeEnum.Double;
|
---|
| 273 | return token;
|
---|
| 274 | }
|
---|
| 275 |
|
---|
| 276 | // nothing worked
|
---|
| 277 | return token;
|
---|
| 278 | }
|
---|
| 279 | }
|
---|
| 280 |
|
---|
| 281 | public Token Peek() {
|
---|
| 282 | return tokens[0];
|
---|
| 283 | }
|
---|
| 284 |
|
---|
| 285 | public Token Next() {
|
---|
| 286 | Token next = tokens[0];
|
---|
| 287 | tokens.RemoveAt(0);
|
---|
[272] | 288 | if(tokens.Count == 0) {
|
---|
[2] | 289 | ReadNextTokens();
|
---|
| 290 | }
|
---|
| 291 | return next;
|
---|
| 292 | }
|
---|
| 293 |
|
---|
| 294 | public bool HasNext() {
|
---|
| 295 | return tokens.Count > 0 || !reader.EndOfStream;
|
---|
| 296 | }
|
---|
| 297 | }
|
---|
| 298 | #endregion
|
---|
| 299 |
|
---|
| 300 | #region parsing
|
---|
| 301 | private void Parse(bool strict) {
|
---|
| 302 | ParseMetaData(strict);
|
---|
| 303 | ParseSampleData(strict);
|
---|
| 304 | }
|
---|
| 305 |
|
---|
| 306 | private void ParseSampleData(bool strict) {
|
---|
| 307 | List<double> row = new List<double>();
|
---|
[272] | 308 | while(tokenizer.HasNext()) {
|
---|
[2] | 309 | Token current = tokenizer.Next();
|
---|
[272] | 310 | if(current.type == TokenTypeEnum.Double) {
|
---|
[2] | 311 | // just take the value
|
---|
| 312 | row.Add(current.doubleValue);
|
---|
[272] | 313 | } else if(current.type == TokenTypeEnum.Int) {
|
---|
[2] | 314 | // translate the int value to double
|
---|
| 315 | row.Add((double)current.intValue);
|
---|
[272] | 316 | } else if(current == Tokenizer.NewlineToken) {
|
---|
[2] | 317 | // when parsing strictly all rows have to have the same number of values
|
---|
[272] | 318 | if(strict) {
|
---|
[2] | 319 | // the first row defines how many samples are needed
|
---|
[272] | 320 | if(samplesList.Count > 0 && samplesList[0].Count != row.Count) {
|
---|
[2] | 321 | Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
|
---|
[273] | 322 | "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
|
---|
[2] | 323 | }
|
---|
[272] | 324 | } else if(samplesList.Count > 0) {
|
---|
[2] | 325 | // when we are not strict then fill or drop elements as needed
|
---|
[272] | 326 | if(samplesList[0].Count > row.Count) {
|
---|
[2] | 327 | // fill with NAN
|
---|
[272] | 328 | for(int i = row.Count; i < samplesList[0].Count; i++) {
|
---|
[2] | 329 | row.Add(double.NaN);
|
---|
| 330 | }
|
---|
[272] | 331 | } else if(samplesList[0].Count < row.Count) {
|
---|
[2] | 332 | // drop last k elements where k = n - length of first row
|
---|
| 333 | row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count);
|
---|
| 334 | }
|
---|
| 335 | }
|
---|
| 336 |
|
---|
| 337 | // add the current row to the collection of rows and start a new row
|
---|
| 338 | samplesList.Add(row);
|
---|
| 339 | row = new List<double>();
|
---|
| 340 | } else {
|
---|
| 341 | // found an unexpected token => return false when parsing strictly
|
---|
| 342 | // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
|
---|
[272] | 343 | if(strict) {
|
---|
[273] | 344 | Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
|
---|
[2] | 345 | } else {
|
---|
| 346 | row.Add(double.NaN);
|
---|
| 347 | }
|
---|
| 348 | }
|
---|
| 349 | }
|
---|
| 350 | }
|
---|
| 351 |
|
---|
| 352 | private void ParseMetaData(bool strict) {
|
---|
[272] | 353 | while(tokenizer.Peek() == Tokenizer.AtToken) {
|
---|
[2] | 354 | Expect(Tokenizer.AtToken);
|
---|
| 355 |
|
---|
| 356 | Token nameToken = tokenizer.Next();
|
---|
[272] | 357 | if(nameToken.type != TokenTypeEnum.String)
|
---|
[273] | 358 | Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);
|
---|
[2] | 359 |
|
---|
| 360 | Expect(Tokenizer.AssignmentToken);
|
---|
| 361 |
|
---|
| 362 | List<Token> tokens = new List<Token>();
|
---|
| 363 | Token valueToken = tokenizer.Next();
|
---|
[272] | 364 | while(valueToken != Tokenizer.NewlineToken) {
|
---|
[2] | 365 | tokens.Add(valueToken);
|
---|
| 366 | valueToken = tokenizer.Next();
|
---|
| 367 | }
|
---|
| 368 |
|
---|
| 369 | metadata[nameToken.stringValue] = tokens;
|
---|
| 370 | }
|
---|
| 371 | }
|
---|
| 372 |
|
---|
| 373 | private void Expect(Token expectedToken) {
|
---|
| 374 | Token actualToken = tokenizer.Next();
|
---|
[272] | 375 | if(actualToken != expectedToken) {
|
---|
[273] | 376 | Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
|
---|
[2] | 377 | }
|
---|
| 378 | }
|
---|
| 379 |
|
---|
[273] | 380 | private void Error(string message, string token, int lineNumber) {
|
---|
| 381 | throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
|
---|
[2] | 382 | }
|
---|
| 383 | #endregion
|
---|
| 384 | }
|
---|
| 385 | }
|
---|