Context Navigation

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 73

Visit:

Last change on this file since 73 was 2, checked in by swagner, 17 years ago
Added HeuristicLab 3.0 sources from former SVN repository at revision 52
File size: 10.7 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
	3	* Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections.Generic;
	24	using System.Globalization;
	25	using System.IO;
	26	using HeuristicLab.Data;
	27
	28	namespace HeuristicLab.DataAnalysis {
	29	public class DatasetParser {
	// Row-wise buffer of the parsed sample values; one inner list per data line.
	private List<List<double>> samplesList;
	// Metadata entries of the header lines ("@ NAME = values ..."), keyed by entry name.
	private Dictionary<string, List<Token>> metadata;
	// Tokenizer over the file that is currently being imported; assigned in Import().
	private Tokenizer tokenizer;
	33
	// Backing field of Rows; Import() sets it to the number of parsed sample rows.
	private int rows;

	/// <summary>
	/// Number of rows of the imported sample matrix.
	/// </summary>
	public int Rows {
	  get {
	    return this.rows;
	  }
	  set {
	    this.rows = value;
	  }
	}
	39
	// Backing field of Columns; Import() sets it to the length of the first sample row.
	private int columns;

	/// <summary>
	/// Number of columns of the imported sample matrix.
	/// </summary>
	public int Columns {
	  get {
	    return this.columns;
	  }
	  set {
	    this.columns = value;
	  }
	}
	45
	// Flattened sample matrix filled by Import(); element (i, j) is stored at index i * columns + j.
	private double[] samples;

	/// <summary>
	/// The imported sample values as one flat array in row-major order.
	/// The array is only assigned by Import().
	/// </summary>
	public double[] Samples {
	  get { return this.samples; }
	}
	52
	/// <summary>
	/// String value of the first token of the "PROBLEMNAME" metadata entry.
	/// The lookup fails with an exception if the entry was not parsed.
	/// </summary>
	public string ProblemName {
	  get {
	    List<Token> values = metadata["PROBLEMNAME"];
	    return values[0].stringValue;
	  }
	}
	58
	/// <summary>
	/// String values of all tokens of the "VARIABLENAMES" metadata entry,
	/// in the order in which they were parsed. A new array is built on every access.
	/// </summary>
	public string[] VariableNames {
	  get {
	    List<Token> nameTokens = metadata["VARIABLENAMES"];
	    string[] result = new string[nameTokens.Count];
	    int index = 0;
	    foreach (Token nameToken in nameTokens) {
	      result[index] = nameToken.stringValue;
	      index++;
	    }
	    return result;
	  }
	}
	70
	/// <summary>
	/// Integer value of the first token of the "TARGETVARIABLE" metadata entry.
	/// </summary>
	public int TargetVariable {
	  get {
	    List<Token> values = metadata["TARGETVARIABLE"];
	    return values[0].intValue;
	  }
	}
	76
	/// <summary>
	/// Integer value of the first token of the "MAXIMUMTREEHEIGHT" metadata entry.
	/// </summary>
	public int MaxTreeHeight {
	  get {
	    List<Token> values = metadata["MAXIMUMTREEHEIGHT"];
	    return values[0].intValue;
	  }
	}
	82
	/// <summary>
	/// Integer value of the first token of the "MAXIMUMTREESIZE" metadata entry.
	/// </summary>
	public int MaxTreeSize {
	  get {
	    List<Token> values = metadata["MAXIMUMTREESIZE"];
	    return values[0].intValue;
	  }
	}
	88
	/// <summary>
	/// Integer value of the first token of the "TRAININGSAMPLESSTART" metadata entry.
	/// </summary>
	public int TrainingSamplesStart {
	  get {
	    List<Token> values = metadata["TRAININGSAMPLESSTART"];
	    return values[0].intValue;
	  }
	}
	94
	/// <summary>
	/// Integer value of the first token of the "TRAININGSAMPLESEND" metadata entry.
	/// </summary>
	public int TrainingSamplesEnd {
	  get {
	    List<Token> values = metadata["TRAININGSAMPLESEND"];
	    return values[0].intValue;
	  }
	}
	100
	/// <summary>
	/// Creates a parser with an empty metadata table and an empty list of sample rows.
	/// </summary>
	public DatasetParser() {
	  samplesList = new List<List<double>>();
	  metadata = new Dictionary<string, List<Token>>();
	}
	105
	/// <summary>
	/// Reads the given file, parses its metadata header and its sample rows and
	/// flattens the samples into the row-major <see cref="Samples"/> array.
	/// <see cref="Rows"/> and <see cref="Columns"/> are updated accordingly.
	/// </summary>
	/// <param name="importFileName">Path of the file to read.</param>
	/// <param name="strict">Passed on to the parsing methods, which use it to decide
	/// whether malformed sample data raises an error or is repaired.</param>
	public void Import(string importFileName, bool strict) {
	  // the using block closes the file even when parsing throws
	  using (StreamReader reader = new StreamReader(importFileName)) {
	    this.tokenizer = new Tokenizer(reader);
	    tokenizer.Separators = new string[] { " ", ";", "\t" };

	    // parse the file
	    Parse(strict);
	  }

	  // translate the list of samples into one flat row-major array;
	  // a file without any sample rows yields an empty 0 x 0 matrix instead of
	  // failing on samplesList[0]
	  rows = samplesList.Count;
	  columns = rows > 0 ? samplesList[0].Count : 0;
	  samples = new double[rows * columns];

	  int offset = 0;
	  foreach (List<double> row in samplesList) {
	    int j = 0;
	    foreach (double element in row) {
	      samples[offset + j] = element;
	      j++;
	    }
	    // advance to the start of the next row
	    offset += columns;
	  }
	}
	130
	131	#region tokenizer
	/// <summary>
	/// Kinds of tokens produced by the tokenizer.
	/// </summary>
	internal enum TokenTypeEnum {
	  At,
	  Assign,
	  NewLine,
	  String,
	  Double,
	  Int
	}
	135
	/// <summary>
	/// A single token: its kind, its original text and the numeric value slots
	/// that the tokenizer fills in for numeric tokens.
	/// </summary>
	internal class Token {
	  public TokenTypeEnum type;
	  public string stringValue;
	  public double doubleValue;
	  public int intValue;

	  /// <summary>
	  /// Creates a token of the given kind with the given text; both numeric values start at zero.
	  /// </summary>
	  public Token(TokenTypeEnum type, string value) {
	    this.stringValue = value;
	    this.type = type;
	    this.intValue = 0;
	    this.doubleValue = 0.0;
	  }

	  /// <summary>
	  /// Returns the text of the token.
	  /// </summary>
	  public override string ToString() {
	    return this.stringValue;
	  }
	}
	153
	154
	/// <summary>
	/// Splits the lines of a reader into tokens. Every line that is read contributes its
	/// tokens followed by <see cref="NewlineToken"/>.
	/// Lines are read lazily, i.e. only when a token is requested and the buffer is empty.
	/// This way the separators assigned after construction are already in effect for the
	/// first line, and CurrentLine / CurrentLineNumber describe the line of the token
	/// that was returned last instead of the line after it.
	/// </summary>
	class Tokenizer {
	  private StreamReader reader;
	  // tokens of the current line that have not been consumed yet
	  private List<Token> tokens;
	  private string[] separators;

	  // number and text of the line that was read most recently (0 / null before the first read)
	  public int CurrentLineNumber = 0;
	  public string CurrentLine;

	  // shared sentinel instances; the parser compares tokens against them by reference
	  public static readonly Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
	  public static readonly Token AtToken = new Token(TokenTypeEnum.At, "@");
	  public static readonly Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");

	  /// <summary>
	  /// Strings at which a line is split into tokens.
	  /// </summary>
	  public string[] Separators {
	    get { return separators; }
	    set { separators = value; }
	  }

	  /// <summary>
	  /// Creates a tokenizer over the given reader. Nothing is read yet.
	  /// </summary>
	  public Tokenizer(StreamReader reader) {
	    this.reader = reader;
	    tokens = new List<Token>();
	  }

	  // Refills the token buffer from the next line when it has run empty.
	  private void EnsureTokens() {
	    if (tokens.Count == 0) {
	      ReadNextTokens();
	    }
	  }

	  // Reads one line (if there is one left), converts its parts to tokens and
	  // appends them plus the newline token to the buffer.
	  private void ReadNextTokens() {
	    if (!reader.EndOfStream) {
	      CurrentLine = reader.ReadLine();
	      string[] parts = CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries);
	      foreach (string part in parts) {
	        tokens.Add(MakeToken(part));
	      }
	      tokens.Add(NewlineToken);
	      CurrentLineNumber++;
	    }
	  }

	  // Maps "@" and "=" to the shared sentinel tokens; everything else becomes a new token
	  // that is typed Int or Double if it can be parsed as a number and String otherwise.
	  private Token MakeToken(string strToken) {
	    if (strToken == "@") {
	      return AtToken;
	    }
	    if (strToken == "=") {
	      return AssignmentToken;
	    }

	    Token token = new Token(TokenTypeEnum.String, strToken);
	    // try the invariant culture first, then the german culture and finally the current
	    // culture; short-circuit evaluation stops at the first format that works
	    if (!TryParseNumber(token, CultureInfo.InvariantCulture.NumberFormat) &&
	        !TryParseNumber(token, CultureInfo.GetCultureInfo("de-DE").NumberFormat)) {
	      TryParseNumber(token, CultureInfo.CurrentCulture.NumberFormat);
	    }
	    // if nothing worked the token is still a String token
	    return token;
	  }

	  // Tries to read the text of the token as an integer and then as a floating point number
	  // using the given format; on success the token type and the matching value are set.
	  private static bool TryParseNumber(Token token, NumberFormatInfo format) {
	    if (int.TryParse(token.stringValue, NumberStyles.Integer, format, out token.intValue)) {
	      token.type = TokenTypeEnum.Int;
	      return true;
	    }
	    if (double.TryParse(token.stringValue, NumberStyles.Float, format, out token.doubleValue)) {
	      token.type = TokenTypeEnum.Double;
	      return true;
	    }
	    return false;
	  }

	  /// <summary>
	  /// Returns the next token without consuming it.
	  /// Fails with an index exception when no token is left; check HasNext() first.
	  /// </summary>
	  public Token Peek() {
	    EnsureTokens();
	    return tokens[0];
	  }

	  /// <summary>
	  /// Returns the next token and removes it from the buffer.
	  /// Fails with an index exception when no token is left; check HasNext() first.
	  /// </summary>
	  public Token Next() {
	    EnsureTokens();
	    Token next = tokens[0];
	    tokens.RemoveAt(0);
	    return next;
	  }

	  /// <summary>
	  /// True while there are buffered tokens or the reader has more input.
	  /// </summary>
	  public bool HasNext() {
	    return tokens.Count > 0 || !reader.EndOfStream;
	  }
	}
	251	#endregion
	252
	253	#region parsing
	// Parses the whole input: the metadata header first, then the sample rows.
	private void Parse(bool strict) {
	  this.ParseMetaData(strict);
	  this.ParseSampleData(strict);
	}
	258
	/// <summary>
	/// Consumes all remaining tokens and collects them row by row in samplesList.
	/// The first row defines the expected number of columns.
	/// </summary>
	/// <param name="strict">When true, a row with a different number of values or a
	/// non-numeric value raises an error. When false, short rows are padded with NaN,
	/// long rows are truncated and non-numeric values are replaced by NaN.</param>
	private void ParseSampleData(bool strict) {
	  List<double> row = new List<double>();
	  while (tokenizer.HasNext()) {
	    Token current = tokenizer.Next();
	    if (current.type == TokenTypeEnum.Double) {
	      // just take the value
	      row.Add(current.doubleValue);
	    } else if (current.type == TokenTypeEnum.Int) {
	      // translate the int value to double
	      row.Add((double)current.intValue);
	    } else if (current == Tokenizer.NewlineToken) {
	      // the first row defines how many values every following row must have
	      if (samplesList.Count > 0) {
	        int expectedColumns = samplesList[0].Count;
	        if (strict) {
	          // when parsing strictly all rows have to have the same number of values
	          if (expectedColumns != row.Count) {
	            Error("The first row of the dataset has " + expectedColumns + " columns." +
	              "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.");
	          }
	        } else if (expectedColumns > row.Count) {
	          // when we are not strict fill short rows with NAN
	          for (int i = row.Count; i < expectedColumns; i++) {
	            row.Add(double.NaN);
	          }
	        } else if (expectedColumns < row.Count) {
	          // drop the surplus elements at the end of the row: removal has to start at
	          // index expectedColumns so that the first expectedColumns values are kept
	          row.RemoveRange(expectedColumns, row.Count - expectedColumns);
	        }
	      }

	      // add the current row to the collection of rows and start a new row
	      samplesList.Add(row);
	      row = new List<double>();
	    } else {
	      // found an unexpected token => error when parsing strictly
	      // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
	      if (strict) {
	        Error("Unkown value " + current + " in line " + tokenizer.CurrentLineNumber +
	          "\n" + tokenizer.CurrentLine);
	      } else {
	        row.Add(double.NaN);
	      }
	    }
	  }
	}
	305
	/// <summary>
	/// Parses the metadata header: consecutive lines of the form "@ NAME = value ...".
	/// All value tokens up to the end of the line are stored in the metadata table under NAME;
	/// a repeated name replaces the earlier entry. Parsing stops at the first line that does
	/// not start with "@" or at the end of the input.
	/// </summary>
	/// <param name="strict">Not used by this method.</param>
	private void ParseMetaData(bool strict) {
	  // check HasNext() before peeking: an empty file or a file that ends right after
	  // its header has no token left to look at
	  while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
	    Expect(Tokenizer.AtToken);

	    Token nameToken = tokenizer.Next();
	    if (nameToken.type != TokenTypeEnum.String) {
	      throw new Exception("Expected a variable name; got " + nameToken +
	        "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
	    }

	    Expect(Tokenizer.AssignmentToken);

	    // collect everything up to the end of the line; the tokenizer terminates
	    // every line with the newline token
	    List<Token> tokens = new List<Token>();
	    Token valueToken = tokenizer.Next();
	    while (valueToken != Tokenizer.NewlineToken) {
	      tokens.Add(valueToken);
	      valueToken = tokenizer.Next();
	    }

	    metadata[nameToken.stringValue] = tokens;
	  }
	}
	327
	// Consumes the next token and raises a parse error if it is not the given token instance.
	private void Expect(Token expectedToken) {
	  Token actualToken = tokenizer.Next();
	  if (actualToken == expectedToken) {
	    return;
	  }

	  Error("Expected: " + expectedToken + " got: " + actualToken +
	    "\nLine " + tokenizer.CurrentLineNumber + ": " + tokenizer.CurrentLine);
	}
	335
	// Aborts parsing by throwing an exception whose text is the given message
	// prefixed with a general parse error line.
	private void Error(string message) {
	  string text = "Error while parsing.\n" + message;
	  throw new Exception(text);
	}
	339	#endregion
	340	}
	341	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences