Context Navigation

source: trunk/sources/HeuristicLab.DataAnalysis/DatasetParser.cs @ 308

Visit:

Last change on this file since 308 was 273, checked in by gkronber, 17 years ago
fixed #160
File size: 11.9 KB

Rev	Line
[2]	1	#region License Information
	2	/* HeuristicLab
	3	* Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
	22	using System;
	23	using System.Collections.Generic;
	24	using System.Globalization;
	25	using System.IO;
	26	using HeuristicLab.Data;
	27
	28	namespace HeuristicLab.DataAnalysis {
	29	public class DatasetParser {
[273]	30	private const string PROBLEMNAME = "PROBLEMNAME";
	31	private const string VARIABLENAMES = "VARIABLENAMES";
	32	private const string TARGETVARIABLE = "TARGETVARIABLE";
	33	private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
	34	private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
	35	private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
	36	private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
[2]	37	private Tokenizer tokenizer;
	38	private Dictionary<string, List<Token>> metadata;
	39	private List<List<double>> samplesList;
	40
	41	private int rows;
	42	public int Rows {
	43	get { return rows; }
	44	set { rows = value; }
	45	}
	46
	47	private int columns;
	48	public int Columns {
	49	get { return columns; }
	50	set { columns = value; }
	51	}
	52
	53	private double[] samples;
	54	public double[] Samples {
	55	get {
	56	return samples;
	57	}
	58	}
	59
	60	public string ProblemName {
	61	get {
[273]	62	if(metadata.ContainsKey(PROBLEMNAME)) {
	63	return metadata[PROBLEMNAME][0].stringValue;
	64	} else return "-";
[2]	65	}
	66	}
	67
	68	public string[] VariableNames {
	69	get {
[273]	70	if(metadata.ContainsKey(VARIABLENAMES)) {
	71	List<Token> nameList = metadata[VARIABLENAMES];
	72	string[] names = new string[nameList.Count];
	73	for(int i = 0; i < names.Length; i++) {
	74	names[i] = nameList[i].stringValue;
	75	}
	76	return names;
	77	} else {
	78	string[] names = new string[columns];
	79	for(int i = 0; i < names.Length; i++) {
	80	names[i] = "X" + i.ToString("000");
	81	}
	82	return names;
[2]	83	}
	84	}
	85	}
	86
	87	public int TargetVariable {
	88	get {
[273]	89	if(metadata.ContainsKey(TARGETVARIABLE)) {
	90	return metadata[TARGETVARIABLE][0].intValue;
	91	} else return 0; // default is the first column
[2]	92	}
	93	}
	94
	95	public int MaxTreeHeight {
	96	get {
[273]	97	if(metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {
	98	return metadata[MAXIMUMTREEHEIGHT][0].intValue;
	99	} else return 0;
[2]	100	}
	101	}
	102
	103	public int MaxTreeSize {
	104	get {
[273]	105	if(metadata.ContainsKey(MAXIMUMTREESIZE)) {
	106	return metadata[MAXIMUMTREESIZE][0].intValue;
	107	} else return 0;
[2]	108	}
	109	}
	110
	111	public int TrainingSamplesStart {
	112	get {
[273]	113	if(metadata.ContainsKey(TRAININGSAMPLESSTART)) {
	114	return metadata[TRAININGSAMPLESSTART][0].intValue;
	115	} else return 0;
[2]	116	}
	117	}
	118
	119	public int TrainingSamplesEnd {
	120	get {
[273]	121	if(metadata.ContainsKey(TRAININGSAMPLESEND)) {
	122	return metadata[TRAININGSAMPLESEND][0].intValue;
	123	} else return rows;
[2]	124	}
	125	}
	126
	127	public DatasetParser() {
	128	this.metadata = new Dictionary<string, List<Token>>();
	129	samplesList = new List<List<double>>();
	130	}
	131
	132	public void Import(string importFileName, bool strict) {
	133	StreamReader reader = new StreamReader(importFileName);
	134	this.tokenizer = new Tokenizer(reader);
	135	tokenizer.Separators = new string[] { " ", ";", "\t" };
	136
[272]	137	try {
	138	// parse the file
	139	Parse(strict);
	140	} finally {
	141	reader.Close();
	142	}
[2]	143
	144	// translate the list of samples into a DoubleMatrixData item
	145	samples = new double[samplesList.Count * samplesList[0].Count];
	146	rows = samplesList.Count;
	147	columns = samplesList[0].Count;
	148
	149	int i = 0;
	150	int j = 0;
[272]	151	foreach(List<double> row in samplesList) {
[2]	152	j = 0;
[272]	153	foreach(double element in row) {
[2]	154	samples[i * columns + j] = element;
	155	j++;
	156	}
	157	i++;
	158	}
	159	}
	160
	161	#region tokenizer
	162	internal enum TokenTypeEnum {
	163	At, Assign, NewLine, String, Double, Int
	164	}
	165
	166	internal class Token {
	167	public TokenTypeEnum type;
	168	public string stringValue;
	169	public double doubleValue;
	170	public int intValue;
	171
	172	public Token(TokenTypeEnum type, string value) {
	173	this.type = type;
	174	stringValue = value;
	175	doubleValue = 0.0;
	176	intValue = 0;
	177	}
	178
	179	public override string ToString() {
	180	return stringValue;
	181	}
	182	}
	183
	184
	185	class Tokenizer {
	186	private StreamReader reader;
	187	private List<Token> tokens;
	188	private string[] separators;
	189
	190	public int CurrentLineNumber = 0;
	191	public string CurrentLine;
	192
	193	public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
	194	public static Token AtToken = new Token(TokenTypeEnum.At, "@");
	195	public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
	196
	197	public string[] Separators {
	198	get { return separators; }
	199	set { separators = value; }
	200	}
	201
	202
	203	public Tokenizer(StreamReader reader) {
	204	this.reader = reader;
	205	tokens = new List<Token>();
	206	ReadNextTokens();
	207	}
	208
	209	private void ReadNextTokens() {
[272]	210	if(!reader.EndOfStream) {
[2]	211	CurrentLine = reader.ReadLine();
	212	Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.RemoveEmptyEntries), delegate(string str) {
	213	return MakeToken(str);
	214	});
	215
	216	tokens.AddRange(newTokens);
	217	tokens.Add(NewlineToken);
	218	CurrentLineNumber++;
	219	}
	220	}
	221
	222	private Token MakeToken(string strToken) {
[272]	223	if(strToken == "@")
[2]	224	return AtToken;
[272]	225	else if(strToken == "=")
[2]	226	return AssignmentToken;
	227	else {
	228	Token token = new Token(TokenTypeEnum.String, strToken);
	229
	230	// try invariant culture
	231	NumberFormatInfo currentNumberFormatInfo = CultureInfo.InvariantCulture.NumberFormat;
[272]	232	if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
[2]	233	token.type = TokenTypeEnum.Int;
	234	return token;
[272]	235	} else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
[2]	236	token.type = TokenTypeEnum.Double;
	237	return token;
	238	}
	239	// try german culture
	240	currentNumberFormatInfo = CultureInfo.GetCultureInfo("de-DE").NumberFormat;
[272]	241	if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
[2]	242	token.type = TokenTypeEnum.Int;
	243	return token;
[272]	244	} else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
[2]	245	token.type = TokenTypeEnum.Double;
	246	return token;
	247	}
	248
	249	// try current culture
	250	currentNumberFormatInfo = CultureInfo.CurrentCulture.NumberFormat;
[272]	251	if(int.TryParse(strToken, NumberStyles.Integer, currentNumberFormatInfo, out token.intValue)) {
[2]	252	token.type = TokenTypeEnum.Int;
	253	return token;
[272]	254	} else if(double.TryParse(strToken, NumberStyles.Float, currentNumberFormatInfo, out token.doubleValue)) {
[2]	255	token.type = TokenTypeEnum.Double;
	256	return token;
	257	}
	258
	259	// nothing worked
	260	return token;
	261	}
	262	}
	263
	264	public Token Peek() {
	265	return tokens[0];
	266	}
	267
	268	public Token Next() {
	269	Token next = tokens[0];
	270	tokens.RemoveAt(0);
[272]	271	if(tokens.Count == 0) {
[2]	272	ReadNextTokens();
	273	}
	274	return next;
	275	}
	276
	277	public bool HasNext() {
	278	return tokens.Count > 0 \|\| !reader.EndOfStream;
	279	}
	280	}
	281	#endregion
	282
	283	#region parsing
	284	private void Parse(bool strict) {
	285	ParseMetaData(strict);
	286	ParseSampleData(strict);
	287	}
	288
	289	private void ParseSampleData(bool strict) {
	290	List<double> row = new List<double>();
[272]	291	while(tokenizer.HasNext()) {
[2]	292	Token current = tokenizer.Next();
[272]	293	if(current.type == TokenTypeEnum.Double) {
[2]	294	// just take the value
	295	row.Add(current.doubleValue);
[272]	296	} else if(current.type == TokenTypeEnum.Int) {
[2]	297	// translate the int value to double
	298	row.Add((double)current.intValue);
[272]	299	} else if(current == Tokenizer.NewlineToken) {
[2]	300	// when parsing strictly all rows have to have the same number of values
[272]	301	if(strict) {
[2]	302	// the first row defines how many samples are needed
[272]	303	if(samplesList.Count > 0 && samplesList[0].Count != row.Count) {
[2]	304	Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
[273]	305	"\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
[2]	306	}
[272]	307	} else if(samplesList.Count > 0) {
[2]	308	// when we are not strict then fill or drop elements as needed
[272]	309	if(samplesList[0].Count > row.Count) {
[2]	310	// fill with NAN
[272]	311	for(int i = row.Count; i < samplesList[0].Count; i++) {
[2]	312	row.Add(double.NaN);
	313	}
[272]	314	} else if(samplesList[0].Count < row.Count) {
[2]	315	// drop last k elements where k = n - length of first row
	316	row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count);
	317	}
	318	}
	319
	320	// add the current row to the collection of rows and start a new row
	321	samplesList.Add(row);
	322	row = new List<double>();
	323	} else {
	324	// found an unexpected token => return false when parsing strictly
	325	// when we are parsing non-strictly we also allow unreadable values inserting NAN instead
[272]	326	if(strict) {
[273]	327	Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
[2]	328	} else {
	329	row.Add(double.NaN);
	330	}
	331	}
	332	}
	333	}
	334
	335	private void ParseMetaData(bool strict) {
[272]	336	while(tokenizer.Peek() == Tokenizer.AtToken) {
[2]	337	Expect(Tokenizer.AtToken);
	338
	339	Token nameToken = tokenizer.Next();
[272]	340	if(nameToken.type != TokenTypeEnum.String)
[273]	341	Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber);
[2]	342
	343	Expect(Tokenizer.AssignmentToken);
	344
	345	List<Token> tokens = new List<Token>();
	346	Token valueToken = tokenizer.Next();
[272]	347	while(valueToken != Tokenizer.NewlineToken) {
[2]	348	tokens.Add(valueToken);
	349	valueToken = tokenizer.Next();
	350	}
	351
	352	metadata[nameToken.stringValue] = tokens;
	353	}
	354	}
	355
	356	private void Expect(Token expectedToken) {
	357	Token actualToken = tokenizer.Next();
[272]	358	if(actualToken != expectedToken) {
[273]	359	Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber);
[2]	360	}
	361	}
	362
[273]	363	private void Error(string message, string token, int lineNumber) {
	364	throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
[2]	365	}
	366	#endregion
	367	}
	368	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences