#region License Information /* HeuristicLab * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections.Generic; using System.Globalization; using System.IO; using HeuristicLab.Data; namespace HeuristicLab.DataAnalysis { public class DatasetParser { private const string PROBLEMNAME = "PROBLEMNAME"; private const string VARIABLENAMES = "VARIABLENAMES"; private const string TARGETVARIABLE = "TARGETVARIABLE"; private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT"; private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE"; private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART"; private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND"; private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART"; private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND"; private const string TESTSAMPLESSTART = "TESTSAMPLESSTART"; private const string TESTSAMPLESEND = "TESTSAMPLESEND"; private const string NONINPUTVARIABLES = "NONINPUTVARIABLES"; private Tokenizer tokenizer; private Dictionary> metadata; private List> samplesList; private int rows; public int Rows { get { return rows; } set { rows = value; } } private int columns; public int Columns { get { return columns; } set { columns = value; } } private double[] samples; public double[] Samples { get { return samples; } } public string ProblemName { get { if (metadata.ContainsKey(PROBLEMNAME)) { return metadata[PROBLEMNAME][0].stringValue; } else return "-"; } } public string[] VariableNames { get { if (metadata.ContainsKey(VARIABLENAMES)) { List nameList = metadata[VARIABLENAMES]; string[] names = new string[nameList.Count]; for (int i = 0; i < names.Length; i++) { names[i] = nameList[i].stringValue; } return names; } else { string[] names = new string[columns]; for (int i = 0; i < names.Length; i++) { names[i] = "X" + i.ToString("000"); } return names; } } } public int TargetVariable { get { if (metadata.ContainsKey(TARGETVARIABLE)) { return metadata[TARGETVARIABLE][0].intValue; } else return 0; // default is the first column } } public int MaxTreeHeight { get { if (metadata.ContainsKey(MAXIMUMTREEHEIGHT)) { return metadata[MAXIMUMTREEHEIGHT][0].intValue; } else return 0; } } public int MaxTreeSize { get { if (metadata.ContainsKey(MAXIMUMTREESIZE)) { return metadata[MAXIMUMTREESIZE][0].intValue; } else return 0; } } public int TrainingSamplesStart { get { if (metadata.ContainsKey(TRAININGSAMPLESSTART)) { return metadata[TRAININGSAMPLESSTART][0].intValue; } else return 0; } } public int TrainingSamplesEnd { get { if (metadata.ContainsKey(TRAININGSAMPLESEND)) { return metadata[TRAININGSAMPLESEND][0].intValue; } else return rows; } } public int ValidationSamplesStart { get { if (metadata.ContainsKey(VALIDATIONSAMPLESSTART)) { return metadata[VALIDATIONSAMPLESSTART][0].intValue; } else return 0; } } public int ValidationSamplesEnd { get { if (metadata.ContainsKey(VALIDATIONSAMPLESEND)) { return metadata[VALIDATIONSAMPLESEND][0].intValue; } else return rows; } } public int TestSamplesStart { get { if (metadata.ContainsKey(TESTSAMPLESSTART)) { return metadata[TESTSAMPLESSTART][0].intValue; } else return 0; } } public int TestSamplesEnd { get { if (metadata.ContainsKey(TESTSAMPLESEND)) { return metadata[TESTSAMPLESEND][0].intValue; } else return rows; } } public List NonInputVariables { get { List disallowedVariables = new List(); if (metadata.ContainsKey(NONINPUTVARIABLES)) { foreach (Token t in metadata[NONINPUTVARIABLES]) { disallowedVariables.Add(t.intValue); } } return disallowedVariables; } } public DatasetParser() { this.metadata = new Dictionary>(); samplesList = new List>(); } public void Reset() { metadata.Clear(); samplesList.Clear(); } public void Import(string importFileName, bool strict) { TryParse(importFileName, strict); // translate the list of samples into a DoubleMatrixData item samples = new double[samplesList.Count * samplesList[0].Count]; rows = samplesList.Count; columns = samplesList[0].Count; int i = 0; int j = 0; foreach (List row in samplesList) { j = 0; foreach (double element in row) { samples[i * columns + j] = element; j++; } i++; } } private void TryParse(string importFileName, bool strict) { Exception lastEx = null; NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo, CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo }; foreach (NumberFormatInfo numberFormat in possibleFormats) { using (StreamReader reader = new StreamReader(importFileName)) { tokenizer = new Tokenizer(reader, numberFormat); try { // parse the file Parse(strict); return; // parsed without errors -> return; } catch (DataFormatException ex) { lastEx = ex; } } } // all number formats threw an exception -> rethrow the last exception throw lastEx; } #region tokenizer internal enum TokenTypeEnum { At, Assign, NewLine, String, Double, Int, WhiteSpace } internal class Token { public TokenTypeEnum type; public string stringValue; public double doubleValue; public int intValue; public Token(TokenTypeEnum type, string value) { this.type = type; stringValue = value; doubleValue = 0.0; intValue = 0; } public override string ToString() { return stringValue; } } class Tokenizer { private StreamReader reader; private List tokens; private string[] separators = new string[] { "@", "=", ";", "\t" }; private NumberFormatInfo numberFormatInfo; public int CurrentLineNumber = 0; public string CurrentLine; public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n"); public static Token AtToken = new Token(TokenTypeEnum.At, "@"); public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "="); public static Token SeparatorToken = new Token(TokenTypeEnum.WhiteSpace, ""); public string[] Separators { get { return separators; } set { separators = value; } } public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) { this.reader = reader; this.numberFormatInfo = numberFormatInfo; tokens = new List(); ReadNextTokens(); } private void ReadNextTokens() { if (!reader.EndOfStream) { CurrentLine = reader.ReadLine(); Token[] newTokens = Array.ConvertAll(CurrentLine.Split(separators, StringSplitOptions.None), delegate(string str) { return MakeToken(str.Trim()); }); foreach (Token tok in newTokens) { if (tok != SeparatorToken) tokens.Add(tok); } tokens.Add(NewlineToken); CurrentLineNumber++; } } private Token MakeToken(string strToken) { Token token = new Token(TokenTypeEnum.String, strToken); // try to parse as a number first if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) { token.type = TokenTypeEnum.Int; return token; } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) { token.type = TokenTypeEnum.Double; return token; } else if (String.IsNullOrEmpty(strToken)) { token.type = TokenTypeEnum.WhiteSpace; return token; } // couldn't parse the token as an int or float number so return a string token return token; } public Token Peek() { return tokens[0]; } public Token Next() { Token next = tokens[0]; tokens.RemoveAt(0); if (tokens.Count == 0) { ReadNextTokens(); } return next; } public bool HasNext() { return tokens.Count > 0 || !reader.EndOfStream; } } #endregion #region parsing private void Parse(bool strict) { ParseMetaData(strict); if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); ParseSampleData(strict); if (samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber); } private void ParseSampleData(bool strict) { List row = new List(); while (tokenizer.HasNext()) { Token current = tokenizer.Next(); if (current.type == TokenTypeEnum.WhiteSpace) { row.Add(double.NaN); } else if (current.type == TokenTypeEnum.Double) { // just take the value row.Add(current.doubleValue); } else if (current.type == TokenTypeEnum.Int) { // translate the int value to double row.Add((double)current.intValue); } else if (current == Tokenizer.NewlineToken) { // when parsing strictly all rows have to have the same number of values if (strict) { // the first row defines how many samples are needed if (samplesList.Count > 0 && samplesList[0].Count != row.Count) { Error("The first row of the dataset has " + samplesList[0].Count + " columns." + "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber); } } else if (samplesList.Count > 0) { // when we are not strict then fill or drop elements as needed if (samplesList[0].Count > row.Count) { // fill with NAN for (int i = row.Count; i < samplesList[0].Count; i++) { row.Add(double.NaN); } } else if (samplesList[0].Count < row.Count) { // drop last k elements where k = n - length of first row row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count); } } // add the current row to the collection of rows and start a new row samplesList.Add(row); row = new List(); } else { // found an unexpected token => return false when parsing strictly // when we are parsing non-strictly we also allow unreadable values inserting NAN instead if (strict) { Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber); } else { row.Add(double.NaN); } } } } private void ParseMetaData(bool strict) { while (tokenizer.HasNext() && (tokenizer.Peek().type == TokenTypeEnum.WhiteSpace || tokenizer.Peek().type == TokenTypeEnum.String)) { while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) tokenizer.Next(); Token nameToken = tokenizer.Next(); if (nameToken.type != TokenTypeEnum.String) Error("Expected a variable name.", nameToken.stringValue, tokenizer.CurrentLineNumber); List tokens = new List(); Token valueToken; while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) valueToken = tokenizer.Next(); valueToken = tokenizer.Next(); while (valueToken != Tokenizer.NewlineToken) { tokens.Add(valueToken); while (tokenizer.HasNext() && tokenizer.Peek().type == TokenTypeEnum.WhiteSpace) tokenizer.Next(); valueToken = tokenizer.Next(); } metadata[nameToken.stringValue] = tokens; } } private void Expect(Token expectedToken) { Token actualToken = tokenizer.Next(); if (actualToken != expectedToken) { Error("Expected: " + expectedToken, actualToken.stringValue, tokenizer.CurrentLineNumber); } } private void Error(string message, string token, int lineNumber) { throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber); } #endregion } }