Context Navigation

← Previous Change
Next Change →

CsvFileParser.cs

Timestamp:

04/04/10 18:53:55 (14 years ago)

Author:

gkronber

Message:

Implemented import of CSV files for regression problems. #938 (Data types and operators for regression problems)

File:

: 1 copied

trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/CsvFileParser.cs (copied) (copied from trunk/sources/HeuristicLab.DataAnalysis/3.2/DatasetParser.cs) (12 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/CsvFileParser.cs

-                      r3262
+                      r3264
 #region License Information
 /* HeuristicLab
  * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
+ * Copyright (C) 2002-2010 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
+ *
  * This file is part of HeuristicLab.
 …
 using System.Text;
+namespace HeuristicLab.DataAnalysis {
+  public class DatasetParser {
+    private const string PROBLEMNAME = "PROBLEMNAME";
+namespace HeuristicLab.Problems.DataAnalysis.Regression {
+  public class CsvFileParser {
     private const string VARIABLENAMES = "VARIABLENAMES";
-    private const string TARGETVARIABLE = "TARGETVARIABLE";
-    private const string MAXIMUMTREEHEIGHT = "MAXIMUMTREEHEIGHT";
-    private const string MAXIMUMTREESIZE = "MAXIMUMTREESIZE";
-    private const string TRAININGSAMPLESSTART = "TRAININGSAMPLESSTART";
-    private const string TRAININGSAMPLESEND = "TRAININGSAMPLESEND";
-    private const string VALIDATIONSAMPLESSTART = "VALIDATIONSAMPLESSTART";
-    private const string VALIDATIONSAMPLESEND = "VALIDATIONSAMPLESEND";
-    private const string TESTSAMPLESSTART = "TESTSAMPLESSTART";
-    private const string TESTSAMPLESEND = "TESTSAMPLESEND";
-    private const string NONINPUTVARIABLES = "NONINPUTVARIABLES";
     private Tokenizer tokenizer;
     private Dictionary<string, List<Token>> metadata;
     private List<List<double>> samplesList;
+    private List<string> variableNames;
+    private List<List<double>> rowValues;
     private int rows;
 …
+    }
     private double[] samples;
     public double[] Samples {
+    private double[,] values;
+    public double[,] Values {
       get {
         return samples;
+      }
+    }
     public string ProblemName {
+        return values;
+      }
+    }
+    public IEnumerable<string> VariableNames {
       get {
+        if (metadata.ContainsKey(PROBLEMNAME)) {
+          return metadata[PROBLEMNAME][0].stringValue;
+        } else return "-";
+      }
+    }
+    public string[] VariableNames {
+      get {
+        if (metadata.ContainsKey(VARIABLENAMES)) {
+          List<Token> nameList = metadata[VARIABLENAMES];
+          string[] names = new string[nameList.Count];
+          for (int i = 0; i < names.Length; i++) {
+            names[i] = nameList[i].stringValue;
+          }
+          return names;
+        } else {
+        if (variableNames.Count > 0) return variableNames;
+        else {
           string[] names = new string[columns];
           for (int i = 0; i < names.Length; i++) {
 …
+    }
+    public int TargetVariable {
+      get {
+        if (metadata.ContainsKey(TARGETVARIABLE)) {
+          return metadata[TARGETVARIABLE][0].intValue;
+        } else return 0; // default is the first column
+      }
+    }
+    public int MaxTreeHeight {
+      get {
+        if (metadata.ContainsKey(MAXIMUMTREEHEIGHT)) {
+          return metadata[MAXIMUMTREEHEIGHT][0].intValue;
+        } else return 0;
+      }
+    }
+    public int MaxTreeSize {
+      get {
+        if (metadata.ContainsKey(MAXIMUMTREESIZE)) {
+          return metadata[MAXIMUMTREESIZE][0].intValue;
+        } else return 0;
+      }
+    }
+    public int TrainingSamplesStart {
+      get {
+        if (metadata.ContainsKey(TRAININGSAMPLESSTART)) {
+          return metadata[TRAININGSAMPLESSTART][0].intValue;
+        } else return 0;
+      }
+    }
+    public int TrainingSamplesEnd {
+      get {
+        if (metadata.ContainsKey(TRAININGSAMPLESEND)) {
+          return metadata[TRAININGSAMPLESEND][0].intValue;
+        } else return rows;
+      }
+    }
+    public int ValidationSamplesStart {
+      get {
+        if (metadata.ContainsKey(VALIDATIONSAMPLESSTART)) {
+          return metadata[VALIDATIONSAMPLESSTART][0].intValue;
+        } else return 0;
+      }
+    }
+    public int ValidationSamplesEnd {
+      get {
+        if (metadata.ContainsKey(VALIDATIONSAMPLESEND)) {
+          return metadata[VALIDATIONSAMPLESEND][0].intValue;
+        } else return rows;
+      }
+    }
+    public int TestSamplesStart {
+      get {
+        if (metadata.ContainsKey(TESTSAMPLESSTART)) {
+          return metadata[TESTSAMPLESSTART][0].intValue;
+        } else return 0;
+      }
+    }
+    public int TestSamplesEnd {
+      get {
+        if (metadata.ContainsKey(TESTSAMPLESEND)) {
+          return metadata[TESTSAMPLESEND][0].intValue;
+        } else return rows;
+      }
+    }
+    public List<int> NonInputVariables {
+      get {
+        List<int> disallowedVariables = new List<int>();
+        if (metadata.ContainsKey(NONINPUTVARIABLES)) {
+          foreach (Token t in metadata[NONINPUTVARIABLES]) {
+            disallowedVariables.Add(t.intValue);
+          }
+        }
+        return disallowedVariables;
+      }
+    }
+    public DatasetParser() {
+      this.metadata = new Dictionary<string, List<Token>>();
+      samplesList = new List<List<double>>();
+    }
+    public void Reset() {
+      metadata.Clear();
+      samplesList.Clear();
+    }
+    public void Import(string importFileName, bool strict) {
+      TryParse(importFileName, strict);
+    public CsvFileParser() {
+      rowValues = new List<List<double>>();
+      variableNames = new List<string>();
+    }
+    private void Reset() {
+      variableNames.Clear();
+      rowValues.Clear();
+    }
+    public void Parse(string fileName) {
+      TryParse(fileName);
       // translate the list of samples into a DoubleMatrixData item
       samples = new double[samplesList.Count * samplesList[0].Count];
       rows = samplesList.Count;
       columns = samplesList[0].Count;
       int i = 0;
       int j = 0;
       foreach (List<double> row in samplesList) {
         j = 0;
+      rows = rowValues.Count;
+      columns = rowValues[0].Count;
+      values = new double[rows, columns];
+      int rowIndex = 0;
+      int columnIndex = 0;
+      foreach (List<double> row in rowValues) {
+        columnIndex = 0;
         foreach (double element in row) {
+          samples[i * columns + j] = element;
+          j++;
+        }
+        i++;
+      }
+    }
+    private void TryParse(string importFileName, bool strict) {
+          values[rowIndex, columnIndex++] = element;
+        }
+        rowIndex++;
+      }
+    }
+    private void TryParse(string fileName) {
       Exception lastEx = null;
       NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo, CultureInfo.GetCultureInfo("de-DE").NumberFormat, NumberFormatInfo.CurrentInfo };
+      NumberFormatInfo[] possibleFormats = new NumberFormatInfo[] { NumberFormatInfo.InvariantInfo };
       foreach (NumberFormatInfo numberFormat in possibleFormats) {
         using (StreamReader reader = new StreamReader(importFileName)) {
+        using (StreamReader reader = new StreamReader(fileName)) {
           tokenizer = new Tokenizer(reader, numberFormat);
           try {
             // parse the file
             Parse(strict);
+            Parse();
             return; // parsed without errors -> return;
+          }
 …
     #region tokenizer
     internal enum TokenTypeEnum {
       At, Assign, NewLine, SemiColon, String, Double, Int
+      NewLine, Separator, String, Double
+    }
 …
       public string stringValue;
       public double doubleValue;
-      public int intValue;
       public Token(TokenTypeEnum type, string value) {
 …
         stringValue = value;
         doubleValue = 0.0;
-        intValue = 0;
+      }
 …
     class Tokenizer {
+    internal class Tokenizer {
       private StreamReader reader;
       private List<Token> tokens;
       private NumberFormatInfo numberFormatInfo;
+      public int CurrentLineNumber = 0;
+      public string CurrentLine;
+      public static Token NewlineToken = new Token(TokenTypeEnum.NewLine, "\n");
+      public static Token AtToken = new Token(TokenTypeEnum.At, "@");
+      public static Token AssignmentToken = new Token(TokenTypeEnum.Assign, "=");
+      public static Token SeparatorToken = new Token(TokenTypeEnum.SemiColon, ";");
+      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo) {
+      private int currentLineNumber = 0;
+      public int CurrentLineNumber {
+        get { return currentLineNumber; }
+        private set { currentLineNumber = value; }
+      }
+      private string currentLine;
+      public string CurrentLine {
+        get { return currentLine; }
+        private set { currentLine = value; }
+      }
+      private Token newlineToken;
+      public Token NewlineToken {
+        get { return newlineToken; }
+        private set { newlineToken = value; }
+      }
+      private Token separatorToken;
+      public Token SeparatorToken {
+        get { return separatorToken; }
+        private set { separatorToken = value; }
+      }
+      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo, char separator) {
         this.reader = reader;
         this.numberFormatInfo = numberFormatInfo;
+        separatorToken = new Token(TokenTypeEnum.Separator, separator.ToString());
+        newlineToken = new Token(TokenTypeEnum.NewLine, Environment.NewLine);
         tokens = new List<Token>();
         ReadNextTokens();
+      }
+      public Tokenizer(StreamReader reader, NumberFormatInfo numberFormatInfo)
+        : this(reader, numberFormatInfo, ';') {
+      }
 …
         StringBuilder subStr = new StringBuilder();
         foreach (char c in line) {
           if (c == '@' || c == '=' || c == ';') {
+          if (c == ';') {
             yield return subStr.ToString();
             subStr = new StringBuilder();
 …
       private Token MakeToken(string strToken) {
         Token token = new Token(TokenTypeEnum.String, strToken);
+        if (strToken.Equals(AtToken.stringValue)) {
+          return AtToken;
+        } else if (strToken.Equals(AssignmentToken.stringValue)) {
+          return AssignmentToken;
+        } else if (strToken.Equals(SeparatorToken.stringValue)) {
+        if (strToken.Equals(SeparatorToken.stringValue)) {
           return SeparatorToken;
-        } else if (int.TryParse(strToken, NumberStyles.Integer, numberFormatInfo, out token.intValue)) {
-          token.type = TokenTypeEnum.Int;
-          return token;
         } else if (double.TryParse(strToken, NumberStyles.Float, numberFormatInfo, out token.doubleValue)) {
           token.type = TokenTypeEnum.Double;
 …
     #region parsing
     private void Parse(bool strict) {
       ParseMetaData(strict);
+    private void Parse() {
+      ParseVariableNames();
       if (!tokenizer.HasNext()) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
       ParseSampleData(strict);
       if (samplesList.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
+    }
     private void ParseSampleData(bool strict) {
+      ParseValues();
+      if (rowValues.Count == 0) Error("Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
+    }
+    private void ParseValues() {
       while (tokenizer.HasNext()) {
         List<double> row = new List<double>();
+        row.Add(NextValue(tokenizer, strict));
+        while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) {
+          Expect(Tokenizer.SeparatorToken);
+          row.Add(NextValue(tokenizer, strict));
+        }
+        Expect(Tokenizer.NewlineToken);
+        // when parsing strictly all rows have to have the same number of values
+        if (strict) {
+          // the first row defines how many samples are needed
+          if (samplesList.Count > 0 && samplesList[0].Count != row.Count) {
+            Error("The first row of the dataset has " + samplesList[0].Count + " columns." +
+              "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
+          }
+        } else if (samplesList.Count > 0) {
+          // when we are not strict then fill or drop elements as needed
+          if (samplesList[0].Count > row.Count) {
+            // fill with NAN
+            for (int i = row.Count; i < samplesList[0].Count; i++) {
+              row.Add(double.NaN);
+            }
+          } else if (samplesList[0].Count < row.Count) {
+            // drop last k elements where k = n - length of first row
+            row.RemoveRange(samplesList[0].Count - 1, row.Count - samplesList[0].Count);
+          }
+        }
+        row.Add(NextValue(tokenizer));
+        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
+          Expect(tokenizer.SeparatorToken);
+          row.Add(NextValue(tokenizer));
+        }
+        Expect(tokenizer.NewlineToken);
+        // all rows have to have the same number of values
+        // the first row defines how many samples are needed
+        if (rowValues.Count > 0 && rowValues[0].Count != row.Count) {
+          Error("The first row of the dataset has " + rowValues[0].Count + " columns." +
+            "\nLine " + tokenizer.CurrentLineNumber + " has " + row.Count + " columns.", "", tokenizer.CurrentLineNumber);
+        }
         // add the current row to the collection of rows and start a new row
         samplesList.Add(row);
+        rowValues.Add(row);
         row = new List<double>();
+      }
+    }
     private double NextValue(Tokenizer tokenizer, bool strict) {
       if (tokenizer.Peek() == Tokenizer.SeparatorToken || tokenizer.Peek() == Tokenizer.NewlineToken) return double.NaN;
+    private double NextValue(Tokenizer tokenizer) {
+      if (tokenizer.Peek() == tokenizer.SeparatorToken || tokenizer.Peek() == tokenizer.NewlineToken) return double.NaN;
       Token current = tokenizer.Next();
       if (current.type == TokenTypeEnum.SemiColon || current.type == TokenTypeEnum.String) {
+      if (current.type == TokenTypeEnum.Separator || current.type == TokenTypeEnum.String) {
         return double.NaN;
       } else if (current.type == TokenTypeEnum.Double) {
         // just take the value
         return current.doubleValue;
+      } else if (current.type == TokenTypeEnum.Int) {
+        // translate the int value to double
+        return (double)current.intValue;
+      } else {
+        // found an unexpected token => throw error when parsing strictly
+        // when we are parsing non-strictly we also allow unreadable values inserting NAN instead
+        if (strict) {
+          Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
+        } else {
+          return double.NaN;
+        }
+      }
+      return double.NaN;
+    }
+    private void ParseMetaData(bool strict) {
+      while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.AtToken) {
+        Expect(Tokenizer.AtToken);
+        Token nameToken = tokenizer.Next();
+        Expect(Tokenizer.AssignmentToken);
+      }
+      // found an unexpected token => throw error
+      Error("Unexpected token.", current.stringValue, tokenizer.CurrentLineNumber);
+      // this line is never executed because Error() throws an exception
+      throw new InvalidOperationException();
+    }
+    private void ParseVariableNames() {
+      // if the first line doesn't start with a double value then we assume that the
+      // first line contains variable names
+      if (tokenizer.HasNext() && tokenizer.Peek().type != TokenTypeEnum.Double) {
         List<Token> tokens = new List<Token>();
 …
         valueToken = tokenizer.Next();
         tokens.Add(valueToken);
         while (tokenizer.HasNext() && tokenizer.Peek() == Tokenizer.SeparatorToken) {
           Expect(Tokenizer.SeparatorToken);
+        while (tokenizer.HasNext() && tokenizer.Peek() == tokenizer.SeparatorToken) {
+          Expect(tokenizer.SeparatorToken);
           valueToken = tokenizer.Next();
           if (valueToken != Tokenizer.NewlineToken) {
+          if (valueToken != tokenizer.NewlineToken) {
             tokens.Add(valueToken);
+          }
+        }
         if (valueToken != Tokenizer.NewlineToken) {
           Expect(Tokenizer.NewlineToken);
+        }
         metadata[nameToken.stringValue] = tokens;
+        if (valueToken != tokenizer.NewlineToken) {
+          Expect(tokenizer.NewlineToken);
+        }
+        variableNames = tokens.Select(x => x.stringValue.Trim()).ToList();
+      }
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 3264 for trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/CsvFileParser.cs

Legend:

trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/CsvFileParser.cs

Download in other formats: