
Timestamp:
10/13/16 19:47:41 (8 years ago)
Author:
gkronber
Message:

#2650 Merged r14282:14322 from trunk to branch (fixing conflicts)

Location:
branches/symbreg-factors-2650
Files:
4 edited

Legend: unchanged lines have no prefix, added lines are prefixed with "+", removed lines with "-"; "…" marks elided lines.
  • branches/symbreg-factors-2650

  • branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis

  • branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/VariableNetworks/VariableNetwork.cs

    r14277 → r14330

     using HeuristicLab.Common;
     using HeuristicLab.Core;
    +using HeuristicLab.Problems.DataAnalysis;
     using HeuristicLab.Random;

    …

           int nl = xs.Length;
           int nRows = xs.First().Count;
    -      double[,] K = new double[nRows, nRows];
    -
    -      // sample length-scales
    +
    +      // sample u iid ~ N(0, 1)
    +      var u = Enumerable.Range(0, nRows).Select(_ => NormalDistributedRandom.NextDouble(random, 0, 1)).ToArray();
    +
    +      // sample actual length-scales
           var l = Enumerable.Range(0, nl)
             .Select(_ => random.NextDouble() * 2 + 0.5)
             .ToArray();
    -      // calculate covariance matrix
    +
    +      double[,] K = CalculateCovariance(xs, l);
    +
    +      // decompose
    +      alglib.trfac.spdmatrixcholesky(ref K, nRows, false);
    +
    +      // calc y = Lu
    +      var y = new double[u.Length];
    +      alglib.ablas.rmatrixmv(nRows, nRows, K, 0, 0, 0, u, 0, ref y, 0);
    +
    +      // calculate relevance by removing dimensions
    +      relevance = CalculateRelevance(y, u, xs, l);
    +
    +      // calculate variable relevance
    +      // as per Rasmussen and Williams "Gaussian Processes for Machine Learning" page 106:
    +      // ,,For the squared exponential covariance function [...] the l1, ..., lD hyperparameters
    +      // play the role of characteristic length scales [...]. Such a covariance function implements
    +      // automatic relevance determination (ARD) [Neal, 1996], since the inverse of the length-scale
    +      // determines how relevant an input is: if the length-scale has a very large value, the covariance
    +      // will become almost independent of that input, effectively removing it from inference.''
    +      // relevance = l.Select(li => 1.0 / li).ToArray();
    +
    +      return y;
    +    }
    +
    +    // calculate variable relevance based on removal of variables
    +    //  1) to remove a variable we set it's length scale to infinity (no relation of the variable value to the target)
    +    //  2) calculate MSE of the original target values (y) to the updated targes y' (after variable removal)
    +    //  3) relevance is larger if MSE(y,y') is large
    +    //  4) scale impacts so that the most important variable has impact = 1
    +    private double[] CalculateRelevance(double[] y, double[] u, List<double>[] xs, double[] l) {
    +      int nRows = xs.First().Count;
    +      var changedL = new double[l.Length];
    +      var relevance = new double[l.Length];
    +      for (int i = 0; i < l.Length; i++) {
    +        Array.Copy(l, changedL, changedL.Length);
    +        changedL[i] = double.MaxValue;
    +        var changedK = CalculateCovariance(xs, changedL);
    +
    +        var yChanged = new double[u.Length];
    +        alglib.ablas.rmatrixmv(nRows, nRows, changedK, 0, 0, 0, u, 0, ref yChanged, 0);
    +
    +        OnlineCalculatorError error;
    +        var mse = OnlineMeanSquaredErrorCalculator.Calculate(y, yChanged, out error);
    +        if (error != OnlineCalculatorError.None) mse = double.MaxValue;
    +        relevance[i] = mse;
    +      }
    +      // scale so that max relevance is 1.0
    +      var maxRel = relevance.Max();
    +      for (int i = 0; i < relevance.Length; i++) relevance[i] /= maxRel;
    +      return relevance;
    +    }
    +
    +    private double[,] CalculateCovariance(List<double>[] xs, double[] l) {
    +      int nRows = xs.First().Count;
    +      double[,] K = new double[nRows, nRows];
           for (int r = 0; r < nRows; r++) {
             double[] xi = xs.Select(x => x[r]).ToArray();

    …

             }
           }
    -
           // add a small diagonal matrix for numeric stability
           for (int i = 0; i < nRows; i++) {

    …

           }

    -      // decompose
    -      alglib.trfac.spdmatrixcholesky(ref K, nRows, false);
    -
    -      // sample u iid ~ N(0, 1)
    -      var u = Enumerable.Range(0, nRows).Select(_ => NormalDistributedRandom.NextDouble(random, 0, 1)).ToArray();
    -
    -      // calc y = Lu
    -      var y = new double[u.Length];
    -      alglib.ablas.rmatrixmv(nRows, nRows, K, 0, 0, 0, u, 0, ref y, 0);
    -
    -      // calculate variable relevance
    -      // as per Rasmussen and Williams "Gaussian Processes for Machine Learning" page 106:
    -      // ,,For the squared exponential covariance function [...] the l1, ..., lD hyperparameters
    -      // play the role of characteristic length scales [...]. Such a covariance function implements
    -      // automatic relevance determination (ARD) [Neal, 1996], since the inverse of the length-scale
    -      // determines how relevant an input is: if the length-scale has a very large value, the covariance
    -      // will become almost independent of that input, effectively removing it from inference.''
    -      relevance = l.Select(li => 1.0 / li).ToArray();
    -
    -      return y;
    +      return K;
         }
       }
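For orientation, this is what the restructured sampling code is intended to compute, per its own comments. The covariance entries are filled in inside the elided loop of CalculateCovariance; a squared exponential ARD kernel is assumed here, consistent with the quoted Rasmussen and Williams passage. With D inputs, x_{r,d} the value of input d in row r, and sampled length-scales l_1, …, l_D:

\[
k(x_r, x_s) = \exp\!\Big(-\tfrac{1}{2}\sum_{d=1}^{D}\frac{(x_{r,d}-x_{s,d})^2}{l_d^2}\Big),
\qquad K = LL^{\top},
\qquad u \sim \mathcal{N}(0, I),
\qquad y = Lu \sim \mathcal{N}(0, K).
\]

The new CalculateRelevance then scores input d by how much the sampled target changes when that input is effectively removed (its length-scale pushed to double.MaxValue), normalized so the most relevant input scores 1:

\[
\mathrm{relevance}_d = \frac{\mathrm{MSE}\big(y,\, y^{(d)}\big)}{\max_{j}\,\mathrm{MSE}\big(y,\, y^{(j)}\big)},
\qquad y^{(d)} = L^{(d)}u \ \text{with}\ l_d \to \infty.
\]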
  • branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r14185 → r14330

         /// <param name="columnNamesInFirstLine"></param>
         public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
    -      using (StreamReader reader = new StreamReader(stream, Encoding)) {
    +      if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
    +
    +      using (var reader = new StreamReader(stream)) {
             tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    +        var strValues = new List<List<string>>();
             values = new List<IList>();
    -        if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
    -
    -        if (columnNamesInFirstLine) {
    -          ParseVariableNames();
    -          if (!tokenizer.HasNext())
    -            Error(
    -              "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
    -              "", tokenizer.CurrentLineNumber);
    -        }
    -
    -        // read values... start in first row
    +        Prepare(columnNamesInFirstLine, strValues);
    +
             int nLinesParsed = 0;
             int colIdx = 0;
    -        int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)
             while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
               if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {

    …

                 // all rows have to have the same number of values
    -            // the first row defines how many samples are needed
    -            if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row
    -            else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
    -              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
    +            // the first row defines how many elements are needed
    +            if (colIdx > 0 && values.Count != colIdx) {
    +              // read at least one value in the row (support for skipping empty lines)
    +              Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
                         "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
    -                    tokenizer.CurrentLineNumber);
    +                tokenizer.CurrentLineNumber);
                 }
                 OnReport(tokenizer.BytesRead);

    …

               } else {
                 // read one value
    -            TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
    +            TokenTypeEnum type;
    +            string strVal;
    +            double dblVal;
    +            DateTime dateTimeVal;
                 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);

    -            // initialize columns on the first row (fixing data types as presented in the first row...)
    -            if (nLinesParsed == 0) {
    -              values.Add(CreateList(type, estimatedNumberOfLines));
    -            } else if (colIdx == values.Count) {
    -              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
    +            if (colIdx == values.Count) {
    +              Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
                         "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
                     tokenizer.CurrentLineNumber);
                 }
                 if (!IsColumnTypeCompatible(values[colIdx], type)) {
    -              values[colIdx] = ConvertToStringColumn(values[colIdx]);
    +              values[colIdx] = strValues[colIdx];
                 }
    +
                 // add the value to the column
    -            AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
    +            AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
    +            if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string>
    +              strValues[colIdx].Add(strVal);
    +            }
    +            colIdx++;
               }
             }
    -
    -        if (!values.Any() || values.First().Count == 0)
    -          Error("Couldn't parse data values. Probably because of incorrect number format " +
    -                "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    -      }
    +      }
    +
    +      if (!values.Any() || values.First().Count == 0)
    +        Error("Couldn't parse data values. Probably because of incorrect number format " +
    +              "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);

           this.rows = values.First().Count;

    …

           // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
           GC.Collect(2, GCCollectionMode.Forced);
    +    }
    +
    +    private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) {
    +      if (columnNamesInFirstLine) {
    +        ParseVariableNames();
    +        if (!tokenizer.HasNext())
    +          Error(
    +            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
    +            "", tokenizer.CurrentLineNumber);
    +      }
    +      // read first line to determine types and allocate specific lists
    +      // read values... start in first row
    +      int colIdx = 0;
    +      while (tokenizer.PeekType() != TokenTypeEnum.NewLine) {
    +        // read one value
    +        TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
    +        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
    +
    +        // initialize column
    +        values.Add(CreateList(type, estimatedNumberOfLines));
    +        if (type == TokenTypeEnum.String)
    +          strValues.Add(new List<string>(0)); // optimization: don't store the string values in another list if the column is list<string>
    +        else
    +          strValues.Add(new List<string>(estimatedNumberOfLines));
    +
    +        AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
    +        if (type != TokenTypeEnum.String)
    +          strValues[colIdx].Add(strVal);
    +        colIdx++;
    +      }
    +      tokenizer.Skip(); // skip newline
         }
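The new strValues bookkeeping exists so that a column whose inferred type breaks on a later row (when IsColumnTypeCompatible fails) can be swapped to its already-collected string representation without re-reading the file. A minimal, hypothetical sketch of that idea follows; the names here (RawColumn, Add, Values) are invented for illustration and are not part of the HeuristicLab TableFileParser API:

    using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.Globalization;

    // One CSV column that keeps the raw string tokens alongside the typed values.
    class RawColumn {
      private List<double> doubles = new List<double>();           // typed values while the inferred type still holds
      private readonly List<string> strings = new List<string>();  // raw tokens kept in parallel

      private bool isString; // set once a token fails to parse as double

      public void Add(string token) {
        strings.Add(token);
        if (isString) return;
        double d;
        if (double.TryParse(token, NumberStyles.Float, CultureInfo.InvariantCulture, out d)) {
          doubles.Add(d);
        } else {
          // type conflict: fall back to the already-collected string tokens
          isString = true;
          doubles = null;
        }
      }

      public IList Values { get { return isString ? (IList)strings : doubles; } }
    }

    class Demo {
      static void Main() {
        var col = new RawColumn();
        foreach (var tok in new[] { "1.5", "2.5", "n/a" }) col.Add(tok);
        // prints: 3 values, string column: True
        Console.WriteLine(col.Values.Count + " values, string column: " + (col.Values is List<string>));
      }
    }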
     
    …

                    type = TokenTypeEnum.Double;
                    doubleVals[i] = doubleVal;
    -              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
    +              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue)
    +                && dateTimeValue.Year > 1 && dateTimeValue.Month > 1 && dateTimeValue.Day > 1 // if no date is given it is returned as 1.1.0001 -> don't allow this
    +                ) {
                    type = TokenTypeEnum.DateTime;
                    dateTimeVals[i] = dateTimeValue;
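The added guard relies on DateTimeStyles.NoCurrentDateDefault: when a token contains only a time, the missing date components default to 0001-01-01 instead of the current date, so the Year/Month/Day check rejects time-only tokens as dates. A small standalone check of that behavior (the invariant culture is used here just for the example; the parser passes its own dateTimeFormatInfo):

    using System;
    using System.Globalization;

    class NoCurrentDateDefaultDemo {
      static void Main() {
        DateTime value;
        // Time-only token: with NoCurrentDateDefault the date part defaults to 0001-01-01.
        DateTime.TryParse("17:30", CultureInfo.InvariantCulture, DateTimeStyles.NoCurrentDateDefault, out value);
        Console.WriteLine(value.ToString("yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture)); // 0001-01-01 17:30
        bool hasRealDate = value.Year > 1 && value.Month > 1 && value.Day > 1;
        Console.WriteLine(hasRealDate); // False -> the token is not classified as a DateTime column
      }
    }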
     
    …

         private void Error(string message, string token, int lineNumber) {
    -      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    +      throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber));
         }
         #endregion
    -
    -    [Serializable]
    -    public class DataFormatException : Exception {
    -      private int line;
    -      public int Line {
    -        get { return line; }
    -      }
    -      private string token;
    -      public string Token {
    -        get { return token; }
    -      }
    -      public DataFormatException(string message, string token, int line)
    -        : base(message + "\nToken: " + token + " (line: " + line + ")") {
    -        this.token = token;
    -        this.line = line;
    -      }
    -
    -      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
    -    }
       }
     }