
Timestamp:
10/13/16 19:47:41 (8 years ago)
Author:
gkronber
Message:

#2650 Merged r14282:14322 from trunk to branch (fixing conflicts)

Location:
branches/symbreg-factors-2650
Files:
4 edited

Legend: unchanged lines have no prefix, added lines are prefixed with "+", removed lines with "-"; "…" marks elided lines.
  • branches/symbreg-factors-2650

  • branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis

  • branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/VariableNetworks/VariableNetwork.cs

    r14277 → r14330

     using HeuristicLab.Common;
     using HeuristicLab.Core;
    +using HeuristicLab.Problems.DataAnalysis;
     using HeuristicLab.Random;

    …

           int nl = xs.Length;
           int nRows = xs.First().Count;
    -      double[,] K = new double[nRows, nRows];
    -
    -      // sample length-scales
    +
    +      // sample u iid ~ N(0, 1)
    +      var u = Enumerable.Range(0, nRows).Select(_ => NormalDistributedRandom.NextDouble(random, 0, 1)).ToArray();
    +
    +      // sample actual length-scales
           var l = Enumerable.Range(0, nl)
             .Select(_ => random.NextDouble() * 2 + 0.5)
             .ToArray();
    -      // calculate covariance matrix
    +
    +      double[,] K = CalculateCovariance(xs, l);
    +
    +      // decompose
    +      alglib.trfac.spdmatrixcholesky(ref K, nRows, false);
    +
    +      // calc y = Lu
    +      var y = new double[u.Length];
    +      alglib.ablas.rmatrixmv(nRows, nRows, K, 0, 0, 0, u, 0, ref y, 0);
    +
    +      // calculate relevance by removing dimensions
    +      relevance = CalculateRelevance(y, u, xs, l);
    +
    +      // calculate variable relevance
    +      // as per Rasmussen and Williams "Gaussian Processes for Machine Learning" page 106:
    +      // ,,For the squared exponential covariance function [...] the l1, ..., lD hyperparameters
    +      // play the role of characteristic length scales [...]. Such a covariance function implements
    +      // automatic relevance determination (ARD) [Neal, 1996], since the inverse of the length-scale
    +      // determines how relevant an input is: if the length-scale has a very large value, the covariance
    +      // will become almost independent of that input, effectively removing it from inference.''
    +      // relevance = l.Select(li => 1.0 / li).ToArray();
    +
    +      return y;
    +    }
    +
    +    // calculate variable relevance based on removal of variables
    +    //  1) to remove a variable we set it's length scale to infinity (no relation of the variable value to the target)
    +    //  2) calculate MSE of the original target values (y) to the updated targes y' (after variable removal)
    +    //  3) relevance is larger if MSE(y,y') is large
    +    //  4) scale impacts so that the most important variable has impact = 1
    +    private double[] CalculateRelevance(double[] y, double[] u, List<double>[] xs, double[] l) {
    +      int nRows = xs.First().Count;
    +      var changedL = new double[l.Length];
    +      var relevance = new double[l.Length];
    +      for (int i = 0; i < l.Length; i++) {
    +        Array.Copy(l, changedL, changedL.Length);
    +        changedL[i] = double.MaxValue;
    +        var changedK = CalculateCovariance(xs, changedL);
    +
    +        var yChanged = new double[u.Length];
    +        alglib.ablas.rmatrixmv(nRows, nRows, changedK, 0, 0, 0, u, 0, ref yChanged, 0);
    +
    +        OnlineCalculatorError error;
    +        var mse = OnlineMeanSquaredErrorCalculator.Calculate(y, yChanged, out error);
    +        if (error != OnlineCalculatorError.None) mse = double.MaxValue;
    +        relevance[i] = mse;
    +      }
    +      // scale so that max relevance is 1.0
    +      var maxRel = relevance.Max();
    +      for (int i = 0; i < relevance.Length; i++) relevance[i] /= maxRel;
    +      return relevance;
    +    }
    +
    +    private double[,] CalculateCovariance(List<double>[] xs, double[] l) {
    +      int nRows = xs.First().Count;
    +      double[,] K = new double[nRows, nRows];
           for (int r = 0; r < nRows; r++) {
             double[] xi = xs.Select(x => x[r]).ToArray();

    …

             }
           }
    -
           // add a small diagonal matrix for numeric stability
           for (int i = 0; i < nRows; i++) {

    …

           }

    -      // decompose
    -      alglib.trfac.spdmatrixcholesky(ref K, nRows, false);
    -
    -      // sample u iid ~ N(0, 1)
    -      var u = Enumerable.Range(0, nRows).Select(_ => NormalDistributedRandom.NextDouble(random, 0, 1)).ToArray();
    -
    -      // calc y = Lu
    -      var y = new double[u.Length];
    -      alglib.ablas.rmatrixmv(nRows, nRows, K, 0, 0, 0, u, 0, ref y, 0);
    -
    -      // calculate variable relevance
    -      // as per Rasmussen and Williams "Gaussian Processes for Machine Learning" page 106:
    -      // ,,For the squared exponential covariance function [...] the l1, ..., lD hyperparameters
    -      // play the role of characteristic length scales [...]. Such a covariance function implements
    -      // automatic relevance determination (ARD) [Neal, 1996], since the inverse of the length-scale
    -      // determines how relevant an input is: if the length-scale has a very large value, the covariance
    -      // will become almost independent of that input, effectively removing it from inference.''
    -      relevance = l.Select(li => 1.0 / li).ToArray();
    -
    -      return y;
    +      return K;
         }
       }
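For orientation, this is what the restructured sampling code is intended to compute, per its own comments. The covariance entries are filled in inside the elided loop of CalculateCovariance; a squared exponential ARD kernel is assumed here, consistent with the quoted Rasmussen and Williams passage. With D inputs, x_{r,d} the value of input d in row r, and sampled length-scales l_1, …, l_D:

\[
k(x_r, x_s) = \exp\!\Big(-\tfrac{1}{2}\sum_{d=1}^{D}\frac{(x_{r,d}-x_{s,d})^2}{l_d^2}\Big),
\qquad K = LL^{\top},
\qquad u \sim \mathcal{N}(0, I),
\qquad y = Lu \sim \mathcal{N}(0, K).
\]

The new CalculateRelevance then scores input d by how much the sampled target changes when that input is effectively removed (its length-scale pushed to double.MaxValue), normalized so the most relevant input scores 1:

\[
\mathrm{relevance}_d = \frac{\mathrm{MSE}\big(y,\, y^{(d)}\big)}{\max_{j}\,\mathrm{MSE}\big(y,\, y^{(j)}\big)},
\qquad y^{(d)} = L^{(d)}u \ \text{with}\ l_d \to \infty.
\]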
  • branches/symbreg-factors-2650/HeuristicLab.Problems.Instances.DataAnalysis/3.3/TableFileParser.cs

    r14185 → r14330

         /// <param name="columnNamesInFirstLine"></param>
         public void Parse(Stream stream, NumberFormatInfo numberFormat, DateTimeFormatInfo dateTimeFormatInfo, char separator, bool columnNamesInFirstLine, int lineLimit = -1) {
    -      using (StreamReader reader = new StreamReader(stream, Encoding)) {
    +      if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
    +
    +      using (var reader = new StreamReader(stream)) {
             tokenizer = new Tokenizer(reader, numberFormat, dateTimeFormatInfo, separator);
    +        var strValues = new List<List<string>>();
             values = new List<IList>();
    -        if (lineLimit > 0) estimatedNumberOfLines = lineLimit;
    -
    -        if (columnNamesInFirstLine) {
    -          ParseVariableNames();
    -          if (!tokenizer.HasNext())
    -            Error(
    -              "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
    -              "", tokenizer.CurrentLineNumber);
    -        }
    -
    -        // read values... start in first row
    +        Prepare(columnNamesInFirstLine, strValues);
    +
             int nLinesParsed = 0;
             int colIdx = 0;
    -        int numValuesInFirstRow = columnNamesInFirstLine ? variableNames.Count : -1; // number of variables or inizialize based on first row of values (-1)
             while (tokenizer.HasNext() && (lineLimit < 0 || nLinesParsed < lineLimit)) {
               if (tokenizer.PeekType() == TokenTypeEnum.NewLine) {

    …

                 // all rows have to have the same number of values
    -            // the first row defines how many samples are needed
    -            if (numValuesInFirstRow < 0) numValuesInFirstRow = values.Count; // set to number of colums in the first row
    -            else if (colIdx > 0 && numValuesInFirstRow != colIdx) { // read at least one value in the row (support for skipping empty lines)
    -              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
    +            // the first row defines how many elements are needed
    +            if (colIdx > 0 && values.Count != colIdx) {
    +              // read at least one value in the row (support for skipping empty lines)
    +              Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
                         "Line " + tokenizer.CurrentLineNumber + " has " + colIdx + " columns.", "",
    -                    tokenizer.CurrentLineNumber);
    +                tokenizer.CurrentLineNumber);
                 }
                 OnReport(tokenizer.BytesRead);

    …

               } else {
                 // read one value
    -            TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
    +            TokenTypeEnum type;
    +            string strVal;
    +            double dblVal;
    +            DateTime dateTimeVal;
                 tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);

    -            // initialize columns on the first row (fixing data types as presented in the first row...)
    -            if (nLinesParsed == 0) {
    -              values.Add(CreateList(type, estimatedNumberOfLines));
    -            } else if (colIdx == values.Count) {
    -              Error("The first row of the dataset has " + numValuesInFirstRow + " columns." + Environment.NewLine +
    +            if (colIdx == values.Count) {
    +              Error("The first row of the dataset has " + values.Count + " columns." + Environment.NewLine +
                         "Line " + tokenizer.CurrentLineNumber + " has more columns.", "",
                     tokenizer.CurrentLineNumber);
                 }
                 if (!IsColumnTypeCompatible(values[colIdx], type)) {
    -              values[colIdx] = ConvertToStringColumn(values[colIdx]);
    +              values[colIdx] = strValues[colIdx];
                 }
    +
                 // add the value to the column
    -            AddValue(type, values[colIdx++], strVal, dblVal, dateTimeVal);
    +            AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
    +            if (!(values[colIdx] is List<string>)) { // optimization: don't store the string values in another list if the column is list<string>
    +              strValues[colIdx].Add(strVal);
    +            }
    +            colIdx++;
               }
             }
    -
    -        if (!values.Any() || values.First().Count == 0)
    -          Error("Couldn't parse data values. Probably because of incorrect number format " +
    -                "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);
    -      }
    +      }
    +
    +      if (!values.Any() || values.First().Count == 0)
    +        Error("Couldn't parse data values. Probably because of incorrect number format " +
    +              "(the parser expects english number format with a '.' as decimal separator).", "", tokenizer.CurrentLineNumber);

           this.rows = values.First().Count;

    …

           // for large files we created a lot of memory pressure, cannot hurt to run GC.Collect here (TableFileParser is called seldomly on user interaction)
           GC.Collect(2, GCCollectionMode.Forced);
    +    }
    +
    +    private void Prepare(bool columnNamesInFirstLine, List<List<string>> strValues) {
    +      if (columnNamesInFirstLine) {
    +        ParseVariableNames();
    +        if (!tokenizer.HasNext())
    +          Error(
    +            "Couldn't parse data values. Probably because of incorrect number format (the parser expects english number format with a '.' as decimal separator).",
    +            "", tokenizer.CurrentLineNumber);
    +      }
    +      // read first line to determine types and allocate specific lists
    +      // read values... start in first row
    +      int colIdx = 0;
    +      while (tokenizer.PeekType() != TokenTypeEnum.NewLine) {
    +        // read one value
    +        TokenTypeEnum type; string strVal; double dblVal; DateTime dateTimeVal;
    +        tokenizer.Next(out type, out strVal, out dblVal, out dateTimeVal);
    +
    +        // initialize column
    +        values.Add(CreateList(type, estimatedNumberOfLines));
    +        if (type == TokenTypeEnum.String)
    +          strValues.Add(new List<string>(0)); // optimization: don't store the string values in another list if the column is list<string>
    +        else
    +          strValues.Add(new List<string>(estimatedNumberOfLines));
    +
    +        AddValue(type, values[colIdx], strVal, dblVal, dateTimeVal);
    +        if (type != TokenTypeEnum.String)
    +          strValues[colIdx].Add(strVal);
    +        colIdx++;
    +      }
    +      tokenizer.Skip(); // skip newline
         }
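The new strValues bookkeeping exists so that a column whose inferred type breaks on a later row (when IsColumnTypeCompatible fails) can be swapped to its already-collected string representation without re-reading the file. A minimal, hypothetical sketch of that idea follows; the names here (RawColumn, Add, Values) are invented for illustration and are not part of the HeuristicLab TableFileParser API:

    using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.Globalization;

    // One CSV column that keeps the raw string tokens alongside the typed values.
    class RawColumn {
      private List<double> doubles = new List<double>();           // typed values while the inferred type still holds
      private readonly List<string> strings = new List<string>();  // raw tokens kept in parallel

      private bool isString; // set once a token fails to parse as double

      public void Add(string token) {
        strings.Add(token);
        if (isString) return;
        double d;
        if (double.TryParse(token, NumberStyles.Float, CultureInfo.InvariantCulture, out d)) {
          doubles.Add(d);
        } else {
          // type conflict: fall back to the already-collected string tokens
          isString = true;
          doubles = null;
        }
      }

      public IList Values { get { return isString ? (IList)strings : doubles; } }
    }

    class Demo {
      static void Main() {
        var col = new RawColumn();
        foreach (var tok in new[] { "1.5", "2.5", "n/a" }) col.Add(tok);
        // prints: 3 values, string column: True
        Console.WriteLine(col.Values.Count + " values, string column: " + (col.Values is List<string>));
      }
    }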
     
    …

                    type = TokenTypeEnum.Double;
                    doubleVals[i] = doubleVal;
    -              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.None, out dateTimeValue)) {
    +              } else if (DateTime.TryParse(tok, dateTimeFormatInfo, DateTimeStyles.NoCurrentDateDefault, out dateTimeValue)
    +                && dateTimeValue.Year > 1 && dateTimeValue.Month > 1 && dateTimeValue.Day > 1 // if no date is given it is returned as 1.1.0001 -> don't allow this
    +                ) {
                    type = TokenTypeEnum.DateTime;
                    dateTimeVals[i] = dateTimeValue;
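The added guard relies on DateTimeStyles.NoCurrentDateDefault: when a token contains only a time, the missing date components default to 0001-01-01 instead of the current date, so the Year/Month/Day check rejects time-only tokens as dates. A small standalone check of that behavior (the invariant culture is used here just for the example; the parser passes its own dateTimeFormatInfo):

    using System;
    using System.Globalization;

    class NoCurrentDateDefaultDemo {
      static void Main() {
        DateTime value;
        // Time-only token: with NoCurrentDateDefault the date part defaults to 0001-01-01.
        DateTime.TryParse("17:30", CultureInfo.InvariantCulture, DateTimeStyles.NoCurrentDateDefault, out value);
        Console.WriteLine(value.ToString("yyyy-MM-dd HH:mm", CultureInfo.InvariantCulture)); // 0001-01-01 17:30
        bool hasRealDate = value.Year > 1 && value.Month > 1 && value.Day > 1;
        Console.WriteLine(hasRealDate); // False -> the token is not classified as a DateTime column
      }
    }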
     
    …

         private void Error(string message, string token, int lineNumber) {
    -      throw new DataFormatException("Error while parsing.\n" + message, token, lineNumber);
    +      throw new IOException(string.Format("Error while parsing. {0} (token: {1} lineNumber: {2}).", message, token, lineNumber));
         }
         #endregion
    -
    -    [Serializable]
    -    public class DataFormatException : Exception {
    -      private int line;
    -      public int Line {
    -        get { return line; }
    -      }
    -      private string token;
    -      public string Token {
    -        get { return token; }
    -      }
    -      public DataFormatException(string message, string token, int line)
    -        : base(message + "\nToken: " + token + " (line: " + line + ")") {
    -        this.token = token;
    -        this.line = line;
    -      }
    -
    -      public DataFormatException(SerializationInfo info, StreamingContext context) : base(info, context) { }
    -    }
       }
     }