Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/VariableNetworks/VariableNetwork.cs @ 14291

Last change on this file since 14291 was 14291, checked in by gkronber, 8 years ago

#2660: changed calculation of variable relevance values for variable interaction networks based on sampling Gaussian processes.
Instead of taking inverse length scale (ARD). We calculate the deviation from the original function (y) after individual variables are removed (y').

File size: 12.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.Linq;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Problems.DataAnalysis;
29using HeuristicLab.Random;
30
31namespace HeuristicLab.Problems.Instances.DataAnalysis {
32  public class VariableNetwork : ArtificialRegressionDataDescriptor {
33    private int nTrainingSamples;
34    private int nTestSamples;
35
36    private int numberOfFeatures;
37    private double noiseRatio;
38    private IRandom random;
39
40    public override string Name { get { return string.Format("VariableNetwork-{0:0%} ({1} dim)", noiseRatio, numberOfFeatures); } }
41    private string networkDefinition;
42    public string NetworkDefinition { get { return networkDefinition; } }
43    public override string Description {
44      get {
45        return "The data are generated specifically to test methods for variable network analysis.";
46      }
47    }
48
49    public VariableNetwork(int numberOfFeatures, double noiseRatio,
50      IRandom rand)
51      : this(250, 250, numberOfFeatures, noiseRatio, rand) { }
52
53    public VariableNetwork(int nTrainingSamples, int nTestSamples,
54      int numberOfFeatures, double noiseRatio, IRandom rand) {
55      this.nTrainingSamples = nTrainingSamples;
56      this.nTestSamples = nTestSamples;
57      this.noiseRatio = noiseRatio;
58      this.random = rand;
59      this.numberOfFeatures = numberOfFeatures;
60      // default variable names
61      variableNames = Enumerable.Range(1, numberOfFeatures)
62        .Select(i => string.Format("X{0:000}", i))
63        .ToArray();
64
65      variableRelevances = new Dictionary<string, IEnumerable<KeyValuePair<string, double>>>();
66    }
67
68    private string[] variableNames;
69    protected override string[] VariableNames {
70      get {
71        return variableNames;
72      }
73    }
74
75    // there is no specific target variable in variable network analysis but we still need to specify one
76    protected override string TargetVariable { get { return VariableNames.Last(); } }
77
78    protected override string[] AllowedInputVariables {
79      get {
80        return VariableNames.Take(numberOfFeatures - 1).ToArray();
81      }
82    }
83
84    protected override int TrainingPartitionStart { get { return 0; } }
85    protected override int TrainingPartitionEnd { get { return nTrainingSamples; } }
86    protected override int TestPartitionStart { get { return nTrainingSamples; } }
87    protected override int TestPartitionEnd { get { return nTrainingSamples + nTestSamples; } }
88
89    private Dictionary<string, IEnumerable<KeyValuePair<string, double>>> variableRelevances;
90    public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance(string targetVar) {
91      return variableRelevances[targetVar];
92    }
93
94    protected override List<List<double>> GenerateValues() {
95      // variable names are shuffled in the beginning (and sorted at the end)
96      variableNames = variableNames.Shuffle(random).ToArray();
97
98      // a third of all variables are independent vars
99      List<List<double>> lvl0 = new List<List<double>>();
100      int numLvl0 = (int)Math.Ceiling(numberOfFeatures * 0.33);
101
102      List<string> description = new List<string>(); // store information how the variable is actually produced
103      List<string[]> inputVarNames = new List<string[]>(); // store information to produce graphviz file
104      List<double[]> relevances = new List<double[]>(); // stores variable relevance information (same order as given in inputVarNames)
105
106      var nrand = new NormalDistributedRandom(random, 0, 1);
107      for (int c = 0; c < numLvl0; c++) {
108        inputVarNames.Add(new string[] { });
109        relevances.Add(new double[] { });
110        description.Add(" ~ N(0, 1)");
111        lvl0.Add(Enumerable.Range(0, TestPartitionEnd).Select(_ => nrand.NextDouble()).ToList());
112      }
113
114      // lvl1 contains variables which are functions of vars in lvl0 (+ noise)
115      int numLvl1 = (int)Math.Ceiling(numberOfFeatures * 0.33);
116      List<List<double>> lvl1 = CreateVariables(lvl0, numLvl1, inputVarNames, description, relevances);
117
118      // lvl2 contains variables which are functions of vars in lvl0 and lvl1 (+ noise)
119      int numLvl2 = (int)Math.Ceiling(numberOfFeatures * 0.2);
120      List<List<double>> lvl2 = CreateVariables(lvl0.Concat(lvl1).ToList(), numLvl2, inputVarNames, description, relevances);
121
122      // lvl3 contains variables which are functions of vars in lvl0, lvl1 and lvl2 (+ noise)
123      int numLvl3 = numberOfFeatures - numLvl0 - numLvl1 - numLvl2;
124      List<List<double>> lvl3 = CreateVariables(lvl0.Concat(lvl1).Concat(lvl2).ToList(), numLvl3, inputVarNames, description, relevances);
125
126      this.variableRelevances.Clear();
127      for (int i = 0; i < variableNames.Length; i++) {
128        var targetVarName = variableNames[i];
129        var targetRelevantInputs =
130          inputVarNames[i].Zip(relevances[i], (inputVar, rel) => new KeyValuePair<string, double>(inputVar, rel))
131            .ToArray();
132        variableRelevances.Add(targetVarName, targetRelevantInputs);
133      }
134
135      networkDefinition = string.Join(Environment.NewLine, variableNames.Zip(description, (n, d) => n + d).OrderBy(x => x));
136      // for graphviz
137      networkDefinition += Environment.NewLine + "digraph G {";
138      for (int i = 0; i < variableNames.Length; i++) {
139        var name = variableNames[i];
140        var selectedVarNames = inputVarNames[i];
141        var selectedRelevances = relevances[i];
142        for (int j = 0; j < selectedVarNames.Length; j++) {
143          var selectedVarName = selectedVarNames[j];
144          var selectedRelevance = selectedRelevances[j];
145          networkDefinition += Environment.NewLine + selectedVarName + " -> " + name +
146            string.Format(CultureInfo.InvariantCulture, " [label={0:N3}]", selectedRelevance);
147        }
148      }
149      networkDefinition += Environment.NewLine + "}";
150
151      // return a random permutation of all variables (to mix lvl0, lvl1, ... variables)
152      var allVars = lvl0.Concat(lvl1).Concat(lvl2).Concat(lvl3).ToList();
153      var orderedVars = allVars.Zip(variableNames, Tuple.Create).OrderBy(t => t.Item2).Select(t => t.Item1).ToList();
154      variableNames = variableNames.OrderBy(n => n).ToArray();
155      return orderedVars;
156    }
157
158    private List<List<double>> CreateVariables(List<List<double>> allowedInputs, int numVars, List<string[]> inputVarNames, List<string> description, List<double[]> relevances) {
159      var res = new List<List<double>>();
160      for (int c = 0; c < numVars; c++) {
161        string[] selectedVarNames;
162        double[] relevance;
163        var x = GenerateRandomFunction(random, allowedInputs, out selectedVarNames, out relevance);
164        var sigma = x.StandardDeviation();
165        var noisePrng = new NormalDistributedRandom(random, 0, sigma * Math.Sqrt(noiseRatio / (1.0 - noiseRatio)));
166        res.Add(x.Select(t => t + noisePrng.NextDouble()).ToList());
167        Array.Sort(selectedVarNames, relevance);
168        inputVarNames.Add(selectedVarNames);
169        relevances.Add(relevance);
170        var desc = string.Format("f({0})", string.Join(",", selectedVarNames));
171        // for the relevance information order variables by decreasing relevance
172        var relevanceStr = string.Join(", ",
173          selectedVarNames.Zip(relevance, Tuple.Create)
174          .OrderByDescending(t => t.Item2)
175          .Select(t => string.Format(CultureInfo.InvariantCulture, "{0}: {1:N3}", t.Item1, t.Item2)));
176        description.Add(string.Format(" ~ N({0}, {1:N3}) [Relevances: {2}]", desc, noisePrng.Sigma, relevanceStr));
177      }
178      return res;
179    }
180
181    // sample the input variables that are actually used and sample from a Gaussian process
182    private IEnumerable<double> GenerateRandomFunction(IRandom rand, List<List<double>> xs, out string[] selectedVarNames, out double[] relevance) {
183      double r = -Math.Log(1.0 - rand.NextDouble()) * 2.0; // r is exponentially distributed with lambda = 2
184      int nl = (int)Math.Floor(1.5 + r); // number of selected vars is likely to be between three and four
185      if (nl > xs.Count) nl = xs.Count; // limit max
186
187      var selectedIdx = Enumerable.Range(0, xs.Count).Shuffle(random)
188        .Take(nl).ToArray();
189
190      var selectedVars = selectedIdx.Select(i => xs[i]).ToArray();
191      selectedVarNames = selectedIdx.Select(i => VariableNames[i]).ToArray();
192      return SampleGaussianProcess(random, selectedVars, out relevance);
193    }
194
195    private IEnumerable<double> SampleGaussianProcess(IRandom random, List<double>[] xs, out double[] relevance) {
196      int nl = xs.Length;
197      int nRows = xs.First().Count;
198
199      // sample u iid ~ N(0, 1)
200      var u = Enumerable.Range(0, nRows).Select(_ => NormalDistributedRandom.NextDouble(random, 0, 1)).ToArray();
201
202      // sample actual length-scales
203      var l = Enumerable.Range(0, nl)
204        .Select(_ => random.NextDouble() * 2 + 0.5)
205        .ToArray();
206
207      double[,] K = CalculateCovariance(xs, l);
208
209      // decompose
210      alglib.trfac.spdmatrixcholesky(ref K, nRows, false);
211
212
213      // calc y = Lu
214      var y = new double[u.Length];
215      alglib.ablas.rmatrixmv(nRows, nRows, K, 0, 0, 0, u, 0, ref y, 0);
216
217      // calculate relevance by removing dimensions
218      relevance = CalculateRelevance(y, u, xs, l);
219
220
221      // calculate variable relevance
222      // as per Rasmussen and Williams "Gaussian Processes for Machine Learning" page 106:
223      // ,,For the squared exponential covariance function [...] the l1, ..., lD hyperparameters
224      // play the role of characteristic length scales [...]. Such a covariance function implements
225      // automatic relevance determination (ARD) [Neal, 1996], since the inverse of the length-scale
226      // determines how relevant an input is: if the length-scale has a very large value, the covariance
227      // will become almost independent of that input, effectively removing it from inference.''
228      // relevance = l.Select(li => 1.0 / li).ToArray();
229
230      return y;
231    }
232
233    // calculate variable relevance based on removal of variables
234    //  1) to remove a variable we set it's length scale to infinity (no relation of the variable value to the target)
235    //  2) calculate MSE of the original target values (y) to the updated targes y' (after variable removal)
236    //  3) relevance is larger if MSE(y,y') is large
237    //  4) scale impacts so that the most important variable has impact = 1
238    private double[] CalculateRelevance(double[] y, double[] u, List<double>[] xs, double[] l) {
239      int nRows = xs.First().Count;
240      var changedL = new double[l.Length];
241      var relevance = new double[l.Length];
242      for (int i = 0; i < l.Length; i++) {
243        Array.Copy(l, changedL, changedL.Length);
244        changedL[i] = double.MaxValue;
245        var changedK = CalculateCovariance(xs, changedL);
246
247        var yChanged = new double[u.Length];
248        alglib.ablas.rmatrixmv(nRows, nRows, changedK, 0, 0, 0, u, 0, ref yChanged, 0);
249
250        OnlineCalculatorError error;
251        var mse = OnlineMeanSquaredErrorCalculator.Calculate(y, yChanged, out error);
252        if (error != OnlineCalculatorError.None) mse = double.MaxValue;
253        relevance[i] = mse;
254      }
255      // scale so that max relevance is 1.0
256      var maxRel = relevance.Max();
257      for (int i = 0; i < relevance.Length; i++) relevance[i] /= maxRel;
258      return relevance;
259    }
260
261    private double[,] CalculateCovariance(List<double>[] xs, double[] l) {
262      int nRows = xs.First().Count;
263      double[,] K = new double[nRows, nRows];
264      for (int r = 0; r < nRows; r++) {
265        double[] xi = xs.Select(x => x[r]).ToArray();
266        for (int c = 0; c <= r; c++) {
267          double[] xj = xs.Select(x => x[c]).ToArray();
268          double dSqr = xi.Zip(xj, (xik, xjk) => (xik - xjk))
269            .Select(dk => dk * dk)
270            .Zip(l, (dk, lk) => dk / lk)
271            .Sum();
272          K[r, c] = Math.Exp(-dSqr);
273        }
274      }
275      // add a small diagonal matrix for numeric stability
276      for (int i = 0; i < nRows; i++) {
277        K[i, i] += 1.0E-7;
278      }
279
280      return K;
281    }
282  }
283}
Note: See TracBrowser for help on using the repository browser.