Changeset 15195


Ignore:
Timestamp:
07/11/17 12:55:52 (12 days ago)
Author:
gkronber
Message:

#2660: merged r14260,r14271,r14291,r14623,r14630,r15194 from trunk to stable

Location:
stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3
Files:
4 edited
2 copied

Legend:

Unmodified
Added
Removed
  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/HeuristicLab.Problems.Instances.DataAnalysis-3.3.csproj

    r14305 r15195  
    157157    <Compile Include="Regression\FeatureSelection\FeatureSelection.cs" />
    158158    <Compile Include="Regression\FeatureSelection\FeatureSelectionInstanceProvider.cs" />
     159    <Compile Include="Regression\VariableNetworks\LinearVariableNetwork.cs" />
     160    <Compile Include="Regression\VariableNetworks\GaussianProcessVariableNetwork.cs" />
    159161    <Compile Include="Regression\VariableNetworks\VariableNetwork.cs" />
    160162    <Compile Include="Regression\VariableNetworks\VariableNetworkInstanceProvider.cs" />
  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/Friedman/FriedmanRandomFunction.cs

    r14186 r15195  
    9494    }
    9595
    96     // as described in Greedy Function Approxination paper
     96    // as described in Greedy Function Approximation paper
    9797    private IEnumerable<double> GenerateRandomFunction(IRandom rand, List<List<double>> xs, int nTerms = 20) {
    9898      int nRows = xs.First().Count;
  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/VariableNetworks/VariableNetwork.cs

    r14186 r15195  
    2222using System;
    2323using System.Collections.Generic;
     24using System.Globalization;
    2425using System.Linq;
    2526using HeuristicLab.Common;
    2627using HeuristicLab.Core;
     28using HeuristicLab.Problems.DataAnalysis;
    2729using HeuristicLab.Random;
    2830
    2931namespace HeuristicLab.Problems.Instances.DataAnalysis {
    30   public class VariableNetwork : ArtificialRegressionDataDescriptor {
     32  public abstract class VariableNetwork : ArtificialRegressionDataDescriptor {
    3133    private int nTrainingSamples;
    3234    private int nTestSamples;
     
    3638    private IRandom random;
    3739
    38     public override string Name { get { return string.Format("VariableNetwork-{0:0%} ({1} dim)", noiseRatio, numberOfFeatures); } }
    3940    private string networkDefinition;
    4041    public string NetworkDefinition { get { return networkDefinition; } }
     
    4546    }
    4647
    47     public VariableNetwork(int numberOfFeatures, double noiseRatio,
    48       IRandom rand)
    49       : this(250, 250, numberOfFeatures, noiseRatio, rand) { }
    50 
    51     public VariableNetwork(int nTrainingSamples, int nTestSamples,
     48    protected VariableNetwork(int nTrainingSamples, int nTestSamples,
    5249      int numberOfFeatures, double noiseRatio, IRandom rand) {
    5350      this.nTrainingSamples = nTrainingSamples;
     
    6057        .Select(i => string.Format("X{0:000}", i))
    6158        .ToArray();
     59
     60      variableRelevances = new Dictionary<string, IEnumerable<KeyValuePair<string, double>>>();
    6261    }
    6362
     
    8382    protected override int TestPartitionEnd { get { return nTrainingSamples + nTestSamples; } }
    8483
     84    private Dictionary<string, IEnumerable<KeyValuePair<string, double>>> variableRelevances;
     85    public IEnumerable<KeyValuePair<string, double>> GetVariableRelevance(string targetVar) {
     86      return variableRelevances[targetVar];
     87    }
    8588
    8689    protected override List<List<double>> GenerateValues() {
     
    9497      List<string> description = new List<string>(); // store information how the variable is actually produced
    9598      List<string[]> inputVarNames = new List<string[]>(); // store information to produce graphviz file
     99      List<double[]> relevances = new List<double[]>(); // stores variable relevance information (same order as given in inputVarNames)
    96100
    97101      var nrand = new NormalDistributedRandom(random, 0, 1);
    98       for (int c = 0; c < numLvl0; c++) {
    99         var datai = Enumerable.Range(0, TestPartitionEnd).Select(_ => nrand.NextDouble()).ToList();
     102      for(int c = 0; c < numLvl0; c++) {
    100103        inputVarNames.Add(new string[] { });
    101         description.Add("~ N(0, 1)");
    102         lvl0.Add(datai);
     104        relevances.Add(new double[] { });
     105        description.Add(" ~ N(0, 1 + noiseLvl)");
     106        // use same generation procedure for all variables
     107        var x = Enumerable.Range(0, TestPartitionEnd).Select(_ => nrand.NextDouble()).ToList();
     108        var sigma = x.StandardDeviationPop();
     109        var mean = x.Average();
     110        for(int i = 0; i < x.Count; i++) x[i] = (x[i] - mean) / sigma;
     111        var noisePrng = new NormalDistributedRandom(random, 0, Math.Sqrt(noiseRatio / (1.0 - noiseRatio)));
     112        lvl0.Add(x.Select(t => t + noisePrng.NextDouble()).ToList());
    103113      }
    104114
    105115      // lvl1 contains variables which are functions of vars in lvl0 (+ noise)
    106       List<List<double>> lvl1 = new List<List<double>>();
    107116      int numLvl1 = (int)Math.Ceiling(numberOfFeatures * 0.33);
    108       for (int c = 0; c < numLvl1; c++) {
    109         string[] selectedVarNames;
    110         var x = GenerateRandomFunction(random, lvl0, out selectedVarNames);
    111         var sigma = x.StandardDeviation();
    112         var noisePrng = new NormalDistributedRandom(random, 0, sigma * Math.Sqrt(noiseRatio / (1.0 - noiseRatio)));
    113         lvl1.Add(x.Select(t => t + noisePrng.NextDouble()).ToList());
     117      List<List<double>> lvl1 = CreateVariables(lvl0, numLvl1, inputVarNames, description, relevances);
    114118
    115         inputVarNames.Add(selectedVarNames);
    116         var desc = string.Format("f({0})", string.Join(",", selectedVarNames));
    117         description.Add(string.Format(" ~ N({0}, {1:N3})", desc, noisePrng.Sigma));
     119      // lvl2 contains variables which are functions of vars in lvl0 and lvl1 (+ noise)
     120      int numLvl2 = (int)Math.Ceiling(numberOfFeatures * 0.2);
     121      List<List<double>> lvl2 = CreateVariables(lvl0.Concat(lvl1).ToList(), numLvl2, inputVarNames, description, relevances);
     122
     123      // lvl3 contains variables which are functions of vars in lvl0, lvl1 and lvl2 (+ noise)
     124      int numLvl3 = numberOfFeatures - numLvl0 - numLvl1 - numLvl2;
     125      List<List<double>> lvl3 = CreateVariables(lvl0.Concat(lvl1).Concat(lvl2).ToList(), numLvl3, inputVarNames, description, relevances);
     126
     127      this.variableRelevances.Clear();
     128      for(int i = 0; i < variableNames.Length; i++) {
     129        var targetVarName = variableNames[i];
     130        var targetRelevantInputs =
     131          inputVarNames[i].Zip(relevances[i], (inputVar, rel) => new KeyValuePair<string, double>(inputVar, rel))
     132            .ToArray();
     133        variableRelevances.Add(targetVarName, targetRelevantInputs);
    118134      }
    119135
    120       // lvl2 contains variables which are functions of vars in lvl0 and lvl1 (+ noise)
    121       List<List<double>> lvl2 = new List<List<double>>();
    122       int numLvl2 = (int)Math.Ceiling(numberOfFeatures * 0.2);
    123       for (int c = 0; c < numLvl2; c++) {
    124         string[] selectedVarNames;
    125         var x = GenerateRandomFunction(random, lvl0.Concat(lvl1).ToList(), out selectedVarNames);
    126         var sigma = x.StandardDeviation();
    127         var noisePrng = new NormalDistributedRandom(random, 0, sigma * Math.Sqrt(noiseRatio / (1.0 - noiseRatio)));
    128         lvl2.Add(x.Select(t => t + noisePrng.NextDouble()).ToList());
    129 
    130         inputVarNames.Add(selectedVarNames);
    131         var desc = string.Format("f({0})", string.Join(",", selectedVarNames));
    132         description.Add(string.Format(" ~ N({0}, {1:N3})", desc, noisePrng.Sigma));
    133       }
    134 
    135       // lvl3 contains variables which are functions of vars in lvl0, lvl1 and lvl2 (+ noise)
    136       List<List<double>> lvl3 = new List<List<double>>();
    137       int numLvl3 = numberOfFeatures - numLvl0 - numLvl1 - numLvl2;
    138       for (int c = 0; c < numLvl3; c++) {
    139         string[] selectedVarNames;
    140         var x = GenerateRandomFunction(random, lvl0.Concat(lvl1).Concat(lvl2).ToList(), out selectedVarNames);
    141         var sigma = x.StandardDeviation();
    142         var noisePrng = new NormalDistributedRandom(random, 0, sigma * Math.Sqrt(noiseRatio / (1.0 - noiseRatio)));
    143         lvl3.Add(x.Select(t => t + noisePrng.NextDouble()).ToList());
    144 
    145         inputVarNames.Add(selectedVarNames);
    146         var desc = string.Format("f({0})", string.Join(",", selectedVarNames));
    147         description.Add(string.Format(" ~ N({0}, {1:N3})", desc, noisePrng.Sigma));
    148       }
    149 
    150       networkDefinition = string.Join(Environment.NewLine, variableNames.Zip(description, (n, d) => n + d));
     136      networkDefinition = string.Join(Environment.NewLine, variableNames.Zip(description, (n, d) => n + d).OrderBy(x => x));
    151137      // for graphviz
    152138      networkDefinition += Environment.NewLine + "digraph G {";
    153       foreach (var t in variableNames.Zip(inputVarNames, Tuple.Create).OrderBy(t => t.Item1)) {
    154         var name = t.Item1;
    155         var selectedVarNames = t.Item2;
    156         foreach (var selectedVarName in selectedVarNames) {
    157           networkDefinition += Environment.NewLine + selectedVarName + " -> " + name;
     139      for(int i = 0; i < variableNames.Length; i++) {
     140        var name = variableNames[i];
     141        var selectedVarNames = inputVarNames[i];
     142        var selectedRelevances = relevances[i];
     143        for(int j = 0; j < selectedVarNames.Length; j++) {
     144          var selectedVarName = selectedVarNames[j];
     145          var selectedRelevance = selectedRelevances[j];
     146          networkDefinition += Environment.NewLine + selectedVarName + " -> " + name +
     147            string.Format(CultureInfo.InvariantCulture, " [label={0:N3}]", selectedRelevance);
    158148        }
    159149      }
    160150      networkDefinition += Environment.NewLine + "}";
    161151
    162       // return a random permutation of all variables
     152      // return a random permutation of all variables (to mix lvl0, lvl1, ... variables)
    163153      var allVars = lvl0.Concat(lvl1).Concat(lvl2).Concat(lvl3).ToList();
    164154      var orderedVars = allVars.Zip(variableNames, Tuple.Create).OrderBy(t => t.Item2).Select(t => t.Item1).ToList();
     
    167157    }
    168158
    169     // sample the input variables that are actually used and sample from a Gaussian process
    170     private IEnumerable<double> GenerateRandomFunction(IRandom rand, List<List<double>> xs, out string[] selectedVarNames) {
     159    private List<List<double>> CreateVariables(List<List<double>> allowedInputs, int numVars, List<string[]> inputVarNames, List<string> description, List<double[]> relevances) {
     160      var newVariables = new List<List<double>>();
     161      for(int c = 0; c < numVars; c++) {
     162        string[] selectedVarNames;
     163        double[] relevance;
     164        var x = GenerateRandomFunction(random, allowedInputs, out selectedVarNames, out relevance).ToArray();
     165        // standardize x
     166        var sigma = x.StandardDeviation();
     167        var mean = x.Average();
     168        for(int i = 0; i < x.Length; i++) x[i] = (x[i] - mean) / sigma;
     169
     170        var noisePrng = new NormalDistributedRandom(random, 0, Math.Sqrt(noiseRatio / (1.0 - noiseRatio)));
     171        newVariables.Add(x.Select(t => t + noisePrng.NextDouble()).ToList());
     172        Array.Sort(selectedVarNames, relevance);
     173        inputVarNames.Add(selectedVarNames);
     174        relevances.Add(relevance);
     175        var desc = string.Format("f({0})", string.Join(",", selectedVarNames));
     176        // for the relevance information order variables by decreasing relevance
     177        var relevanceStr = string.Join(", ",
     178          selectedVarNames.Zip(relevance, Tuple.Create)
     179          .OrderByDescending(t => t.Item2)
     180          .Select(t => string.Format(CultureInfo.InvariantCulture, "{0}: {1:N3}", t.Item1, t.Item2)));
     181        description.Add(string.Format(" ~ N({0}, {1:N3}) [Relevances: {2}]", desc, noisePrng.Sigma, relevanceStr));
     182      }
     183      return newVariables;
     184    }
     185
     186    public int SampleNumberOfVariables(IRandom rand, int maxNumberOfVariables) {
    171187      double r = -Math.Log(1.0 - rand.NextDouble()) * 2.0; // r is exponentially distributed with lambda = 2
    172188      int nl = (int)Math.Floor(1.5 + r); // number of selected vars is likely to be between three and four
    173       if (nl > xs.Count) nl = xs.Count; // limit max
    174 
    175       var selectedIdx = Enumerable.Range(0, xs.Count).Shuffle(random)
    176         .Take(nl).ToArray();
    177 
    178       var selectedVars = selectedIdx.Select(i => xs[i]).ToArray();
    179       selectedVarNames = selectedIdx.Select(i => VariableNames[i]).ToArray();
    180       return SampleGaussianProcess(random, selectedVars);
     189      return Math.Min(maxNumberOfVariables, nl);
    181190    }
    182191
    183     private IEnumerable<double> SampleGaussianProcess(IRandom random, List<double>[] xs) {
    184       int nl = xs.Length;
    185       int nRows = xs.First().Count;
    186       double[,] K = new double[nRows, nRows];
    187 
    188       // sample length-scales
    189       var l = Enumerable.Range(0, nl)
    190         .Select(_ => random.NextDouble() * 2 + 0.5)
    191         .ToArray();
    192       // calculate covariance matrix
    193       for (int r = 0; r < nRows; r++) {
    194         double[] xi = xs.Select(x => x[r]).ToArray();
    195         for (int c = 0; c <= r; c++) {
    196           double[] xj = xs.Select(x => x[c]).ToArray();
    197           double dSqr = xi.Zip(xj, (xik, xjk) => (xik - xjk))
    198             .Select(dk => dk * dk)
    199             .Zip(l, (dk, lk) => dk / lk)
    200             .Sum();
    201           K[r, c] = Math.Exp(-dSqr);
    202         }
    203       }
    204 
    205       // add a small diagonal matrix for numeric stability
    206       for (int i = 0; i < nRows; i++) {
    207         K[i, i] += 1.0E-7;
    208       }
    209 
    210       // decompose
    211       alglib.trfac.spdmatrixcholesky(ref K, nRows, false);
    212 
    213       // sample u iid ~ N(0, 1)
    214       var u = Enumerable.Range(0, nRows).Select(_ => NormalDistributedRandom.NextDouble(random, 0, 1)).ToArray();
    215 
    216       // calc y = Lu
    217       var y = new double[u.Length];
    218       alglib.ablas.rmatrixmv(nRows, nRows, K, 0, 0, 0, u, 0, ref y, 0);
    219 
    220       return y;
    221     }
     192    // sample a random function and calculate the variable relevances
     193    protected abstract IEnumerable<double> GenerateRandomFunction(IRandom rand, List<List<double>> xs, out string[] selectedVarNames, out double[] relevance);
    222194  }
    223195}
  • stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/VariableNetworks/VariableNetworkInstanceProvider.cs

    r14305 r15195  
    3232    }
    3333    public override string Description {
    34       get { return "A set of regression benchmark instances for variable network analysis"; }
     34      get { return "A set of regression benchmark instances for variable network analysis. The data for these instances are randomly generated as described in the reference publication."; }
    3535    }
    3636    public override Uri WebLink {
     
    3838    }
    3939    public override string ReferencePublication {
    40       get { return ""; }
     40      get { return "G. Kronberger, B. Burlacu, M. Kommenda, S. Winkler, M. Affenzeller. Measures for the Evaluation and Comparison of Graphical Model Structures. to appear in Computer Aided Systems Theory - EUROCAST 2017, Springer 2018"; }
    4141    }
    4242    public int Seed { get; private set; }
     
    4949    public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
    5050      var numVariables = new int[] { 10, 20, 50, 100 };
    51       var noiseRatios = new double[] { 0.01, 0.05, 0.1 };
     51      var noiseRatios = new double[] { 0, 0.01, 0.05, 0.1, 0.2 };
    5252      var rand = new MersenneTwister((uint)Seed); // use fixed seed for deterministic problem generation
    53       return (from size in numVariables
    54               from noiseRatio in noiseRatios
    55               select new VariableNetwork(size, noiseRatio, new MersenneTwister((uint)rand.Next())))
    56               .Cast<IDataDescriptor>()
    57               .ToList();
     53      var lr = (from size in numVariables
     54                from noiseRatio in noiseRatios
     55                select new LinearVariableNetwork(size, noiseRatio, new MersenneTwister((uint)rand.Next())))
     56                .Cast<IDataDescriptor>()
     57                .ToList();
     58      var gp = (from size in numVariables
     59                from noiseRatio in noiseRatios
     60                select new GaussianProcessVariableNetwork(size, noiseRatio, new MersenneTwister((uint)rand.Next())))
     61                .Cast<IDataDescriptor>()
     62                .ToList();
     63      return lr.Concat(gp);
    5864    }
    5965
Note: See TracChangeset for help on using the changeset viewer.