Free cookie consent management tool by TermsFeed Policy Generator

source: stable/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/FeatureSelection/FeatureSelection.cs @ 11085

Last change on this file since 11085 was 10478, checked in by gkronber, 11 years ago

released bug fix for (#2137 The calculated stdev of the noise for feature selection problems is wrong)

File size: 6.3 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Common;
26using HeuristicLab.Core;
27using HeuristicLab.Random;
28
namespace HeuristicLab.Problems.Instances.DataAnalysis {
  /// <summary>
  /// Artificial regression instance for testing feature selection. The target "Y" is a
  /// weighted sum of a randomly chosen subset of the input columns plus normally
  /// distributed noise whose standard deviation is derived from <c>noiseRatio</c>.
  /// </summary>
  public class FeatureSelection : ArtificialRegressionDataDescriptor {
    private int nTrainingSamples;
    private int nTestSamples;

    private int numberOfFeatures;
    private double selectionProbability;
    private double noiseRatio;
    private IRandom xRandom;
    private IRandom weightRandom;

    /// <summary>
    /// Short identifier of the form "FeatSel-{features}-{selection probability}-{noise ratio}",
    /// with both ratios rendered as whole percentages.
    /// </summary>
    public override string Name { get { return string.Format("FeatSel-{0}-{1:0%}-{2:0%}", numberOfFeatures, selectionProbability, noiseRatio); } }

    /// <summary>
    /// Human-readable description of the instance, including sample counts, the number of
    /// features, the selection probability and the R² of an optimal model.
    /// </summary>
    public override string Description {
      get {
        return "This problem is specifically designed to test feature selection." + Environment.NewLine
               + "In this instance the number of rows for training (" + nTrainingSamples +
               ") is only slightly larger than the number of columns (" + numberOfFeatures +
               ") and only a subset of the columns must be selected for the predictive model." + Environment.NewLine
               + "The target variable is calculated as a noisy linear combination of randomly selected features: y = w * S + n." + Environment.NewLine
               + "Where is the S is a N x d matrix containing the selected columns from N x k the matrix of all features X" + Environment.NewLine
               // selectionProbability is a fraction (it is compared against NextDouble()),
               // so it has to be scaled by the percent format instead of getting a bare "%" appended.
               + "For each feature the probability that it is selected is " + string.Format("{0:0.##%}", selectionProbability) + Environment.NewLine
               + "X(i,j) ~ N(0, 1) iid, w(i) ~ U(0, 10) iid, n ~ N(0, sigma(w*S) * SQRT(" + noiseRatio / (1 - noiseRatio) + "))" + Environment.NewLine
               + "The noise level is " + noiseRatio + " * sigma, thus an optimal model has R² = "
               + Math.Round(optimalRSquared, 2) + " (or equivalently: NMSE = " + noiseRatio + ")" + Environment.NewLine
               + "N = " + (nTrainingSamples + nTestSamples) + " (" + nTrainingSamples + " training, " + nTestSamples + " test)" + Environment.NewLine
               + "k = " + numberOfFeatures;
      }
    }

    private double[] w;
    /// <summary>
    /// Weights drawn for the selected features, in the same order as
    /// <see cref="SelectedFeatures"/>. Null until <see cref="GenerateValues"/> has run.
    /// </summary>
    public double[] Weights {
      get { return w; }
    }

    private string[] selectedFeatures;
    /// <summary>
    /// Names of the input variables that contribute to the target.
    /// Null until <see cref="GenerateValues"/> has run.
    /// </summary>
    public string[] SelectedFeatures {
      get { return selectedFeatures; }
    }

    private double optimalRSquared;
    /// <summary>
    /// R² of a model that recovers the noise-free target exactly: <c>1 - noiseRatio</c>.
    /// </summary>
    public double OptimalRSquared {
      get { return optimalRSquared; }
    }


    /// <summary>
    /// Creates an instance with <c>round(1.2 * numberOfFeatures)</c> training rows and 5000 test rows.
    /// </summary>
    /// <param name="numberOfFeatures">Number of input columns.</param>
    /// <param name="selectionProbability">Probability in [0, 1] that a column contributes to the target.</param>
    /// <param name="noiseRatio">Share of the noisy target's variance that is noise; the noise sigma is
    /// computed with <c>noiseRatio / (1 - noiseRatio)</c>, so a value of 1 divides by zero.</param>
    /// <param name="xGenerator">Random source queried once per cell of the feature matrix.</param>
    /// <param name="weightGenerator">Random source queried once per selected feature.</param>
    public FeatureSelection(int numberOfFeatures, double selectionProbability, double noiseRatio, IRandom xGenerator, IRandom weightGenerator)
      : this((int)Math.Round(numberOfFeatures * 1.2), 5000, numberOfFeatures,
      selectionProbability, noiseRatio, xGenerator, weightGenerator) { }

    /// <summary>
    /// Creates an instance with explicit training and test partition sizes.
    /// </summary>
    /// <param name="nTrainingSamples">Number of rows in the training partition.</param>
    /// <param name="nTestSamples">Number of rows in the test partition.</param>
    /// <param name="numberOfFeatures">Number of input columns.</param>
    /// <param name="selectionProbability">Probability in [0, 1] that a column contributes to the target.</param>
    /// <param name="noiseRatio">Share of the noisy target's variance that is noise; the noise sigma is
    /// computed with <c>noiseRatio / (1 - noiseRatio)</c>, so a value of 1 divides by zero.</param>
    /// <param name="xGenerator">Random source queried once per cell of the feature matrix.</param>
    /// <param name="weightGenerator">Random source queried once per selected feature.</param>
    public FeatureSelection(int nTrainingSamples, int nTestSamples, int numberOfFeatures,
      double selectionProbability, double noiseRatio, IRandom xGenerator, IRandom weightGenerator) {
      this.numberOfFeatures = numberOfFeatures;
      this.nTrainingSamples = nTrainingSamples;
      this.nTestSamples = nTestSamples;
      this.selectionProbability = selectionProbability;
      this.noiseRatio = noiseRatio;
      this.xRandom = xGenerator;
      this.weightRandom = weightGenerator;
      // Depends only on noiseRatio, so it is set here; otherwise Description and
      // OptimalRSquared would report 0 until GenerateValues() has been called.
      this.optimalRSquared = 1 - noiseRatio;
    }

    protected override string TargetVariable { get { return "Y"; } }

    /// <summary>All input variable names followed by the target "Y".</summary>
    protected override string[] VariableNames {
      get { return AllowedInputVariables.Concat(new string[] { "Y" }).ToArray(); }
    }

    /// <summary>
    /// Input variable names "X001", "X002", ... up to <c>numberOfFeatures</c>.
    /// A new array is built on every access.
    /// </summary>
    protected override string[] AllowedInputVariables {
      get {
        return Enumerable.Range(1, numberOfFeatures)
          .Select(i => string.Format("X{0:000}", i))
          .ToArray();
      }
    }

    // Training rows come first, test rows directly after them.
    protected override int TrainingPartitionStart { get { return 0; } }
    protected override int TrainingPartitionEnd { get { return nTrainingSamples; } }
    protected override int TestPartitionStart { get { return nTrainingSamples; } }
    protected override int TestPartitionEnd { get { return nTrainingSamples + nTestSamples; } }


    /// <summary>
    /// Generates the data column by column: one list per input variable followed by the
    /// noisy target column. Also populates <see cref="Weights"/> and <see cref="SelectedFeatures"/>.
    /// </summary>
    /// <returns>The feature columns followed by the target column, each with
    /// <c>TestPartitionEnd</c> rows.</returns>
    protected override List<List<double>> GenerateValues() {
      // AllowedInputVariables allocates a new array per access, so read it only once.
      string[] inputVariables = AllowedInputVariables;
      // Every column has exactly this many rows; using it directly (instead of
      // data[0].Count) also keeps the method working when there are no feature columns.
      int rows = TestPartitionEnd;

      List<List<double>> data = new List<List<double>>(inputVariables.Length + 1);
      for (int i = 0; i < inputVariables.Length; i++) {
        data.Add(Enumerable.Range(0, rows)
          .Select(_ => xRandom.NextDouble())
          .ToList());
      }

      // NOTE(review): constructed without an explicit seed, so feature selection and noise
      // are presumably not reproducible across runs - confirm against MersenneTwister.
      var random = new MersenneTwister();
      var selectedIndices =
        Enumerable.Range(0, inputVariables.Length)
        .Where(_ => random.NextDouble() < selectionProbability)
        .ToArray();

      w = selectedIndices.Select(_ => weightRandom.NextDouble()).ToArray();

      // Noise-free target: weighted sum of the selected columns for every row.
      var target = new List<double>(rows);
      for (int i = 0; i < rows; i++) {
        var s = selectedIndices
          .Select(index => data[index][i])
          .ToArray();
        target.Add(ScalarProd(s, w));
      }

      // With sigma_noise = sigma_target * sqrt(r / (1 - r)) the noise variance is the
      // fraction r = noiseRatio of the variance of the noisy target.
      var targetSigma = target.StandardDeviation();
      var noisePrng = new NormalDistributedRandom(random, 0, targetSigma * Math.Sqrt(noiseRatio / (1.0 - noiseRatio)));

      data.Add(target.Select(t => t + noisePrng.NextDouble()).ToList());

      // set property listing the selected features as string[]
      this.selectedFeatures = selectedIndices.Select(i => inputVariables[i]).ToArray();
      return data;
    }

    /// <summary>Returns the dot product of two vectors of equal length.</summary>
    /// <exception cref="ArgumentException">The vectors differ in length.</exception>
    private static double ScalarProd(double[] s, double[] w) {
      if (s.Length != w.Length) throw new ArgumentException("Vectors must have the same length.");
      return s.Zip(w, (a, b) => a * b).Sum();
    }
  }
}
Note: See TracBrowser for help on using the repository browser.