Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/FeatureSelection/FeatureSelection.cs @ 9093

Last change on this file since 9093 was 9093, checked in by gkronber, 12 years ago

#1999 added a provider and a configurable problem instance for testing feature selection

File size: 5.3 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Common;
26using HeuristicLab.Random;
27
28namespace HeuristicLab.Problems.Instances.DataAnalysis {
29  public class FeatureSelection : ArtificialRegressionDataDescriptor {
30    private int trainingSamples;
31    private const int TestSamples = 5000;
32
33    private int numberOfFeatures;
34    private double selectionProbability;
35    private double noiseRatio;
36
37    public override string Name { get { return string.Format("FeatSel-{0}-{1:0%}-{2:0%}", numberOfFeatures, selectionProbability, noiseRatio); } }
38    public override string Description {
39      get {
40        return "This problem is specifically designed to test feature selection." + Environment.NewLine
41               + "In this instance the number of rows for training (" + trainingSamples +
42               ") is only slightly larger than the number of columns (" + numberOfFeatures +
43               ") and only a subset of the columns must be selected for the predictive model." + Environment.NewLine
44               +
45               "The target variable is calculated as a noisy linear combination of randomly selected features: y = w * S + n." +
46               Environment.NewLine
47               +
48               "Where is the S is a N x d matrix containing the selected columns from N x k the matrix of all features X" +
49               Environment.NewLine
50               + "For each feature the probability that it is selected is " + selectionProbability + "%" +
51               Environment.NewLine
52               + "X(i,j) ~ N(0, 1) iid, w(i) ~ U(0, 10) iid, n ~ N(0, sigma(w*S) * SQRT(" + noiseRatio + "))" +
53               Environment.NewLine
54               + "The noise level is " + noiseRatio + " * sigma, thus an optimal model has R² = " +
55               Math.Round(1 - noiseRatio, 2) + " (or equivalently: NMSE = " + noiseRatio + ")" + Environment.NewLine
56               + "N = " + (trainingSamples + TestSamples) + " (" + trainingSamples + " training, " + TestSamples +
57               " test)" + Environment.NewLine
58               + "k = " + numberOfFeatures;
59        ;
60      }
61    }
62
63    public FeatureSelection(int numberOfFeatures, double selectionProbability, double noiseRatio) {
64      this.numberOfFeatures = numberOfFeatures;
65      this.trainingSamples = (int)Math.Round(numberOfFeatures * 1.2); // 20% more rows than columns
66      this.selectionProbability = selectionProbability;
67      this.noiseRatio = noiseRatio;
68    }
69
70    protected override string TargetVariable { get { return "Y"; } }
71
72    protected override string[] VariableNames {
73      get { return AllowedInputVariables.Concat(new string[] { "Y" }).ToArray(); }
74    }
75
76    protected override string[] AllowedInputVariables {
77      get {
78        return Enumerable.Range(1, numberOfFeatures)
79          .Select(i => string.Format("X{0:000}", i))
80          .ToArray();
81      }
82    }
83    protected override int TrainingPartitionStart { get { return 0; } }
84    protected override int TrainingPartitionEnd { get { return trainingSamples; } }
85    protected override int TestPartitionStart { get { return trainingSamples; } }
86    protected override int TestPartitionEnd { get { return trainingSamples + TestSamples; } }
87
88    protected override List<List<double>> GenerateValues() {
89      List<List<double>> data = new List<List<double>>();
90      for (int i = 0; i < AllowedInputVariables.Count(); i++) {
91        data.Add(ValueGenerator.GenerateNormalDistributedValues(TestPartitionEnd, 0, 1).ToList());
92      }
93
94      var random = new MersenneTwister();
95      var selectedFeatures =
96        Enumerable.Range(0, AllowedInputVariables.Count())
97        .Where(_ => random.NextDouble() < selectionProbability)
98        .ToArray();
99      var w = ValueGenerator.GenerateUniformDistributedValues(selectedFeatures.Length, 0, 10)
100        .ToArray();
101      var target = new List<double>();
102      for (int i = 0; i < data[0].Count; i++) {
103        var s = selectedFeatures
104          .Select(index => data[index][i])
105          .ToArray();
106        target.Add(ScalarProd(s, w));
107      }
108      var targetSigma = target.StandardDeviation();
109      var noisePrng = new NormalDistributedRandom(random, 0, targetSigma * Math.Sqrt(noiseRatio));
110
111      data.Add(target.Select(t => t + noisePrng.NextDouble()).ToList());
112
113      return data;
114    }
115
116    private double ScalarProd(double[] s, double[] w) {
117      if (s.Length != w.Length) throw new ArgumentException();
118      return s.Zip(w, (a, b) => a * b).Sum();
119    }
120  }
121}
Note: See TracBrowser for help on using the repository browser.