
source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/FeatureSelection/FeatureSelection.cs @ 9094

Last change on this file since 9094 was 9094, checked in by gkronber, 12 years ago

#1999: formatting

File size: 5.2 KB
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Random;

namespace HeuristicLab.Problems.Instances.DataAnalysis {
  public class FeatureSelection : ArtificialRegressionDataDescriptor {
    private int trainingSamples;
    private const int TestSamples = 5000;

    private int numberOfFeatures;
    private double selectionProbability;
    private double noiseRatio;

    public override string Name { get { return string.Format("FeatSel-{0}-{1:0%}-{2:0%}", numberOfFeatures, selectionProbability, noiseRatio); } }
    public override string Description {
      get {
        return "This problem is specifically designed to test feature selection." + Environment.NewLine
               + "In this instance the number of rows for training (" + trainingSamples +
               ") is only slightly larger than the number of columns (" + numberOfFeatures +
               ") and only a subset of the columns must be selected for the predictive model." + Environment.NewLine
               + "The target variable is calculated as a noisy linear combination of randomly selected features: y = w * S + n." + Environment.NewLine
               + "S is an N x d matrix containing the selected columns of the N x k matrix of all features X." + Environment.NewLine
               + "Each feature is selected independently with probability " + string.Format("{0:0%}", selectionProbability) + "." + Environment.NewLine
               + "X(i,j) ~ N(0, 1) iid, w(i) ~ U(0, 10) iid, n ~ N(0, sigma(w*S) * SQRT(" + noiseRatio + "))" + Environment.NewLine
               + "The noise level is sigma(w*S) * SQRT(" + noiseRatio + "), thus an optimal model has R² = "
               + Math.Round(1 - noiseRatio, 2) + " (or equivalently: NMSE = " + noiseRatio + ")" + Environment.NewLine
               + "N = " + (trainingSamples + TestSamples) + " (" + trainingSamples + " training, " + TestSamples + " test)" + Environment.NewLine
               + "k = " + numberOfFeatures;
      }
    }
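
    // Worked example of the noise/accuracy relationship described above (values are
    // illustrative, not part of the original source): with noiseRatio = 0.05 the noise term
    // has std. dev. sigma(w*S) * sqrt(0.05), so an optimal model reaches R² = 1 - 0.05 = 0.95
    // and NMSE = 0.05; with numberOfFeatures = 100 the instance uses round(100 * 1.2) = 120
    // training rows and 5000 test rows.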

    public FeatureSelection(int numberOfFeatures, double selectionProbability, double noiseRatio) {
      this.numberOfFeatures = numberOfFeatures;
      this.trainingSamples = (int)Math.Round(numberOfFeatures * 1.2); // 20% more rows than columns
      this.selectionProbability = selectionProbability;
      this.noiseRatio = noiseRatio;
    }
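
    // Example usage (parameter values are illustrative, not from the original source):
    //   var problem = new FeatureSelection(100, 0.1, 0.05);   // Name == "FeatSel-100-10%-5%"
    // Both selectionProbability and noiseRatio are expected as fractions in [0, 1].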

    protected override string TargetVariable { get { return "Y"; } }

    protected override string[] VariableNames {
      get { return AllowedInputVariables.Concat(new string[] { "Y" }).ToArray(); }
    }

    protected override string[] AllowedInputVariables {
      get {
        return Enumerable.Range(1, numberOfFeatures)
          .Select(i => string.Format("X{0:000}", i))
          .ToArray();
      }
    }
    protected override int TrainingPartitionStart { get { return 0; } }
    protected override int TrainingPartitionEnd { get { return trainingSamples; } }
    protected override int TestPartitionStart { get { return trainingSamples; } }
    protected override int TestPartitionEnd { get { return trainingSamples + TestSamples; } }

    protected override List<List<double>> GenerateValues() {
      // generate the full feature matrix X, one column per input variable, X(i,j) ~ N(0, 1)
      List<List<double>> data = new List<List<double>>();
      for (int i = 0; i < AllowedInputVariables.Count(); i++) {
        data.Add(ValueGenerator.GenerateNormalDistributedValues(TestPartitionEnd, 0, 1).ToList());
      }

      // select each feature independently with the configured probability
      var random = new MersenneTwister();
      var selectedFeatures =
        Enumerable.Range(0, AllowedInputVariables.Count())
        .Where(_ => random.NextDouble() < selectionProbability)
        .ToArray();
      // weights for the selected features, w(i) ~ U(0, 10)
      var w = ValueGenerator.GenerateUniformDistributedValues(selectedFeatures.Length, 0, 10)
        .ToArray();
      // target = row-wise scalar product of the selected feature values and the weights
      var target = new List<double>();
      for (int i = 0; i < data[0].Count; i++) {
        var s = selectedFeatures
          .Select(index => data[index][i])
          .ToArray();
        target.Add(ScalarProd(s, w));
      }
      // additive Gaussian noise with std. dev. sigma(target) * sqrt(noiseRatio)
      var targetSigma = target.StandardDeviation();
      var noisePrng = new NormalDistributedRandom(random, 0, targetSigma * Math.Sqrt(noiseRatio));

      data.Add(target.Select(t => t + noisePrng.NextDouble()).ToList());

      return data;
    }

    private double ScalarProd(double[] s, double[] w) {
      if (s.Length != w.Length) throw new ArgumentException();
      return s.Zip(w, (a, b) => a * b).Sum();
    }
  }
}
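
The listing below is a self-contained sketch of the same data generation scheme without any HeuristicLab dependencies, useful for checking the construction by hand. The class name FeatureSelectionSketch, the fixed seed, and the parameter values (100 features, 10% selection probability, 5% noise ratio) are illustrative assumptions; System.Random with a Box-Muller transform stands in for MersenneTwister, ValueGenerator and NormalDistributedRandom.

using System;
using System.Linq;

// Minimal, dependency-free sketch of the FeatureSelection data generation scheme
// (illustrative only; names, seed and parameter values are not from the original source).
public static class FeatureSelectionSketch {
  public static void Main() {
    const int numberOfFeatures = 100;          // k
    const double selectionProbability = 0.1;   // fraction of columns that drive the target
    const double noiseRatio = 0.05;            // target NMSE of an optimal model
    int trainingSamples = (int)Math.Round(numberOfFeatures * 1.2);
    const int testSamples = 5000;
    int rows = trainingSamples + testSamples;  // N rows in total

    var random = new Random(42);

    // X(i,j) ~ N(0, 1) iid
    var x = new double[rows, numberOfFeatures];
    for (int i = 0; i < rows; i++)
      for (int j = 0; j < numberOfFeatures; j++)
        x[i, j] = NextGaussian(random);

    // select each column independently with probability selectionProbability
    var selected = Enumerable.Range(0, numberOfFeatures)
      .Where(_ => random.NextDouble() < selectionProbability)
      .ToArray();

    // w(i) ~ U(0, 10) iid
    var w = selected.Select(_ => random.NextDouble() * 10).ToArray();

    // noise-free target: row-wise scalar product of the selected columns with w
    var target = new double[rows];
    for (int i = 0; i < rows; i++)
      target[i] = selected.Select((col, idx) => x[i, col] * w[idx]).Sum();

    // additive noise n ~ N(0, sigma(target) * sqrt(noiseRatio))
    double mean = target.Average();
    double sigma = Math.Sqrt(target.Select(t => (t - mean) * (t - mean)).Sum() / (rows - 1));
    var y = target.Select(t => t + NextGaussian(random) * sigma * Math.Sqrt(noiseRatio)).ToArray();

    Console.WriteLine("generated {0} rows, {1} selected features, target sigma = {2:F2}",
                      rows, selected.Length, sigma);
  }

  // standard normal sample via the Box-Muller transform
  private static double NextGaussian(Random random) {
    double u1 = 1.0 - random.NextDouble();
    double u2 = random.NextDouble();
    return Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Cos(2.0 * Math.PI * u2);
  }
}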