Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/Various/FeatureSelection.cs @ 9091

Last change on this file since 9091 was 9091, checked in by gkronber, 11 years ago

#1979: added a new artificial benchmark problem for regression specifically for testing feature selection algorithms

File size: 4.6 KB
Line 
1#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2012 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using HeuristicLab.Common;
26using HeuristicLab.Random;
27
namespace HeuristicLab.Problems.Instances.DataAnalysis {
  /// <summary>
  /// Artificial regression benchmark for testing feature selection algorithms.
  /// The data set has <see cref="NumberOfFeatures"/> input columns, of which
  /// <see cref="NumberOfSelectedFeatures"/> randomly chosen ones are combined
  /// linearly (plus noise) into the target variable.
  /// </summary>
  public class FeatureSelection : ArtificialRegressionDataDescriptor {
    private const int NumberOfFeatures = 200;
    private const int NumberOfSelectedFeatures = 80;
    private const int TrainingSamples = 250;
    private const int TestSamples = 20000;

    /// <summary>Display name of this problem instance.</summary>
    public override string Name { get { return "Feature Selection - I"; } }

    /// <summary>
    /// Human-readable description of how the data set is generated, including
    /// the concrete sizes taken from the constants of this class.
    /// </summary>
    public override string Description {
      get {
        return "This problem is specifically designed to test feature selection." + Environment.NewLine
               + "In this instance the number of rows for training (" + TrainingSamples + ") is only slightly larger than the number of columns (" + NumberOfFeatures + ") and only a subset of the columns must be selected for the predictive model." + Environment.NewLine
               + "The target variable is calculated as a noisy linear combination of m randomly selected features: y = w * S + n." + Environment.NewLine
               + "Where S is a N x m matrix containing m randomly selected columns from the N x k matrix of all features X" + Environment.NewLine
               + "X(i,j) ~ N(0, 1) iid and w(i) ~ U(0, 10) iid, n ~ N(0, sigma(w*S) * SQRT(0.1 / 0.9))" + Environment.NewLine
               + "The noise accounts for 1/10 of the variance of y, thus an optimal model has R² = 0.9 (equivalently: NMSE = 0.1)" + Environment.NewLine
               + "N = " + (TrainingSamples + TestSamples) + " (" + TrainingSamples + " training, " + TestSamples + " test)" + Environment.NewLine
               + "k = " + NumberOfFeatures + ", m = " + NumberOfSelectedFeatures;
      }
    }

    /// <summary>Name of the target column.</summary>
    protected override string TargetVariable { get { return "Y"; } }

    /// <summary>All column names: the input variables followed by the target variable.</summary>
    protected override string[] VariableNames {
      get { return AllowedInputVariables.Concat(new string[] { TargetVariable }).ToArray(); }
    }

    /// <summary>
    /// Names of the input columns, "X001" up to "X" followed by the zero-padded
    /// value of <see cref="NumberOfFeatures"/>. A new array is built on every call.
    /// </summary>
    protected override string[] AllowedInputVariables {
      get {
        return Enumerable.Range(1, NumberOfFeatures)
          .Select(i => string.Format("X{0:000}", i))
          .ToArray();
      }
    }

    protected override int TrainingPartitionStart { get { return 0; } }
    protected override int TrainingPartitionEnd { get { return TrainingSamples; } }
    protected override int TestPartitionStart { get { return TrainingSamples; } }
    protected override int TestPartitionEnd { get { return TrainingSamples + TestSamples; } }

    /// <summary>
    /// Generates the data set column by column: one list per input variable,
    /// followed by one list holding the noisy target values.
    /// </summary>
    /// <returns>
    /// A list of columns; each column has <see cref="TestPartitionEnd"/> entries.
    /// The last column is the target variable.
    /// </returns>
    protected override List<List<double>> GenerateValues() {
      // Evaluate the property once: every call allocates and formats all names again.
      int featureCount = AllowedInputVariables.Length;
      int rowCount = TestPartitionEnd;

      var data = new List<List<double>>(featureCount + 1);
      for (int col = 0; col < featureCount; col++) {
        data.Add(ValueGenerator.GenerateNormalDistributedValues(rowCount, 0, 1).ToList());
      }

      var random = new MersenneTwister();
      // Column indices of the features that actually contribute to the target.
      var selectedFeatures =
        Enumerable.Range(0, featureCount)
        .SampleRandomWithoutRepetition(random, NumberOfSelectedFeatures)
        .ToArray();
      var w = ValueGenerator.GenerateUniformDistributedValues(NumberOfSelectedFeatures, 0, 10)
        .ToArray();

      // Noise-free target: weighted sum of the selected feature values of each row.
      var target = new List<double>(rowCount);
      for (int row = 0; row < rowCount; row++) {
        var s = selectedFeatures
          .Select(index => data[index][row])
          .ToArray();
        target.Add(ScalarProd(s, w));
      }

      // For the optimal model to reach NMSE = 0.1 (R² = 0.9) the noise variance must be
      // 0.1 of the variance of the noisy target y, i.e. Var(n) = 0.1 / 0.9 * Var(w*S).
      // Scaling by SQRT(0.1) alone would yield R² = 1 / 1.1 (about 0.909) instead.
      var targetSigma = target.StandardDeviation();
      var noisePrng = new NormalDistributedRandom(random, 0, targetSigma * Math.Sqrt(0.1 / 0.9));

      data.Add(target.Select(t => t + noisePrng.NextDouble()).ToList());

      return data;
    }

    /// <summary>Computes the scalar (dot) product of two vectors of equal length.</summary>
    /// <param name="s">First vector.</param>
    /// <param name="w">Second vector.</param>
    /// <returns>The sum of the element-wise products of <paramref name="s"/> and <paramref name="w"/>.</returns>
    /// <exception cref="ArgumentException">Thrown when the two vectors differ in length.</exception>
    private static double ScalarProd(double[] s, double[] w) {
      // Zip would silently truncate to the shorter input, so reject mismatches explicitly.
      if (s.Length != w.Length) throw new ArgumentException("Both vectors must have the same length.");
      return s.Zip(w, (a, b) => a * b).Sum();
    }
  }
}
Note: See TracBrowser for help on using the repository browser.