#region License Information
/* HeuristicLab
 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Random;

namespace HeuristicLab.Problems.Instances.DataAnalysis {
  /// <summary>
  /// Artificial regression instance for testing feature selection: the target
  /// "Y" is a weighted sum of a randomly chosen subset of the generated input
  /// columns plus normally distributed noise.
  /// </summary>
  /// <remarks>
  /// Input values are drawn from the x generator and weights from the weight
  /// generator passed to the constructor. Which features are selected, and the
  /// noise, are driven by a <c>MersenneTwister</c> created inside
  /// <see cref="GenerateValues"/>.
  /// <see cref="Weights"/> and <see cref="SelectedFeatures"/> are only
  /// populated once <see cref="GenerateValues"/> has run.
  /// </remarks>
  public class FeatureSelection : ArtificialRegressionDataDescriptor {
    private readonly int nTrainingSamples;
    private readonly int nTestSamples;

    private readonly int numberOfFeatures;
    private readonly double selectionProbability;
    private readonly double noiseRatio;
    private readonly IRandom xRandom;
    private readonly IRandom weightRandom;

    /// <summary>
    /// Short identifier built from the feature count, the selection
    /// probability and the noise ratio (the latter two formatted as percent).
    /// </summary>
    public override string Name {
      get { return string.Format("FeatSel-{0}-{1:0%}-{2:0%}", numberOfFeatures, selectionProbability, noiseRatio); }
    }

    /// <summary>
    /// Multi-line, human-readable description of how the instance is built.
    /// </summary>
    public override string Description {
      get {
        return "This problem is specifically designed to test feature selection." + Environment.NewLine
               + "In this instance the number of rows for training (" + nTrainingSamples +
               ") is only slightly larger than the number of columns (" + numberOfFeatures +
               ") and only a subset of the columns must be selected for the predictive model." + Environment.NewLine
               + "The target variable is calculated as a noisy linear combination of randomly selected features: y = w * S + n." + Environment.NewLine
               + "Where is the S is a N x d matrix containing the selected columns from N x k the matrix of all features X" + Environment.NewLine
               // selectionProbability is a fraction (it is compared against NextDouble()),
               // so it has to be scaled by the percent format before a '%' sign is shown.
               + "For each feature the probability that it is selected is " + string.Format("{0:0.##%}", selectionProbability) + Environment.NewLine
               + "X(i,j) ~ N(0, 1) iid, w(i) ~ U(0, 10) iid, n ~ N(0, sigma(w*S) * SQRT(" + noiseRatio / (1 - noiseRatio) + "))" + Environment.NewLine
               + "The noise level is " + noiseRatio + " * sigma, thus an optimal model has R² = " + Math.Round(optimalRSquared, 2) + " (or equivalently: NMSE = " + noiseRatio + ")" + Environment.NewLine
               + "N = " + (nTrainingSamples + nTestSamples) + " (" + nTrainingSamples + " training, " + nTestSamples + " test)" + Environment.NewLine
               + "k = " + numberOfFeatures;
      }
    }

    private double[] w;
    /// <summary>
    /// Weights of the selected features, in the same order as
    /// <see cref="SelectedFeatures"/>; null until values have been generated.
    /// </summary>
    public double[] Weights {
      get { return w; }
    }

    private string[] selectedFeatures;
    /// <summary>
    /// Names of the input variables that contribute to the target; null until
    /// values have been generated.
    /// </summary>
    public string[] SelectedFeatures {
      get { return selectedFeatures; }
    }

    private readonly double optimalRSquared;
    /// <summary>
    /// R² reported for an optimal model, computed as 1 - noise ratio.
    /// </summary>
    public double OptimalRSquared {
      get { return optimalRSquared; }
    }

    /// <summary>
    /// Creates an instance with round(numberOfFeatures * 1.2) training rows
    /// and 5000 test rows.
    /// </summary>
    /// <param name="numberOfFeatures">Number of input columns to generate.</param>
    /// <param name="selectionProbability">Per-feature probability (as a fraction) of contributing to the target.</param>
    /// <param name="noiseRatio">Noise ratio used to scale the noise standard deviation; 1 - noiseRatio is reported as the optimal R².</param>
    /// <param name="xGenerator">Source of the input values.</param>
    /// <param name="weightGenerator">Source of the weights of the selected features.</param>
    public FeatureSelection(int numberOfFeatures, double selectionProbability, double noiseRatio, IRandom xGenerator, IRandom weightGenerator)
      : this((int)Math.Round(numberOfFeatures * 1.2), 5000, numberOfFeatures,
        selectionProbability, noiseRatio, xGenerator, weightGenerator) { }

    /// <summary>
    /// Creates an instance with explicit training and test partition sizes.
    /// </summary>
    /// <param name="nTrainingSamples">Number of rows in the training partition.</param>
    /// <param name="nTestSamples">Number of rows in the test partition.</param>
    /// <param name="numberOfFeatures">Number of input columns to generate.</param>
    /// <param name="selectionProbability">Per-feature probability (as a fraction) of contributing to the target.</param>
    /// <param name="noiseRatio">Noise ratio used to scale the noise standard deviation; 1 - noiseRatio is reported as the optimal R².</param>
    /// <param name="xGenerator">Source of the input values.</param>
    /// <param name="weightGenerator">Source of the weights of the selected features.</param>
    public FeatureSelection(int nTrainingSamples, int nTestSamples, int numberOfFeatures,
      double selectionProbability, double noiseRatio, IRandom xGenerator, IRandom weightGenerator) {
      this.numberOfFeatures = numberOfFeatures;
      this.nTrainingSamples = nTrainingSamples;
      this.nTestSamples = nTestSamples;
      this.selectionProbability = selectionProbability;
      this.noiseRatio = noiseRatio;
      this.xRandom = xGenerator;
      this.weightRandom = weightGenerator;
      // Depends only on the noise ratio, so it is fixed here; that way
      // Description and OptimalRSquared are correct before any data is generated.
      this.optimalRSquared = 1 - noiseRatio;
    }

    protected override string TargetVariable { get { return "Y"; } }

    /// <summary>All input variable names followed by the target variable.</summary>
    protected override string[] VariableNames {
      get { return AllowedInputVariables.Concat(new string[] { TargetVariable }).ToArray(); }
    }

    /// <summary>
    /// Input variable names "X001", "X002", ... (one per feature). A new array
    /// is built on every access.
    /// </summary>
    protected override string[] AllowedInputVariables {
      get {
        return Enumerable.Range(1, numberOfFeatures)
          .Select(i => string.Format("X{0:000}", i))
          .ToArray();
      }
    }

    protected override int TrainingPartitionStart { get { return 0; } }
    protected override int TrainingPartitionEnd { get { return nTrainingSamples; } }
    protected override int TestPartitionStart { get { return nTrainingSamples; } }
    protected override int TestPartitionEnd { get { return nTrainingSamples + nTestSamples; } }

    /// <summary>
    /// Generates the data set column by column: one column per input variable
    /// followed by the noisy target column.
    /// </summary>
    /// <returns>
    /// A list of columns, each with <see cref="TestPartitionEnd"/> values; the
    /// last column is the target.
    /// </returns>
    protected override List<List<double>> GenerateValues() {
      // The property allocates a fresh array per access, so read it once.
      string[] inputNames = AllowedInputVariables;
      int rowCount = TestPartitionEnd;

      var data = new List<List<double>>(inputNames.Length + 1);
      for (int col = 0; col < inputNames.Length; col++) {
        data.Add(Enumerable.Range(0, rowCount)
          .Select(_ => xRandom.NextDouble())
          .ToList());
      }

      // NOTE(review): how the parameterless MersenneTwister is seeded is not
      // visible here, so the selection and the noise may differ between runs.
      var random = new MersenneTwister();
      int[] selectedIndices = Enumerable.Range(0, inputNames.Length)
        .Where(_ => random.NextDouble() < selectionProbability)
        .ToArray();

      w = selectedIndices.Select(_ => weightRandom.NextDouble()).ToArray();

      // Noise-free target: weighted sum over the selected columns for each row.
      var target = new List<double>(rowCount);
      for (int row = 0; row < rowCount; row++) {
        double[] s = selectedIndices
          .Select(index => data[index][row])
          .ToArray();
        target.Add(ScalarProd(s, w));
      }

      // Scale the noise relative to the spread of the noise-free target.
      var targetSigma = target.StandardDeviation();
      var noisePrng = new NormalDistributedRandom(random, 0, targetSigma * Math.Sqrt(noiseRatio / (1.0 - noiseRatio)));

      data.Add(target.Select(t => t + noisePrng.NextDouble()).ToList());

      // set property listing the selected features as string[]
      this.selectedFeatures = selectedIndices.Select(i => inputNames[i]).ToArray();

      return data;
    }

    /// <summary>Computes the dot product of two equally long vectors.</summary>
    /// <exception cref="ArgumentException">The vectors differ in length.</exception>
    private static double ScalarProd(double[] s, double[] w) {
      if (s.Length != w.Length) throw new ArgumentException("Both vectors must have the same length.");
      return s.Zip(w, (a, b) => a * b).Sum();
    }
  }
}