Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
02/17/13 17:18:01 (12 years ago)
Author:
gkronber
Message:

#1999: Improved the implementation of the feature selection problem instances based on the review comments by mkommend.

  • Created a PRNG for uniformly distributed values within a specified half-open range [min..max[ (min inclusive, max exclusive)
  • Created a class FeatureSelectionRegressionProblemData, derived from RegressionProblemData, with additional informative parameters
  • Fixed the typos "shuffeled" (shuffled) and "varialbe" (variable)
Location:
trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3
Files:
1 added
5 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/DataAnalysisInstanceProvider.cs

    r8878 r9217  
    4949      int count = values.First().Count;
    5050      int[] indices = Enumerable.Range(0, count).Shuffle(new FastRandom()).ToArray();
    51       List<IList> shuffeledValues = new List<IList>(values.Count);
     51      List<IList> shuffled = new List<IList>(values.Count);
    5252      for (int col = 0; col < values.Count; col++) {
    5353
    5454        if (values[col] is List<double>)
    55           shuffeledValues.Add(new List<double>());
     55          shuffled.Add(new List<double>());
    5656        else if (values[col] is List<DateTime>)
    57           shuffeledValues.Add(new List<DateTime>());
     57          shuffled.Add(new List<DateTime>());
    5858        else if (values[col] is List<string>)
    59           shuffeledValues.Add(new List<string>());
     59          shuffled.Add(new List<string>());
    6060        else
    6161          throw new InvalidOperationException();
    6262
    6363        for (int i = 0; i < count; i++) {
    64           shuffeledValues[col].Add(values[col][indices[i]]);
     64          shuffled[col].Add(values[col][indices[i]]);
    6565        }
    6666      }
    67       return shuffeledValues;
     67      return shuffled;
    6868    }
    6969
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/HeuristicLab.Problems.Instances.DataAnalysis-3.3.csproj

    r9208 r9217  
    140140    <Compile Include="Regression\ArtificialRegressionInstanceProvider.cs" />
    141141    <Compile Include="Regression\CSV\RegressionCSVInstanceProvider.cs" />
     142    <Compile Include="Regression\FeatureSelection\FeatureSelectionRegressionProblemData.cs" />
    142143    <Compile Include="Regression\FeatureSelection\FeatureSelection.cs" />
    143144    <Compile Include="Regression\FeatureSelection\FeatureSelectionInstanceProvider.cs" />
     
    245246      <Private>False</Private>
    246247    </ProjectReference>
     248    <ProjectReference Include="..\..\HeuristicLab.Parameters\3.3\HeuristicLab.Parameters-3.3.csproj">
     249      <Project>{56F9106A-079F-4C61-92F6-86A84C2D84B7}</Project>
     250      <Name>HeuristicLab.Parameters-3.3</Name>
     251    </ProjectReference>
     252    <ProjectReference Include="..\..\HeuristicLab.Persistence\3.3\HeuristicLab.Persistence-3.3.csproj">
     253      <Project>{102BC7D3-0EF9-439C-8F6D-96FF0FDB8E1B}</Project>
     254      <Name>HeuristicLab.Persistence-3.3</Name>
     255    </ProjectReference>
    247256    <ProjectReference Include="..\..\HeuristicLab.PluginInfrastructure\3.3\HeuristicLab.PluginInfrastructure-3.3.csproj">
    248257      <Project>{94186A6A-5176-4402-AE83-886557B53CCA}</Project>
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Plugin.cs.frame

    r8568 r9217  
    2828  [PluginDependency("HeuristicLab.Core", "3.3")]
    2929  [PluginDependency("HeuristicLab.Data", "3.3")]
     30  [PluginDependency("HeuristicLab.Parameters", "3.3")]
     31  [PluginDependency("HeuristicLab.Persistence", "3.3")]
    3032  [PluginDependency("HeuristicLab.Problems.DataAnalysis", "3.4")]
    3133  [PluginDependency("HeuristicLab.Problems.Instances", "3.3")]
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/FeatureSelection/FeatureSelection.cs

    r9094 r9217  
    2424using System.Linq;
    2525using HeuristicLab.Common;
     26using HeuristicLab.Core;
    2627using HeuristicLab.Random;
    2728
    2829namespace HeuristicLab.Problems.Instances.DataAnalysis {
    2930  public class FeatureSelection : ArtificialRegressionDataDescriptor {
    30     private int trainingSamples;
    31     private const int TestSamples = 5000;
     31    private int nTrainingSamples;
     32    private int nTestSamples;
    3233
    3334    private int numberOfFeatures;
    3435    private double selectionProbability;
    3536    private double noiseRatio;
     37    private IRandom xRandom;
     38    private IRandom weightRandom;
    3639
    3740    public override string Name { get { return string.Format("FeatSel-{0}-{1:0%}-{2:0%}", numberOfFeatures, selectionProbability, noiseRatio); } }
     
    3942      get {
    4043        return "This problem is specifically designed to test feature selection." + Environment.NewLine
    41                + "In this instance the number of rows for training (" + trainingSamples +
     44               + "In this instance the number of rows for training (" + nTrainingSamples +
    4245               ") is only slightly larger than the number of columns (" + numberOfFeatures +
    4346               ") and only a subset of the columns must be selected for the predictive model." + Environment.NewLine
     
    4750               + "X(i,j) ~ N(0, 1) iid, w(i) ~ U(0, 10) iid, n ~ N(0, sigma(w*S) * SQRT(" + noiseRatio + "))" + Environment.NewLine
    4851               + "The noise level is " + noiseRatio + " * sigma, thus an optimal model has R² = "
    49                + Math.Round(1 - noiseRatio, 2) + " (or equivalently: NMSE = " + noiseRatio + ")" + Environment.NewLine
    50                + "N = " + (trainingSamples + TestSamples) + " (" + trainingSamples + " training, " + TestSamples + " test)" + Environment.NewLine
     52               + Math.Round(optimalRSquared) + " (or equivalently: NMSE = " + noiseRatio + ")" + Environment.NewLine
     53               + "N = " + (nTrainingSamples + nTestSamples) + " (" + nTrainingSamples + " training, " + nTestSamples + " test)" + Environment.NewLine
    5154               + "k = " + numberOfFeatures;
    5255        ;
     
    5457    }
    5558
    56     public FeatureSelection(int numberOfFeatures, double selectionProbability, double noiseRatio) {
     59    private double[] w;
     60    public double[] Weights {
     61      get { return w; }
     62    }
     63
     64    private string[] selectedFeatures;
     65    public string[] SelectedFeatures {
     66      get { return selectedFeatures; }
     67    }
     68
     69    private double optimalRSquared;
     70    public double OptimalRSquared {
     71      get { return optimalRSquared; }
     72    }
     73
     74
     75    public FeatureSelection(int numberOfFeatures, double selectionProbability, double noiseRatio, IRandom xGenerator, IRandom weightGenerator)
     76      : this((int)Math.Round(numberOfFeatures * 1.2), 5000, numberOfFeatures,
     77      selectionProbability, noiseRatio, xGenerator, weightGenerator) { }
     78
     79    public FeatureSelection(int nTrainingSamples, int nTestSamples, int numberOfFeatures,
     80      double selectionProbability, double noiseRatio, IRandom xGenerator, IRandom weightGenerator) {
    5781      this.numberOfFeatures = numberOfFeatures;
    58       this.trainingSamples = (int)Math.Round(numberOfFeatures * 1.2); // 20% more rows than columns
     82      this.nTrainingSamples = nTrainingSamples;
     83      this.nTestSamples = nTestSamples;
    5984      this.selectionProbability = selectionProbability;
    6085      this.noiseRatio = noiseRatio;
     86      this.xRandom = xGenerator;
     87      this.weightRandom = weightGenerator;
    6188    }
    6289
     
    74101      }
    75102    }
     103
    76104    protected override int TrainingPartitionStart { get { return 0; } }
    77     protected override int TrainingPartitionEnd { get { return trainingSamples; } }
    78     protected override int TestPartitionStart { get { return trainingSamples; } }
    79     protected override int TestPartitionEnd { get { return trainingSamples + TestSamples; } }
     105    protected override int TrainingPartitionEnd { get { return nTrainingSamples; } }
     106    protected override int TestPartitionStart { get { return nTrainingSamples; } }
     107    protected override int TestPartitionEnd { get { return nTrainingSamples + nTestSamples; } }
     108
    80109
    81110    protected override List<List<double>> GenerateValues() {
    82111      List<List<double>> data = new List<List<double>>();
    83112      for (int i = 0; i < AllowedInputVariables.Count(); i++) {
    84         data.Add(ValueGenerator.GenerateNormalDistributedValues(TestPartitionEnd, 0, 1).ToList());
     113        data.Add(Enumerable.Range(0, TestPartitionEnd)
     114          .Select(_ => xRandom.NextDouble())
     115          .ToList());
    85116      }
    86117
     
    90121        .Where(_ => random.NextDouble() < selectionProbability)
    91122        .ToArray();
    92       var w = ValueGenerator.GenerateUniformDistributedValues(selectedFeatures.Length, 0, 10)
    93         .ToArray();
     123
     124      w = selectedFeatures.Select(_ => weightRandom.NextDouble()).ToArray();
    94125      var target = new List<double>();
    95126      for (int i = 0; i < data[0].Count; i++) {
     
    104135      data.Add(target.Select(t => t + noisePrng.NextDouble()).ToList());
    105136
     137      // set property listing the selected features as string[]
     138      this.selectedFeatures = selectedFeatures.Select(i => AllowedInputVariables[i]).ToArray();
     139      optimalRSquared = 1 - noiseRatio;
    106140      return data;
    107141    }
  • trunk/sources/HeuristicLab.Problems.Instances.DataAnalysis/3.3/Regression/FeatureSelection/FeatureSelectionInstanceProvider.cs

    r9093 r9217  
    2222using System;
    2323using System.Collections.Generic;
     24using System.Linq;
     25using HeuristicLab.Data;
     26using HeuristicLab.Problems.DataAnalysis;
     27using HeuristicLab.Random;
    2428
    2529namespace HeuristicLab.Problems.Instances.DataAnalysis {
     
    3943
    4044    public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
    41       List<IDataDescriptor> descriptorList = new List<IDataDescriptor>();
    4245      var sizes = new int[] { 50, 100, 200 };
    4346      var pp = new double[] { 0.1, 0.25, 0.5 };
    4447      var noiseRatios = new double[] { 0.01, 0.05, 0.1, 0.2 };
    45       foreach (var size in sizes) {
    46         foreach (var p in pp) {
    47           foreach (var noiseRatio in noiseRatios) {
    48             descriptorList.Add(new FeatureSelection(size, p, noiseRatio));
    49           }
    50         }
    51       }
    52       return descriptorList;
     48      var mt = new MersenneTwister();
     49      var xGenerator = new NormalDistributedRandom(mt, 0, 1);
     50      var weightGenerator = new UniformDistributedRandom(mt, 0, 10);
     51      return (from size in sizes
     52              from p in pp
     53              from noiseRatio in noiseRatios
     54              select new FeatureSelection(size, p, noiseRatio, xGenerator, weightGenerator))
     55              .Cast<IDataDescriptor>()
     56              .ToList();
     57    }
     58
     59    public override IRegressionProblemData LoadData(IDataDescriptor descriptor) {
     60      var featureSelectionDescriptor = descriptor as FeatureSelection;
     61      if (featureSelectionDescriptor == null) throw new ArgumentException("FeatureSelectionInstanceProvider expects an FeatureSelection data descriptor.");
     62      // base call generates a regression problem data
     63      var regProblemData = base.LoadData(featureSelectionDescriptor);
     64      var problemData =
     65        new FeatureSelectionRegressionProblemData(
     66          regProblemData.Dataset, regProblemData.AllowedInputVariables, regProblemData.TargetVariable,
     67          featureSelectionDescriptor.SelectedFeatures, featureSelectionDescriptor.Weights,
     68          featureSelectionDescriptor.OptimalRSquared);
     69
     70      // copy values from regProblemData to feature selection problem data
     71      problemData.Name = regProblemData.Name;
     72      problemData.Description = regProblemData.Description;
     73      problemData.TrainingPartition.Start = regProblemData.TrainingPartition.Start;
     74      problemData.TrainingPartition.End = regProblemData.TrainingPartition.End;
     75      problemData.TestPartition.Start = regProblemData.TestPartition.Start;
     76      problemData.TestPartition.End = regProblemData.TestPartition.End;
     77
     78      return problemData;
    5379    }
    5480  }
Note: See TracChangeset for help on using the changeset viewer.