Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
09/22/10 11:22:49 (14 years ago)
Author:
mkommend
Message:

Preparation for cross validation - removed the test samples from the training samples and added ValidationPercentage parameter (ticket #1199).

Location:
trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3
Files:
13 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Analyzers/RegressionSolutionAnalyzer.cs

    r4068 r4468  
    121121      var solution = bestSolution;
    122122      #region update R2,MSE, Rel Error
    123       IEnumerable<double> trainingValues = problemData.Dataset.GetEnumeratedVariableValues(
    124         problemData.TargetVariable.Value,
    125         problemData.TrainingSamplesStart.Value,
    126         problemData.TrainingSamplesEnd.Value);
    127       IEnumerable<double> testValues = problemData.Dataset.GetEnumeratedVariableValues(
    128         problemData.TargetVariable.Value,
    129         problemData.TestSamplesStart.Value,
    130         problemData.TestSamplesEnd.Value);
     123      IEnumerable<double> trainingValues = problemData.Dataset.GetEnumeratedVariableValues(problemData.TargetVariable.Value, problemData.TrainingIndizes);
     124      IEnumerable<double> testValues = problemData.Dataset.GetEnumeratedVariableValues(problemData.TargetVariable.Value, problemData.TestIndizes);
    131125      OnlineMeanSquaredErrorEvaluator mseEvaluator = new OnlineMeanSquaredErrorEvaluator();
    132126      OnlineMeanAbsolutePercentageErrorEvaluator relErrorEvaluator = new OnlineMeanAbsolutePercentageErrorEvaluator();
    133127      OnlinePearsonsRSquaredEvaluator r2Evaluator = new OnlinePearsonsRSquaredEvaluator();
     128
    134129      #region training
    135130      var originalEnumerator = trainingValues.GetEnumerator();
     
    144139      double trainingRelError = relErrorEvaluator.MeanAbsolutePercentageError;
    145140      #endregion
     141
    146142      mseEvaluator.Reset();
    147143      relErrorEvaluator.Reset();
    148144      r2Evaluator.Reset();
     145
    149146      #region test
    150147      originalEnumerator = testValues.GetEnumerator();
     
    159156      double testRelError = relErrorEvaluator.MeanAbsolutePercentageError;
    160157      #endregion
     158
    161159      if (results.ContainsKey(BestSolutionResultName)) {
    162160        results[BestSolutionResultName].Value = solution;
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/BestSymbolicRegressionSolutionAnalyzer.cs

    r4125 r4468  
    9191        var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)SymbolicExpressionTreeInterpreter.Clone(),
    9292          SymbolicExpressionTree[i]);
    93         var solution = new SymbolicRegressionSolution(ProblemData, model, lowerEstimationLimit, upperEstimationLimit);
     93        DataAnalysisProblemData problemDataClone = (DataAnalysisProblemData)ProblemData.Clone();
     94        var solution = new SymbolicRegressionSolution(problemDataClone, model, lowerEstimationLimit, upperEstimationLimit);
    9495        solution.Name = BestSolutionParameterName;
    9596        solution.Description = "Best solution on validation partition found over the whole run.";
    9697        BestSolutionParameter.ActualValue = solution;
    9798        BestSolutionQualityParameter.ActualValue = Quality[i];
    98         BestSymbolicRegressionSolutionAnalyzer.UpdateSymbolicRegressionBestSolutionResults(solution, ProblemData, Results, VariableFrequencies);
     99        BestSymbolicRegressionSolutionAnalyzer.UpdateSymbolicRegressionBestSolutionResults(solution, problemDataClone, Results, VariableFrequencies);
    99100      }
    100101      return BestSolutionParameter.ActualValue;
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/FixedValidationBestScaledSymbolicRegressionSolutionAnalyzer.cs

    r4415 r4468  
    212212      int count = (int)((validationEnd - validationStart) * RelativeNumberOfEvaluatedSamples.Value);
    213213      if (count == 0) count = 1;
    214       IEnumerable<int> rows = RandomEnumerable.SampleRandomNumbers(seed, validationStart, validationEnd, count);
     214      IEnumerable<int> rows = RandomEnumerable.SampleRandomNumbers(seed, validationStart, validationEnd, count)
     215        .Where(row => row < ProblemData.TestSamplesStart.Value || ProblemData.TestSamplesEnd.Value <= row);
    215216
    216217      double upperEstimationLimit = UpperEstimationLimit != null ? UpperEstimationLimit.Value : double.PositiveInfinity;
     
    241242        // calculate scaling parameters only for the best tree, using the full training set
    242243        double alpha, beta;
    243         int trainingStart = ProblemData.TrainingSamplesStart.Value;
    244         int trainingEnd = ProblemData.TrainingSamplesEnd.Value;
    245         IEnumerable<int> trainingRows = Enumerable.Range(trainingStart, trainingEnd - trainingStart);
    246244        SymbolicRegressionScaledMeanSquaredErrorEvaluator.Calculate(SymbolicExpressionTreeInterpreter, bestTree,
    247245          lowerEstimationLimit, upperEstimationLimit,
    248246          ProblemData.Dataset, targetVariable,
    249           trainingRows, out beta, out alpha);
     247          ProblemData.TrainingIndizes, out beta, out alpha);
    250248
    251249        // scale tree for solution
     
    253251        var model = new SymbolicRegressionModel((ISymbolicExpressionTreeInterpreter)SymbolicExpressionTreeInterpreter.Clone(),
    254252          scaledTree);
    255         var solution = new SymbolicRegressionSolution(ProblemData, model, lowerEstimationLimit, upperEstimationLimit);
     253        var solution = new SymbolicRegressionSolution((DataAnalysisProblemData)ProblemData.Clone(), model, lowerEstimationLimit, upperEstimationLimit);
    256254        solution.Name = BestSolutionParameterName;
    257255        solution.Description = "Best solution on validation partition found over the whole run.";
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/SymbolicRegressionModelQualityAnalyzer.cs

    r4068 r4468  
    137137      Analyze(SymbolicExpressionTreeParameter.ActualValue, SymbolicExpressionTreeInterpreterParameter.ActualValue,
    138138        UpperEstimationLimit.Value, LowerEstimationLimit.Value, ProblemDataParameter.ActualValue,
    139         ProblemDataParameter.ActualValue.TrainingSamplesStart.Value, ProblemDataParameter.ActualValue.TrainingSamplesEnd.Value,
    140         ProblemDataParameter.ActualValue.TestSamplesStart.Value, ProblemDataParameter.ActualValue.TestSamplesEnd.Value,
    141139        ResultsParameter.ActualValue);
    142140      return base.Apply();
     
    145143    public static void Analyze(IEnumerable<SymbolicExpressionTree> trees, ISymbolicExpressionTreeInterpreter interpreter,
    146144      double upperEstimationLimit, double lowerEstimationLimit,
    147       DataAnalysisProblemData problemData, int trainingStart, int trainingEnd, int testStart, int testEnd, ResultCollection results) {
     145      DataAnalysisProblemData problemData, ResultCollection results) {
    148146      int targetVariableIndex = problemData.Dataset.GetVariableIndex(problemData.TargetVariable.Value);
    149       IEnumerable<double> originalTrainingValues = problemData.Dataset.GetEnumeratedVariableValues(targetVariableIndex, trainingStart, trainingEnd);
    150       IEnumerable<double> originalTestValues = problemData.Dataset.GetEnumeratedVariableValues(targetVariableIndex, testStart, testEnd);
     147      IEnumerable<double> originalTrainingValues = problemData.Dataset.GetEnumeratedVariableValues(targetVariableIndex, problemData.TrainingIndizes);
     148      IEnumerable<double> originalTestValues = problemData.Dataset.GetEnumeratedVariableValues(targetVariableIndex, problemData.TestIndizes);
    151149      List<double> trainingMse = new List<double>();
    152150      List<double> trainingR2 = new List<double>();
     
    162160      foreach (var tree in trees) {
    163161        #region training
    164         var estimatedTrainingValues = interpreter.GetSymbolicExpressionTreeValues(tree, problemData.Dataset, Enumerable.Range(trainingStart, trainingEnd - trainingStart));
     162        var estimatedTrainingValues = interpreter.GetSymbolicExpressionTreeValues(tree, problemData.Dataset, problemData.TrainingIndizes);
    165163        mseEvaluator.Reset();
    166164        r2Evaluator.Reset();
     
    184182        #endregion
    185183        #region test
    186         var estimatedTestValues = interpreter.GetSymbolicExpressionTreeValues(tree, problemData.Dataset, Enumerable.Range(testStart, testEnd - testStart));
     184        var estimatedTestValues = interpreter.GetSymbolicExpressionTreeValues(tree, problemData.Dataset, problemData.TestIndizes);
    187185
    188186        mseEvaluator.Reset();
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/SymbolicRegressionModelQualityCalculator.cs

    r4068 r4468  
    2020#endregion
    2121
     22using System;
    2223using HeuristicLab.Core;
    2324using HeuristicLab.Data;
     
    3536  [Item("SymbolicRegressionModelQualityCalculator", "An operator to calculate the quality values of a symbolic regression solution symbolic expression tree encoding.")]
    3637  [StorableClass]
     38  [Obsolete("This class should not be used anymore because of performance reasons and will therefore not be updated.")]
    3739  public sealed class SymbolicRegressionModelQualityCalculator : AlgorithmOperator {
    3840    private const string SymbolicExpressionTreeInterpreterParameterName = "SymbolicExpressionTreeInterpreter";
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/SymbolicRegressionTournamentPruning.cs

    r4191 r4468  
    2828using HeuristicLab.Optimization;
    2929using HeuristicLab.Parameters;
     30using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
    3031using HeuristicLab.Problems.DataAnalysis.Symbolic;
    3132using HeuristicLab.Problems.DataAnalysis.Symbolic.Symbols;
    32 using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
    3333
    3434namespace HeuristicLab.Problems.DataAnalysis.Regression.Symbolic.Analyzers {
     
    241241      double lowerEstimationLimit, double upperEstimationLimit,
    242242      double maxPruningRatio, double qualityGainWeight) {
    243       IEnumerable<int> rows = Enumerable.Range(samplesStart, samplesEnd - samplesStart);
     243        IEnumerable<int> rows = Enumerable.Range(samplesStart, samplesEnd - samplesStart)
     244          .Where(i => i < problemData.TestSamplesStart.Value || problemData.TestSamplesEnd.Value <= i);
    244245      int originalSize = tree.Size;
    245246      double originalQuality = evaluator.Evaluate(interpreter, tree,
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Analyzers/ValidationBestScaledSymbolicRegressionSolutionAnalyzer.cs

    r4068 r4468  
    3939  [Item("ValidationBestScaledSymbolicRegressionSolutionAnalyzer", "An operator that analyzes the validation best scaled symbolic regression solution.")]
    4040  [StorableClass]
     41  [Obsolete("This class should not be used anymore because of performance reasons and will therefore not be updated.")]
    4142  public sealed class ValidationBestScaledSymbolicRegressionSolutionAnalyzer : AlgorithmOperator, ISymbolicRegressionAnalyzer {
    4243    private const string SymbolicExpressionTreeParameterName = "SymbolicExpressionTree";
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Evaluators/MultiObjectiveSymbolicRegressionEvaluator.cs

    r4246 r4468  
    2020#endregion
    2121
    22 using System;
    2322using System.Collections.Generic;
     23using System.Linq;
    2424using HeuristicLab.Core;
    2525using HeuristicLab.Data;
     
    122122    public override IOperation Apply() {
    123123      int seed = Random.Next();
    124       IEnumerable<int> rows = SingleObjectiveSymbolicRegressionEvaluator.GenerateRowsToEvaluate(seed, RelativeNumberOfEvaluatedSamples.Value, SamplesStart.Value, SamplesEnd.Value);
     124      IEnumerable<int> rows = SingleObjectiveSymbolicRegressionEvaluator.GenerateRowsToEvaluate(seed, RelativeNumberOfEvaluatedSamples.Value, SamplesStart.Value, SamplesEnd.Value)
     125         .Where(i => i < RegressionProblemData.TestSamplesStart.Value || RegressionProblemData.TestSamplesEnd.Value <= i);
    125126      double[] qualities = Evaluate(SymbolicExpressionTreeInterpreter, SymbolicExpressionTree, RegressionProblemData.Dataset,
    126127        RegressionProblemData.TargetVariable, rows);
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/Evaluators/SingleObjectiveSymbolicRegressionEvaluator.cs

    r4246 r4468  
    2222using System;
    2323using System.Collections.Generic;
     24using System.Linq;
    2425using HeuristicLab.Core;
    2526using HeuristicLab.Data;
     
    140141    public override IOperation Apply() {
    141142      int seed = Random.Next();
    142       IEnumerable<int> rows = GenerateRowsToEvaluate(seed, RelativeNumberOfEvaluatedSamples.Value, SamplesStart.Value, SamplesEnd.Value);
     143      IEnumerable<int> rows = GenerateRowsToEvaluate(seed, RelativeNumberOfEvaluatedSamples.Value, SamplesStart.Value, SamplesEnd.Value)
     144          .Where(i => i < RegressionProblemData.TestSamplesStart.Value || RegressionProblemData.TestSamplesEnd.Value <= i);
    143145      double quality = Evaluate(SymbolicExpressionTreeInterpreter, SymbolicExpressionTree, LowerEstimationLimit.Value, UpperEstimationLimit.Value,
    144146        RegressionProblemData.Dataset,
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/SymbolicRegressionModel.cs

    r4415 r4468  
    6868
    6969    public IEnumerable<double> GetEstimatedValues(DataAnalysisProblemData problemData, int start, int end) {
    70       return interpreter.GetSymbolicExpressionTreeValues(tree, problemData.Dataset, Enumerable.Range(start, end - start));
     70      return GetEstimatedValues(problemData, Enumerable.Range(start, end - start));
     71    }
     72    public IEnumerable<double> GetEstimatedValues(DataAnalysisProblemData problemData, IEnumerable<int> rows) {
     73      return interpreter.GetSymbolicExpressionTreeValues(tree, problemData.Dataset, rows);
    7174    }
    7275
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/SymbolicRegressionProblem.cs

    r4250 r4468  
    170170          fixedBestValidationSolutionAnalyzer.BestKnownQualityParameter.ActualName = BestKnownQualityParameter.Name;
    171171        }
     172
    172173        var bestValidationSolutionAnalyzer = analyzer as ValidationBestScaledSymbolicRegressionSolutionAnalyzer;
    173174        if (bestValidationSolutionAnalyzer != null) {
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/SymbolicRegressionProblemBase.cs

    r4251 r4468  
    125125    }
    126126    public IntValue TrainingSamplesStart {
    127       get { return new IntValue(DataAnalysisProblemData.TrainingSamplesStart.Value); }
     127      get { return new IntValue(DataAnalysisProblemData.TrainingIndizes.First()); }
    128128    }
    129129    public IntValue TrainingSamplesEnd {
    130130      get {
    131         return new IntValue((DataAnalysisProblemData.TrainingSamplesStart.Value +
    132           DataAnalysisProblemData.TrainingSamplesEnd.Value) / 2);
     131        int endIndex = (int)(DataAnalysisProblemData.TrainingIndizes.Count() * (1.0 - DataAnalysisProblemData.ValidationPercentage.Value));
     132        return new IntValue(DataAnalysisProblemData.TrainingIndizes.ElementAt(endIndex));
    133133      }
    134134    }
     
    137137    }
    138138    public IntValue ValidationSamplesEnd {
    139       get { return new IntValue(DataAnalysisProblemData.TrainingSamplesEnd.Value); }
     139      get { return new IntValue(DataAnalysisProblemData.TrainingIndizes.Last() + 1); }
    140140    }
    141141    public IntValue TestSamplesStart {
  • trunk/sources/HeuristicLab.Problems.DataAnalysis.Regression/3.3/Symbolic/SymbolicRegressionSolution.cs

    r4415 r4468  
    6767      get {
    6868        if (estimatedValues == null) RecalculateEstimatedValues();
    69         return estimatedValues.AsEnumerable();
     69        return estimatedValues;
    7070      }
    7171    }
    7272
    7373    public override IEnumerable<double> EstimatedTrainingValues {
    74       get {
    75         if (estimatedValues == null) RecalculateEstimatedValues();
    76         int start = ProblemData.TrainingSamplesStart.Value;
    77         int n = ProblemData.TrainingSamplesEnd.Value - start;
    78         return estimatedValues.Skip(start).Take(n).ToList();
    79       }
     74      get { return GetEstimatedValues(ProblemData.TrainingIndizes); }
    8075    }
    8176
    8277    public override IEnumerable<double> EstimatedTestValues {
    83       get {
    84         if (estimatedValues == null) RecalculateEstimatedValues();
    85         int start = ProblemData.TestSamplesStart.Value;
    86         int n = ProblemData.TestSamplesEnd.Value - start;
    87         return estimatedValues.Skip(start).Take(n).ToList();
    88       }
     78      get { return GetEstimatedValues(ProblemData.TestIndizes); }
     79    }
     80
     81    public virtual IEnumerable<double> GetEstimatedValues(IEnumerable<int> rows) {
     82      if (estimatedValues == null) RecalculateEstimatedValues();
     83      foreach (int row in rows)
     84        yield return estimatedValues[row];
    8985    }
    9086  }
Note: See TracChangeset for help on using the changeset viewer.