Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
07/10/12 15:26:13 (12 years ago)
Author:
sforsten
Message:

#1292:

  • merged r8034:8179 from trunk
  • added BackgroundWorker
  • added ProgressBar
  • added SpearmansRankCorrelationCoefficientCalculator
  • corrected bug in HoeffdingsDependenceCalculator
  • made some changes in the GUI
Location:
branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis

  • branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationEnsembleSolution.cs

    r7259 r8276  
    3737  [Creatable("Data Analysis - Ensembles")]
    3838  public sealed class ClassificationEnsembleSolution : ClassificationSolution, IClassificationEnsembleSolution {
     39    private readonly Dictionary<int, double> trainingEvaluationCache = new Dictionary<int, double>();
     40    private readonly Dictionary<int, double> testEvaluationCache = new Dictionary<int, double>();
     41
    3942    public new IClassificationEnsembleModel Model {
    4043      get { return (IClassificationEnsembleModel)base.Model; }
     
    8588      }
    8689
     90      trainingEvaluationCache = new Dictionary<int, double>(original.ProblemData.TrainingIndices.Count());
     91      testEvaluationCache = new Dictionary<int, double>(original.ProblemData.TestIndices.Count());
     92
    8793      classificationSolutions = cloner.Clone(original.classificationSolutions);
    8894      RegisterClassificationSolutionsEventHandler();
     
    128134      }
    129135
     136      trainingEvaluationCache = new Dictionary<int, double>(problemData.TrainingIndices.Count());
     137      testEvaluationCache = new Dictionary<int, double>(problemData.TestIndices.Count());
     138
    130139      RegisterClassificationSolutionsEventHandler();
    131140      classificationSolutions.AddRange(solutions);
     
    148157    public override IEnumerable<double> EstimatedTrainingClassValues {
    149158      get {
    150         var rows = ProblemData.TrainingIndizes;
    151         var estimatedValuesEnumerators = (from model in Model.Models
    152                                           select new { Model = model, EstimatedValuesEnumerator = model.GetEstimatedClassValues(ProblemData.Dataset, rows).GetEnumerator() })
    153                                          .ToList();
    154         var rowsEnumerator = rows.GetEnumerator();
    155         // aggregate to make sure that MoveNext is called for all enumerators
    156         while (rowsEnumerator.MoveNext() & estimatedValuesEnumerators.Select(en => en.EstimatedValuesEnumerator.MoveNext()).Aggregate(true, (acc, b) => acc & b)) {
    157           int currentRow = rowsEnumerator.Current;
    158 
    159           var selectedEnumerators = from pair in estimatedValuesEnumerators
    160                                     where RowIsTrainingForModel(currentRow, pair.Model) && !RowIsTestForModel(currentRow, pair.Model)
    161                                     select pair.EstimatedValuesEnumerator;
    162           yield return AggregateEstimatedClassValues(selectedEnumerators.Select(x => x.Current));
     159        var rows = ProblemData.TrainingIndices;
     160        var rowsToEvaluate = rows.Except(trainingEvaluationCache.Keys);
     161        var rowsEnumerator = rowsToEvaluate.GetEnumerator();
     162        var valuesEnumerator = GetEstimatedValues(rowsToEvaluate, (r, m) => RowIsTrainingForModel(r, m) && !RowIsTestForModel(r, m)).GetEnumerator();
     163
     164        while (rowsEnumerator.MoveNext() & valuesEnumerator.MoveNext()) {
     165          trainingEvaluationCache.Add(rowsEnumerator.Current, valuesEnumerator.Current);
    163166        }
     167
     168        return rows.Select(row => trainingEvaluationCache[row]);
    164169      }
    165170    }
     
    167172    public override IEnumerable<double> EstimatedTestClassValues {
    168173      get {
    169         var rows = ProblemData.TestIndizes;
    170         var estimatedValuesEnumerators = (from model in Model.Models
    171                                           select new { Model = model, EstimatedValuesEnumerator = model.GetEstimatedClassValues(ProblemData.Dataset, rows).GetEnumerator() })
    172                                          .ToList();
    173         var rowsEnumerator = ProblemData.TestIndizes.GetEnumerator();
    174         // aggregate to make sure that MoveNext is called for all enumerators
    175         while (rowsEnumerator.MoveNext() & estimatedValuesEnumerators.Select(en => en.EstimatedValuesEnumerator.MoveNext()).Aggregate(true, (acc, b) => acc & b)) {
    176           int currentRow = rowsEnumerator.Current;
    177 
    178           var selectedEnumerators = from pair in estimatedValuesEnumerators
    179                                     where RowIsTestForModel(currentRow, pair.Model)
    180                                     select pair.EstimatedValuesEnumerator;
    181 
    182           yield return AggregateEstimatedClassValues(selectedEnumerators.Select(x => x.Current));
     174        var rows = ProblemData.TestIndices;
     175        var rowsToEvaluate = rows.Except(testEvaluationCache.Keys);
     176        var rowsEnumerator = rowsToEvaluate.GetEnumerator();
     177        var valuesEnumerator = GetEstimatedValues(rowsToEvaluate, RowIsTestForModel).GetEnumerator();
     178
     179        while (rowsEnumerator.MoveNext() & valuesEnumerator.MoveNext()) {
     180          testEvaluationCache.Add(rowsEnumerator.Current, valuesEnumerator.Current);
    183181        }
     182
     183        return rows.Select(row => testEvaluationCache[row]);
     184      }
     185    }
     186
     187    private IEnumerable<double> GetEstimatedValues(IEnumerable<int> rows, Func<int, IClassificationModel, bool> modelSelectionPredicate) {
     188      var estimatedValuesEnumerators = (from model in Model.Models
     189                                        select new { Model = model, EstimatedValuesEnumerator = model.GetEstimatedClassValues(ProblemData.Dataset, rows).GetEnumerator() })
     190                                       .ToList();
     191      var rowsEnumerator = rows.GetEnumerator();
     192      // aggregate to make sure that MoveNext is called for all enumerators
     193      while (rowsEnumerator.MoveNext() & estimatedValuesEnumerators.Select(en => en.EstimatedValuesEnumerator.MoveNext()).Aggregate(true, (acc, b) => acc & b)) {
     194        int currentRow = rowsEnumerator.Current;
     195
     196        var selectedEnumerators = from pair in estimatedValuesEnumerators
     197                                  where modelSelectionPredicate(currentRow, pair.Model)
     198                                  select pair.EstimatedValuesEnumerator;
     199
     200        yield return AggregateEstimatedClassValues(selectedEnumerators.Select(x => x.Current));
    184201      }
    185202    }
     
    196213
    197214    public override IEnumerable<double> GetEstimatedClassValues(IEnumerable<int> rows) {
    198       return from xs in GetEstimatedClassValueVectors(ProblemData.Dataset, rows)
    199              select AggregateEstimatedClassValues(xs);
     215      var rowsToEvaluate = rows.Except(evaluationCache.Keys);
     216      var rowsEnumerator = rowsToEvaluate.GetEnumerator();
     217      var valuesEnumerator = (from xs in GetEstimatedClassValueVectors(ProblemData.Dataset, rowsToEvaluate)
     218                              select AggregateEstimatedClassValues(xs))
     219                             .GetEnumerator();
     220
     221      while (rowsEnumerator.MoveNext() & valuesEnumerator.MoveNext()) {
     222        evaluationCache.Add(rowsEnumerator.Current, valuesEnumerator.Current);
     223      }
     224
     225      return rows.Select(row => evaluationCache[row]);
    200226    }
    201227
     
    223249
    224250    protected override void OnProblemDataChanged() {
     251      trainingEvaluationCache.Clear();
     252      testEvaluationCache.Clear();
     253      evaluationCache.Clear();
     254
    225255      IClassificationProblemData problemData = new ClassificationProblemData(ProblemData.Dataset,
    226256                                                                     ProblemData.AllowedInputVariables,
     
    251281    public void AddClassificationSolutions(IEnumerable<IClassificationSolution> solutions) {
    252282      classificationSolutions.AddRange(solutions);
     283
     284      trainingEvaluationCache.Clear();
     285      testEvaluationCache.Clear();
     286      evaluationCache.Clear();
    253287    }
    254288    public void RemoveClassificationSolutions(IEnumerable<IClassificationSolution> solutions) {
    255289      classificationSolutions.RemoveRange(solutions);
     290
     291      trainingEvaluationCache.Clear();
     292      testEvaluationCache.Clear();
     293      evaluationCache.Clear();
    256294    }
    257295
     
    275313      trainingPartitions[solution.Model] = solution.ProblemData.TrainingPartition;
    276314      testPartitions[solution.Model] = solution.ProblemData.TestPartition;
     315
     316      trainingEvaluationCache.Clear();
     317      testEvaluationCache.Clear();
     318      evaluationCache.Clear();
    277319    }
    278320
     
    282324      trainingPartitions.Remove(solution.Model);
    283325      testPartitions.Remove(solution.Model);
     326
     327      trainingEvaluationCache.Clear();
     328      testEvaluationCache.Clear();
     329      evaluationCache.Clear();
    284330    }
    285331  }
  • branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationProblemData.cs

    r7823 r8276  
    207207
    208208    #region parameter properties
    209     public ConstrainedValueParameter<StringValue> TargetVariableParameter {
    210       get { return (ConstrainedValueParameter<StringValue>)Parameters[TargetVariableParameterName]; }
     209    public IConstrainedValueParameter<StringValue> TargetVariableParameter {
     210      get { return (IConstrainedValueParameter<StringValue>)Parameters[TargetVariableParameterName]; }
    211211    }
    212212    public IFixedValueParameter<StringMatrix> ClassNamesParameter {
  • branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationSolution.cs

    r7259 r8276  
    4444    public ClassificationSolution(IClassificationModel model, IClassificationProblemData problemData)
    4545      : base(model, problemData) {
    46       evaluationCache = new Dictionary<int, double>();
     46      evaluationCache = new Dictionary<int, double>(problemData.Dataset.Rows);
    4747    }
    4848
     
    5151    }
    5252    public override IEnumerable<double> EstimatedTrainingClassValues {
    53       get { return GetEstimatedClassValues(ProblemData.TrainingIndizes); }
     53      get { return GetEstimatedClassValues(ProblemData.TrainingIndices); }
    5454    }
    5555    public override IEnumerable<double> EstimatedTestClassValues {
    56       get { return GetEstimatedClassValues(ProblemData.TestIndizes); }
     56      get { return GetEstimatedClassValues(ProblemData.TestIndices); }
    5757    }
    5858
  • branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationSolutionBase.cs

    r7259 r8276  
    8787    protected void CalculateResults() {
    8888      double[] estimatedTrainingClassValues = EstimatedTrainingClassValues.ToArray(); // cache values
    89       double[] originalTrainingClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes).ToArray();
     89      double[] originalTrainingClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndices).ToArray();
    9090      double[] estimatedTestClassValues = EstimatedTestClassValues.ToArray(); // cache values
    91       double[] originalTestClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TestIndizes).ToArray();
     91      double[] originalTestClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TestIndices).ToArray();
    9292
    9393      OnlineCalculatorError errorState;
  • branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/DiscriminantFunctionClassificationSolution.cs

    r7259 r8276  
    5959    }
    6060    public override IEnumerable<double> EstimatedTrainingClassValues {
    61       get { return GetEstimatedClassValues(ProblemData.TrainingIndizes); }
     61      get { return GetEstimatedClassValues(ProblemData.TrainingIndices); }
    6262    }
    6363    public override IEnumerable<double> EstimatedTestClassValues {
    64       get { return GetEstimatedClassValues(ProblemData.TestIndizes); }
     64      get { return GetEstimatedClassValues(ProblemData.TestIndices); }
    6565    }
    6666
     
    8282    }
    8383    public override IEnumerable<double> EstimatedTrainingValues {
    84       get { return GetEstimatedValues(ProblemData.TrainingIndizes); }
     84      get { return GetEstimatedValues(ProblemData.TrainingIndices); }
    8585    }
    8686    public override IEnumerable<double> EstimatedTestValues {
    87       get { return GetEstimatedValues(ProblemData.TestIndizes); }
     87      get { return GetEstimatedValues(ProblemData.TestIndices); }
    8888    }
    8989
  • branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/DiscriminantFunctionClassificationSolutionBase.cs

    r7259 r8276  
    103103    protected void CalculateRegressionResults() {
    104104      double[] estimatedTrainingValues = EstimatedTrainingValues.ToArray(); // cache values
    105       double[] originalTrainingValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes).ToArray();
     105      double[] originalTrainingValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndices).ToArray();
    106106      double[] estimatedTestValues = EstimatedTestValues.ToArray(); // cache values
    107       double[] originalTestValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TestIndizes).ToArray();
     107      double[] originalTestValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TestIndices).ToArray();
    108108
    109109      OnlineCalculatorError errorState;
     
    140140      double[] classValues;
    141141      double[] thresholds;
    142       var targetClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes);
     142      var targetClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndices);
    143143      AccuracyMaximizationThresholdCalculator.CalculateThresholds(ProblemData, EstimatedTrainingValues, targetClassValues, out classValues, out thresholds);
    144144
     
    149149      double[] classValues;
    150150      double[] thresholds;
    151       var targetClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndizes);
     151      var targetClassValues = ProblemData.Dataset.GetDoubleValues(ProblemData.TargetVariable, ProblemData.TrainingIndices);
    152152      NormalDistributionCutPointsThresholdCalculator.CalculateThresholds(ProblemData, EstimatedTrainingValues, targetClassValues, out classValues, out thresholds);
    153153
  • branches/DatasetFeatureCorrelation/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ThresholdCalculators/AccuracyMaximizationThresholdCalculator.cs

    r7259 r8276  
    5454    public static void CalculateThresholds(IClassificationProblemData problemData, IEnumerable<double> estimatedValues, IEnumerable<double> targetClassValues, out double[] classValues, out double[] thresholds) {
    5555      int slices = 100;
     56      double minThresholdInc = 10e-5; // necessary to prevent infinite loop when maxEstimated - minEstimated is effectively zero (constant model)
    5657      List<double> estimatedValuesList = estimatedValues.ToList();
    5758      double maxEstimatedValue = estimatedValuesList.Max();
    5859      double minEstimatedValue = estimatedValuesList.Min();
    59       double thresholdIncrement = (maxEstimatedValue - minEstimatedValue) / slices;
     60      double thresholdIncrement = Math.Max((maxEstimatedValue - minEstimatedValue) / slices, minThresholdInc);
    6061      var estimatedAndTargetValuePairs =
    6162        estimatedValuesList.Zip(targetClassValues, (x, y) => new { EstimatedValue = x, TargetClassValue = y })
     
    7071
    7172      // incrementally calculate accuracy of all possible thresholds
    72       int[,] confusionMatrix = new int[nClasses, nClasses];
    73 
    7473      for (int i = 1; i < thresholds.Length; i++) {
    7574        double lowerThreshold = thresholds[i - 1];
Note: See TracChangeset for help on using the changeset viewer.