Changeset 10570


Ignore:
Timestamp:
03/11/14 15:41:18 (6 years ago)
Author:
mkommend
Message:

#1998: Added missing value handling in the new implementation of OneR.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/ClassificationModelComparison/HeuristicLab.Algorithms.DataAnalysis/3.4/Linear/OneRTest.cs

    r10569 r10570  
    6767      List<Split> bestSplits = null;
    6868      string bestVariable = string.Empty;
     69      double bestMissingValuesClass = double.NaN;
     70      var classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
    6971
    70       var classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
    7172      foreach (var variable in problemData.AllowedInputVariables) {
    7273        var inputValues = problemData.Dataset.GetDoubleValues(variable, problemData.TrainingIndices);
    7374        var samples = inputValues.Zip(classValues, (i, v) => new Sample(i, v)).OrderBy(s => s.inputValue);
     75
     76        var missingValuesDistribution = samples.Where(s => double.IsNaN(s.inputValue)).GroupBy(s => s.classValue).ToDictionary(s => s.Key, s => s.Count()).MaxItems(s => s.Value).FirstOrDefault();
    7477
    7578        //calculate class distributions for all distinct inputValues
     
    7780        List<double> thresholds = new List<double>();
    7881        double lastValue = double.NaN;
    79         foreach (var sample in samples) {
     82        foreach (var sample in samples.Where(s => !double.IsNaN(s.inputValue))) {
    8083          if (sample.inputValue > lastValue || double.IsNaN(lastValue)) {
    8184            if (!double.IsNaN(lastValue)) thresholds.Add((lastValue + sample.inputValue) / 2);
     
    9699        for (int i = 1; i < classDistributions.Count; i++) {
    97100          var samplesInSplit = distribution.Max(d => d.Value);
    98           //join splits if too few sample in split or the distributions has the same maximum class value
     101          //join splits if there are too few samples in the split or the distributions has the same maximum class value as the current split
    99102          if (samplesInSplit < minBucketSize ||
    100103            classDistributions[i].MaxItems(d => d.Value).Select(d => d.Key).Contains(
     
    113116        int correctClassified = 0;
    114117        int splitIndex = 0;
    115         foreach (var sample in samples) {
     118        foreach (var sample in samples.Where(s => !double.IsNaN(s.inputValue))) {
    116119          while (sample.inputValue >= splits[splitIndex].thresholdValue)
    117120            splitIndex++;
    118121          correctClassified += sample.classValue == splits[splitIndex].classValue ? 1 : 0;
    119122        }
     123        correctClassified += missingValuesDistribution.Value;
    120124
    121125        if (correctClassified > bestClassified) {
     
    123127          bestSplits = splits;
    124128          bestVariable = variable;
     129          bestMissingValuesClass = missingValuesDistribution.Value == 0 ? double.NaN : missingValuesDistribution.Key;
    125130        }
    126131      }
     
    134139      }
    135140
    136       var model = new OneRClassificationModel(bestVariable, bestSplits.Select(s => s.thresholdValue).ToArray(), bestSplits.Select(s => s.classValue).ToArray(), 0);
     141      var model = new OneRClassificationModel(bestVariable, bestSplits.Select(s => s.thresholdValue).ToArray(), bestSplits.Select(s => s.classValue).ToArray(), bestMissingValuesClass);
    137142      var solution = new OneRClassificationSolution(model, (IClassificationProblemData)problemData.Clone());
    138143
Note: See TracChangeset for help on using the changeset viewer.