Changeset 8917


Ignore:
Timestamp:
11/16/12 20:59:55 (10 years ago)
Author:
gkronber
Message:

#1925 Used the symbolic solution from MATLAB for the calculation of cut points. Fixed a numerical problem in the calculation of NormalCDF by calculating / approximating the logarithm of the normal CDF instead.

Location:
trunk/sources
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ThresholdCalculators/NormalDistributionCutPointsThresholdCalculator.cs

    r8913 r8917  
    9595
    9696      // add small value and large value for the calculation of most influential class in each thresholded section
    97       thresholdList.Insert(0, estimatedValues.Min() - 1);
    98       thresholdList.Add(estimatedValues.Max() + 1);
     97      thresholdList.Insert(0, double.NegativeInfinity);
     98      thresholdList.Add(double.PositiveInfinity);
    9999
    100100      // determine class values for each partition separated by a threshold by calculating the density of all class distributions
    101101      // all points in the partition are classified as the class with the maximal density in the partition
    102       List<double> classValuesList = new List<double>();
    103102      if (thresholdList.Count == 2) {
    104103        // this happens if there are no thresholds (distributions for all classes are exactly the same)
    105104        // -> all samples should be classified as the class with the most observations
    106105        // group observations by target class and select the class with largest count
    107         classValuesList.Add(targetClassValues.GroupBy(c => c)
    108           .OrderBy(g => g.Count())
    109           .Last().Key);
     106        double mostFrequentClass = targetClassValues.GroupBy(c => c)
     107                              .OrderBy(g => g.Count())
     108                              .Last().Key;
     109        thresholds = new double[] { double.NegativeInfinity };
     110        classValues = new double[] { mostFrequentClass };
    110111      } else {
     112
    111113        // at least one reasonable threshold ...
    112114        // find the most likely class for the points between thresholds m
     115        List<double> filteredThresholds = new List<double>();
     116        List<double> filteredClassValues = new List<double>();
    113117        for (int i = 0; i < thresholdList.Count - 1; i++) {
    114 
    115118          // determine class with maximal density mass between the thresholds
    116119          double maxDensity = DensityMass(thresholdList[i], thresholdList[i + 1], classMean[originalClasses[0]], classStdDev[originalClasses[0]]);
     
    123126            }
    124127          }
    125           classValuesList.Add(maxDensityClassValue);
    126         }
    127       }
    128 
    129       // only keep thresholds at which the class changes
    130       // class B overrides threshold s. So only thresholds r and t are relevant and have to be kept
    131       //
    132       //      A    B  C
    133       //       /\  /\/\       
    134       //      / r\/ /\t\       
    135       //     /   /\/  \ \     
    136       //    /   / /\s  \ \     
    137       //  -/---/-/ -\---\-\----
    138 
    139       List<double> filteredThresholds = new List<double>();
    140       List<double> filteredClassValues = new List<double>();
    141       filteredThresholds.Add(double.NegativeInfinity); // the smallest possible threshold for the first class
    142       filteredClassValues.Add(classValuesList[0]);
    143       // do not include the last threshold which was just needed for the previous step
    144       for (int i = 0; i < classValuesList.Count - 1; i++) {
    145         if (!classValuesList[i].IsAlmost(classValuesList[i + 1])) {
    146           filteredThresholds.Add(thresholdList[i + 1]);
    147           filteredClassValues.Add(classValuesList[i + 1]);
    148         }
    149       }
    150       thresholds = filteredThresholds.ToArray();
    151       classValues = filteredClassValues.ToArray();
    152     }
    153 
    154     private static double NormalCDF(double mu, double sigma, double x) {
    155       return 0.5 * (1 + alglib.normaldistr.errorfunction((x - mu) / (sigma * Math.Sqrt(2.0))));
     128          if (maxDensity > double.NegativeInfinity &&
     129            (filteredClassValues.Count == 0 || !maxDensityClassValue.IsAlmost(filteredClassValues.Last()))) {
     130            filteredThresholds.Add(thresholdList[i]);
     131            filteredClassValues.Add(maxDensityClassValue);
     132          }
     133        }
     134        thresholds = filteredThresholds.ToArray();
     135        classValues = filteredClassValues.ToArray();
     136      }
     137    }
     138
     139    private static double sqr2 = Math.Sqrt(2.0);
     140    // returns the cumulative distribution function of the standard normal distribution at x
     141    private static double NormalCDF(double x) {
     142      return 0.5 * (1 + alglib.errorfunction(x / sqr2));
     143    }
     144
     145    // approximation of the log of the normal cumulative distribution function from the lightspeed toolbox by Tom Minka
     146    // http://research.microsoft.com/en-us/um/people/minka/software/lightspeed/
     147    private static double[] c = new double[] { -1, 5 / 2.0, -37 / 3.0, 353 / 4.0, -4081 / 5.0, 55205 / 6.0, -854197 / 7.0 };
     148    private static double LogNormalCDF(double x) {
     149      if (x >= -6.5)
     150        // calculate the log directly if x is large enough
     151        return Math.Log(NormalCDF(x));
     152      else {
     153        double z = Math.Pow(x, -2);
     154        // asymptotic series for logcdf
     155        double y = z * (c[0] + z * (c[1] + z * (c[2] + z * (c[3] + z * (c[4] + z * (c[5] + z * c[6]))))));
     156        return y - 0.5 * Math.Log(2 * Math.PI) - 0.5 * x * x - Math.Log(-x);
     157      }
    156158    }
    157159
     
    160162    private static double DensityMass(double lower, double upper, double mu, double sigma) {
    161163      if (sigma.IsAlmost(0.0)) {
    162         if (lower < mu && mu < upper) return 1.0; // all mass is between lower and upper
    163         else return 0; // no mass is between lower and upper
    164       }
    165 
    166       if (double.IsNegativeInfinity(lower)) return NormalCDF(mu, sigma, upper);
    167       else return NormalCDF(mu, sigma, upper) - NormalCDF(mu, sigma, lower);
     164        if (lower < mu && mu < upper) return 0.0; // all mass is between lower and upper: log(1) = 0
     165        else return double.NegativeInfinity; // no mass is between lower and upper: log(0) = -infinity
     166      }
     167
     168      if (lower > mu) {
     169        return DensityMass(-upper, -lower, -mu, sigma);
     170      }
     171
     172      upper = (upper - mu) / sigma;
     173      lower = (lower - mu) / sigma;
     174      if (double.IsNegativeInfinity(lower)) return LogNormalCDF(upper);
     175
     176      return LogNormalCDF(upper) + Math.Log(1 - Math.Exp(LogNormalCDF(lower) - LogNormalCDF(upper)));
    168177    }
    169178
     
    197206          // general case
    198207          // calculate the solutions x1, x2 where N(m1,s1) == N(m2,s2)
    199           double a = (s1 + s2) * (s1 - s2);
    200           double g = Math.Sqrt(s1 * s1 * s2 * s2 * ((m1 - m2) * (m1 - m2) + 2.0 * (s1 * s1 + s2 * s2) * Math.Log(s2 / s1)));
    201           double m1s2 = m1 * s2 * s2;
    202           double m2s1 = m2 * s1 * s1;
    203           x1 = (m2s1 - m1s2 - g) / a;
    204           x2 = (m2s1 - m1s2 + g) / a;
     208          double g = Math.Sqrt(2 * s2 * s2 * Math.Log(s2 / s1) - 2 * s1 * s1 * Math.Log(s2 / s1) - 2 * m1 * m2 + m1 * m1 + m2 * m2);
     209          double s = (s1 * s1 - s2 * s2);
     210          x1 =  (m2 * s1 * s1 - m1 * s2 * s2 + s1 * s2 * g) / s;
     211          x2 = -(m1 * s2 * s2 - m2 * s1 * s1 + s1 * s2 * g) / s;
    205212        }
    206213      }
  • trunk/sources/HeuristicLab.Tests/HeuristicLab.Problems.DataAnalysis-3.4/ThresholdCalculatorsTest.cs

    r8658 r8917  
    7979      {
    8080        // constant output values for all classes
    81         double[] estimatedValues = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
    82         double[] targetClassValues = new double[] { 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0 };
     81        // most frequent class is 0
     82        double[] estimatedValues = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
     83        double[] targetClassValues = new double[] { 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0 };
    8384        double[] classValues;
    8485        double[] thresholds;
     
    179180
    180181
    181         var expectedClassValues = new double[] { 2.0, 1.0, 2.0, 3.0 };
    182         var expectedTresholds = new double[] { double.NegativeInfinity, -18.365068542315438, 1.6573010498191565, 2.314962133866949 };
     182        var expectedClassValues = new double[] { 3.0, 1.0, 2.0, 3.0 };
     183        var expectedTresholds = new double[] { double.NegativeInfinity, -18.36483129043598, 1.6574168546810319, 2.3148463106026012 };
    183184
    184185        AssertEqual(expectedClassValues, classValues);
Note: See TracChangeset for help on using the changeset viewer.