Changeset 8913


Ignore:
Timestamp:
11/14/12 18:30:24 (9 years ago)
Author:
gkronber
Message:

#1925: made corrections as suggested by abeham in the code review. In particular hopefully corrected the calculation of the CDF.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ThresholdCalculators/NormalDistributionCutPointsThresholdCalculator.cs

    r8680 r8913  
    103103      if (thresholdList.Count == 2) {
    104104        // this happens if there are no thresholds (distributions for all classes are exactly the same)
    105         // -> all samples should be classified as the first class
    106         classValuesList.Add(originalClasses[0]);
     105        // -> all samples should be classified as the class with the most observations
     106        // group observations by target class and select the class with largest count
     107        classValuesList.Add(targetClassValues.GroupBy(c => c)
     108          .OrderBy(g => g.Count())
     109          .Last().Key);
    107110      } else {
    108111        // at least one reasonable threshold ...
     
    111114
    112115          // determine class with maximal density mass between the thresholds
    113           double maxDensity = LogNormalDensityMass(thresholdList[i], thresholdList[i + 1], classMean[originalClasses[0]], classStdDev[originalClasses[0]]);
     116          double maxDensity = DensityMass(thresholdList[i], thresholdList[i + 1], classMean[originalClasses[0]], classStdDev[originalClasses[0]]);
    114117          double maxDensityClassValue = originalClasses[0];
    115118          foreach (var classValue in originalClasses.Skip(1)) {
    116             double density = LogNormalDensityMass(thresholdList[i], thresholdList[i + 1], classMean[classValue], classStdDev[classValue]);
     119            double density = DensityMass(thresholdList[i], thresholdList[i + 1], classMean[classValue], classStdDev[classValue]);
    117120            if (density > maxDensity) {
    118121              maxDensity = density;
     
    149152    }
    150153
    151     private static double LogNormalDensityMass(double lower, double upper, double mu, double sigma) {
     154    private static double NormalCDF(double mu, double sigma, double x) {
     155      return 0.5 * (1 + alglib.normaldistr.errorfunction((x - mu) / (sigma * Math.Sqrt(2.0))));
     156    }
     157
     158    // determines the value NormalCDF(mu,sigma, upper)  - NormalCDF(mu, sigma, lower)
     159    // = the integral of the PDF of N(mu, sigma) in the range [lower, upper]
     160    private static double DensityMass(double lower, double upper, double mu, double sigma) {
    152161      if (sigma.IsAlmost(0.0)) {
    153         if (lower < mu && mu < upper) return double.PositiveInfinity; // log(1)
    154         else return double.NegativeInfinity; // log(0)
    155       }
    156 
    157       Func<double, double> f = (x) =>
    158         x * -0.5 * Math.Log(2.0 * Math.PI * sigma * sigma) - Math.Pow(x - mu, 3) / (3 * 2.0 * sigma * sigma);
    159 
    160       if (double.IsNegativeInfinity(lower)) return f(upper);
    161       else return f(upper) - f(lower);
    162     }
    163 
     162        if (lower < mu && mu < upper) return 1.0; // all mass is between lower and upper
     163        else return 0; // no mass is between lower and upper
     164      }
     165
     166      if (double.IsNegativeInfinity(lower)) return NormalCDF(mu, sigma, upper);
     167      else return NormalCDF(mu, sigma, upper) - NormalCDF(mu, sigma, lower);
     168    }
     169
     170    // Calculates the points x1 and x2 where the distributions N(m1, s1) == N(m2,s2).
     171    // In the general case there should be two cut points. If either s1 or s2 is 0 then x1==x2.
     172    // If both s1 and s2 are zero than there are no cut points but we should return something reasonable (e.g. (m1 + m2) / 2) then.
    164173    private static void CalculateCutPoints(double m1, double s1, double m2, double s2, out double x1, out double x2) {
    165174      if (s1.IsAlmost(s2)) {
     
    168177          x2 = double.NegativeInfinity;
    169178        } else {
     179          // s1==s2 and m1 != m2
     180          // return something reasonable. cut point should be half way between m1 and m2
    170181          x1 = (m1 + m2) / 2;
    171182          x2 = double.NegativeInfinity;
    172183        }
    173184      } else if (s1.IsAlmost(0.0)) {
     185        // when s1 is 0.0 the cut points are exactly at m1 ...
    174186        x1 = m1;
    175187        x2 = m1;
    176188      } else if (s2.IsAlmost(0.0)) {
     189        // ... same for s2
    177190        x1 = m2;
    178191        x2 = m2;
     
    182195          CalculateCutPoints(m2, s2, m1, s1, out x1, out x2);
    183196        } else {
     197          // general case
     198          // calculate the solutions x1, x2 where N(m1,s1) == N(m2,s2)
    184199          double a = (s1 + s2) * (s1 - s2);
    185200          double g = Math.Sqrt(s1 * s1 * s2 * s2 * ((m1 - m2) * (m1 - m2) + 2.0 * (s1 * s1 + s2 * s2) * Math.Log(s2 / s1)));
    186201          double m1s2 = m1 * s2 * s2;
    187202          double m2s1 = m2 * s1 * s1;
    188           x1 = -(-m2s1 + m1s2 + g) / a;
     203          x1 = (m2s1 - m1s2 - g) / a;
    189204          x2 = (m2s1 - m1s2 + g) / a;
    190205        }
Note: See TracChangeset for help on using the changeset viewer.