source: trunk/sources/HeuristicLab.Problems.DataAnalysis/3.4/OnlineCalculators/DependencyCalculator/HoeffdingsDependenceCalculator.cs @ 13938

Last change on this file since 13938 was 13938, checked in by mkommend, 5 years ago

#2619:

  • Refactored and separated the different feature correlation calculations.
  • Added a checkbox to ignore missing values in the calculation.
File size: 5.5 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25
26namespace HeuristicLab.Problems.DataAnalysis {
27  public class HoeffdingsDependenceCalculator : IDependencyCalculator {
28
29    public double Maximum { get { return 1.0; } }
30
31    public double Minimum { get { return -0.5; } }
32
33    public string Name { get { return "Hoeffdings Dependence"; } }
34
35    public double Calculate(IEnumerable<double> originalValues, IEnumerable<double> estimatedValues, out OnlineCalculatorError errorState) {
36      return HoeffdingsDependenceCalculator.CalculateHoeffdings(originalValues, estimatedValues, out errorState);
37    }
38
39    public static double CalculateHoeffdings(IEnumerable<double> originalValues, IEnumerable<double> estimatedValues, out OnlineCalculatorError errorState) {
40      double d = HoeffD(originalValues, estimatedValues, out errorState);
41      if (errorState != OnlineCalculatorError.None) return double.NaN;
42      return d;
43    }
44
45    public double Calculate(IEnumerable<Tuple<double, double>> values, out OnlineCalculatorError errorState) {
46      return HoeffD(values.Select(v => v.Item1), values.Select(v => v.Item2), out errorState);
47    }
48
49    /// <summary>
50    /// computes Hoeffding's dependence coefficient.
51    /// Source: hoeffd.r from R package hmisc http://cran.r-project.org/web/packages/Hmisc/index.html
52    /// </summary>
53    private static double HoeffD(IEnumerable<double> xs, IEnumerable<double> ys, out OnlineCalculatorError errorState) {
54      double[] rx = TiedRank(xs);
55      double[] ry = TiedRank(ys);
56      if (rx.Length != ry.Length) throw new ArgumentException("The number of elements in xs and ys does not match");
57      double[] rxy = TiedRank(xs, ys);
58
59      int n = rx.Length;
60      double q = 0, r = 0, s = 0;
61      double scaling = 1.0 / (n * (n - 1));
62      for (int i = 0; i < n; i++) {
63        q += (rx[i] - 1) * (rx[i] - 2) * (ry[i] - 1) * (ry[i] - 2) * scaling;
64        r += (rx[i] - 2) * (ry[i] - 2) * rxy[i] * scaling;
65        s += rxy[i] * (rxy[i] - 1) * scaling;
66      }
67      errorState = OnlineCalculatorError.None;
68      // return 30.0 * (q - 2 * (n - 2) * r + (n - 2) * (n - 3) * s) / n / (n - 1) / (n - 2) / (n - 3) / (n - 4);
69      double t0 = q / (n - 2) / (n - 3) / (n - 4);
70      double t1 = 2 * r / (n - 3) / (n - 4);
71      double t2 = s / (n - 4);
72      return 30.0 * (t0 - t1 + t2);
73    }
74
75    private static double[] TiedRank(IEnumerable<double> xs) {
76      var xsArr = xs.ToArray();
77      var idx = Enumerable.Range(1, xsArr.Length).ToArray();
78      Array.Sort(xsArr, idx);
79      CRank(xsArr);
80      Array.Sort(idx, xsArr);
81      return xsArr;
82    }
83
84    /// <summary>
85    /// Calculates the joint rank with midranks for ties. Source: hoeffd.r from R package hmisc http://cran.r-project.org/web/packages/Hmisc/index.html
86    /// </summary>
87    /// <param name="xs"></param>
88    /// <param name="ys"></param>
89    /// <returns></returns>
90    private static double[] TiedRank(IEnumerable<double> xs, IEnumerable<double> ys) {
91      var xsArr = xs.ToArray();
92      var ysArr = ys.ToArray();
93      var r = new double[xsArr.Length];
94      int n = r.Length;
95      for (int i = 0; i < n; i++) {
96        var xi = xsArr[i];
97        var yi = ysArr[i];
98        double ri = 0.0;
99        for (int j = 0; j < n; j++) {
100          if (i != j) {
101            double cx;
102            if (xsArr[j] < xi) cx = 1.0;
103            else if (xsArr[j] > xi) cx = 0.0;
104            else cx = 0.5;  // eq
105            double cy;
106            if (ysArr[j] < yi) cy = 1.0;
107            else if (ysArr[j] > yi) cy = 0.0;
108            else cy = 0.5; // eq
109            ri = ri + cx * cy;
110          }
111        }
112        r[i] = ri;
113      }
114      return r;
115    }
116
117    /// <summary>
118    /// Calculates midranks. Source: Numerical Recipes in C. p 642
119    /// </summary>
120    /// <param name="w">Sorted array of elements, replaces the elements by their rank, including midranking of ties</param>
121    /// <returns></returns>
122    private static void CRank(double[] w) {
123      int i = 0;
124      int n = w.Length;
125      while (i < n - 1) {
126        if (w[i + 1] > w[i]) {    // w[i+1] must be larger or equal w[i] as w must be sorted
127          // not a tie
128          w[i] = i + 1;
129          i++;
130        } else {
131          int j;
132          for (j = i + 1; j < n && w[j] <= w[i]; j++) ; // how far does it go (<= effectively means == as w must be sorted, side-step equality for double values)
133          double rank = 1 + 0.5 * (i + j - 1);
134          int k;
135          for (k = i; k < j; k++) w[k] = rank; // set the rank for all tied entries
136          i = j;
137        }
138      }
139
140      if (i == n - 1) w[n - 1] = n;   // if the last element was not tied, this is its rank
141    }
142  }
143}
Note: See TracBrowser for help on using the repository browser.