Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2434_crossvalidation/HeuristicLab.Problems.DataAnalysis.Symbolic.Classification.Views/3.4/SymbolicDiscriminantFunctionClassificationSolutionExcelExporter.cs @ 16752

Last change on this file since 16752 was 11828, checked in by bburlacu, 10 years ago

#2098: Updated error column in the exported excel file to take the correct error (this fixes the MSE calculation).

File size: 14.8 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.IO;
4using System.Linq;
5using System.Text;
6using HeuristicLab.Problems.DataAnalysis.Symbolic.Views;
7using OfficeOpenXml;
8using OfficeOpenXml.Drawing.Chart;
9
10namespace HeuristicLab.Problems.DataAnalysis.Symbolic.Classification.Views {
11  public class SymbolicDiscriminantFunctionClassificationSolutionExcelExporter : SymbolicSolutionExcelExporter {
12    public override void Export(IDataAnalysisSolution solution, string fileName) {
13      var symbDiscriminantSolution = solution as SymbolicDiscriminantFunctionClassificationSolution;
14      if (symbDiscriminantSolution == null) throw new NotSupportedException("This solution cannot be exported to Excel");
15      var formatter = new SymbolicDataAnalysisExpressionExcelFormatter();
16      var formula = formatter.Format(symbDiscriminantSolution.Model.SymbolicExpressionTree, solution.ProblemData.Dataset);
17      ExportChart(fileName, symbDiscriminantSolution, formula);
18    }
19
20    private void ExportChart(string fileName, SymbolicDiscriminantFunctionClassificationSolution solution, string formula) {
21      FileInfo newFile = new FileInfo(fileName);
22      if (newFile.Exists) {
23        newFile.Delete();
24        newFile = new FileInfo(fileName);
25      }
26      var formulaParts = formula.Split(new string[] { Environment.NewLine }, StringSplitOptions.None);
27
28      using (ExcelPackage package = new ExcelPackage(newFile)) {
29        ExcelWorksheet modelWorksheet = package.Workbook.Worksheets.Add("Model");
30        FormatModelSheet(modelWorksheet, solution, formulaParts);
31
32        ExcelWorksheet datasetWorksheet = package.Workbook.Worksheets.Add("Dataset");
33        WriteDatasetToExcel(datasetWorksheet, solution.ProblemData);
34
35        ExcelWorksheet inputsWorksheet = package.Workbook.Worksheets.Add("Inputs");
36        WriteInputSheet(inputsWorksheet, datasetWorksheet, formulaParts.Skip(2), solution.ProblemData.Dataset);
37
38        ExcelWorksheet estimatedWorksheet = package.Workbook.Worksheets.Add("Estimated Values");
39        WriteEstimatedWorksheet(estimatedWorksheet, datasetWorksheet, formulaParts, solution);
40
41        ExcelWorksheet chartsWorksheet = package.Workbook.Worksheets.Add("Charts");
42        AddCharts(chartsWorksheet, solution);
43        package.Workbook.Properties.Title = "Excel Export";
44        package.Workbook.Properties.Author = "HEAL";
45        package.Workbook.Properties.Comments = "Excel export of a symbolic data analysis solution from HeuristicLab";
46
47        package.Save();
48      }
49    }
50
51    private void FormatModelSheet(ExcelWorksheet modelWorksheet, SymbolicDiscriminantFunctionClassificationSolution solution, IEnumerable<string> formulaParts) {
52      int row = 1;
53      modelWorksheet.Cells[row, 1].Value = "Model";
54      modelWorksheet.Cells[row, 2].Value = solution.Name;
55
56      foreach (var part in formulaParts) {
57        modelWorksheet.Cells[row, 4].Value = part;
58        row++;
59      }
60
61      row = 2;
62      modelWorksheet.Cells[row, 1].Value = "Model Depth";
63      modelWorksheet.Cells[row, 2].Value = solution.Model.SymbolicExpressionTree.Depth;
64      row++;
65
66      modelWorksheet.Cells[row, 1].Value = "Model Length";
67      modelWorksheet.Cells[row, 2].Value = solution.Model.SymbolicExpressionTree.Length;
68      row += 2;
69
70      var thresholds = solution.Model.Thresholds.ToList();
71      // skip first (-inf) and last (+inf) thresholds
72      for (int i = 0; i < thresholds.Count; ++i) {
73        if (double.IsInfinity(thresholds[i]) || double.IsNaN(thresholds[i]))
74          continue;
75        modelWorksheet.Cells[row, 1].Value = "Threshold " + i;
76        modelWorksheet.Cells[row, 2].Value = thresholds[i];
77        ++row;
78      }
79      row++;
80
81      modelWorksheet.Cells[row, 1].Value = "Estimation Limits Lower";
82      modelWorksheet.Cells[row, 2].Value = Math.Max(solution.Model.LowerEstimationLimit, -9.99999999999999E+307); // minimal value supported by excel
83      modelWorksheet.Names.Add("EstimationLimitLower", modelWorksheet.Cells[row, 2]);
84      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000E+00";
85      row++;
86
87      modelWorksheet.Cells[row, 1].Value = "Estimation Limits Upper";
88      modelWorksheet.Cells[row, 2].Value = Math.Min(solution.Model.UpperEstimationLimit, 9.99999999999999E+307);  // maximal value supported by excel
89      modelWorksheet.Names.Add("EstimationLimitUpper", modelWorksheet.Cells[row, 2]);
90      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000E+00";
91      row += 2;
92
93      modelWorksheet.Cells[row, 1].Value = "Trainings Partition Start";
94      modelWorksheet.Cells[row, 2].Value = solution.ProblemData.TrainingPartition.Start;
95      modelWorksheet.Names.Add(TRAININGSTART, modelWorksheet.Cells[row, 2]);
96      row++;
97
98      modelWorksheet.Cells[row, 1].Value = "Trainings Partition End";
99      modelWorksheet.Cells[row, 2].Value = solution.ProblemData.TrainingPartition.End;
100      modelWorksheet.Names.Add(TRAININGEND, modelWorksheet.Cells[row, 2]);
101      row++;
102
103      modelWorksheet.Cells[row, 1].Value = "Test Partition Start";
104      modelWorksheet.Cells[row, 2].Value = solution.ProblemData.TestPartition.Start;
105      modelWorksheet.Names.Add(TESTSTART, modelWorksheet.Cells[row, 2]);
106      row++;
107
108      modelWorksheet.Cells[row, 1].Value = "Test Partition End";
109      modelWorksheet.Cells[row, 2].Value = solution.ProblemData.TestPartition.End;
110      modelWorksheet.Names.Add(TESTEND, modelWorksheet.Cells[row, 2]);
111      row += 2;
112
113      string excelTrainingTarget = Indirect("B", true);
114      string excelTrainingEstimated = Indirect("C", true);
115      string excelTrainingClassValues = Indirect("D", true);
116      string excelTrainingMeanError = Indirect("F", true);
117      string excelTrainingMSE = Indirect("G", true);
118
119      string excelTestTarget = Indirect("B", false);
120      string excelTestEstimated = Indirect("C", false);
121      string excelTestClassValues = Indirect("D", false);
122      string excelTestMeanError = Indirect("F", false);
123      string excelTestMSE = Indirect("G", false);
124
125      modelWorksheet.Cells[row, 1].Value = "Accuracy (training)";
126      modelWorksheet.Cells[row, 2].Formula = string.Format("SUMPRODUCT(({0}={1})*1)/COUNT({0})", excelTrainingClassValues, excelTrainingTarget);
127      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000";
128      row++;
129
130      modelWorksheet.Cells[row, 1].Value = "Accuracy (test)";
131      modelWorksheet.Cells[row, 2].Formula = string.Format("SUMPRODUCT(({0}={1})*1)/COUNT({0})", excelTestClassValues, excelTestTarget);
132      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000";
133      row++;
134
135      modelWorksheet.Cells[row, 1].Value = "Pearson's R² (training)";
136      modelWorksheet.Cells[row, 2].Formula = string.Format("POWER(PEARSON({0},{1}),2)", excelTrainingTarget, excelTrainingEstimated);
137      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000";
138      row++;
139
140      modelWorksheet.Cells[row, 1].Value = "Pearson's R² (test)";
141      modelWorksheet.Cells[row, 2].Formula = string.Format("POWER(PEARSON({0},{1}),2)", excelTestTarget, excelTestEstimated);
142      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000";
143      row++;
144
145      modelWorksheet.Cells[row, 1].Value = "Mean Squared Error (training)";
146      modelWorksheet.Cells[row, 2].Formula = string.Format("AVERAGE({0})", excelTrainingMSE);
147      modelWorksheet.Names.Add("TrainingMSE", modelWorksheet.Cells[row, 2]);
148      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000E+00";
149      row++;
150
151      modelWorksheet.Cells[row, 1].Value = "Mean Squared Error (test)";
152      modelWorksheet.Cells[row, 2].Formula = string.Format("AVERAGE({0})", excelTestMSE);
153      modelWorksheet.Names.Add("TestMSE", modelWorksheet.Cells[row, 2]);
154      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000E+00";
155      row++;
156
157      modelWorksheet.Cells[row, 1].Value = "Normalized Gini Coefficient (training)";
158      modelWorksheet.Cells[row, 2].Formula = string.Format("AVERAGE({0})", excelTrainingMeanError);
159      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000E+00";
160      row++;
161
162      modelWorksheet.Cells[row, 1].Value = "Normalized Gini Coefficient (test)";
163      modelWorksheet.Cells[row, 2].Formula = string.Format("AVERAGE({0})", excelTestMeanError);
164      modelWorksheet.Cells[row, 2].Style.Numberformat.Format = "0.000E+00";
165      row++;
166
167      modelWorksheet.Cells["A1:B" + row].AutoFitColumns();
168
169      AddModelTreePicture(modelWorksheet, solution.Model);
170    }
171
172    private void WriteEstimatedWorksheet(ExcelWorksheet estimatedWorksheet, ExcelWorksheet datasetWorksheet, string[] formulaParts, SymbolicDiscriminantFunctionClassificationSolution solution) {
173      string preparedFormula = PrepareFormula(formulaParts);
174      int rows = solution.ProblemData.Dataset.Rows;
175      estimatedWorksheet.Cells[1, 1].Value = "Id"; // A
176      estimatedWorksheet.Cells[1, 2].Value = "Target Variable"; // B
177      estimatedWorksheet.Cells[1, 3].Value = "Estimated Values"; // C
178      estimatedWorksheet.Cells[1, 4].Value = "Estimated Class Values"; // D
179      estimatedWorksheet.Cells[1, 6].Value = "Error"; // F
180      estimatedWorksheet.Cells[1, 7].Value = "Squared Error"; // G
181      estimatedWorksheet.Cells[1, 9].Value = "Unbounded Estimated Values"; // I
182      estimatedWorksheet.Cells[1, 10].Value = "Bounded Estimated Values"; // J
183      estimatedWorksheet.Cells[1, 11].Value = "Random Key"; // K
184
185      var thresholds = solution.Model.Thresholds.Where(x => !double.IsInfinity(x)).ToList();
186      var thresholdsFormula = GenerateThresholdsFormula(thresholds);
187
188      const int columnIndex = 13; // index of beginning columns for class values
189      for (int i = 0; i <= thresholds.Count; ++i) {
190        estimatedWorksheet.Cells[1, i + columnIndex].Value = "Class " + i;
191        if (i < thresholds.Count)
192          estimatedWorksheet.Cells[1, i + columnIndex + thresholds.Count + 1].Value = "Threshold " + i;
193      }
194      estimatedWorksheet.Cells[1, 1, 1, 10].AutoFitColumns();
195
196      int targetIndex = solution.ProblemData.Dataset.VariableNames.ToList().FindIndex(x => x.Equals(solution.ProblemData.TargetVariable)) + 1;
197      for (int i = 0; i < rows; i++) {
198        estimatedWorksheet.Cells[i + 2, 1].Value = i;
199        estimatedWorksheet.Cells[i + 2, 2].Formula = datasetWorksheet.Cells[i + 2, targetIndex].FullAddress; // target values
200        estimatedWorksheet.Cells[i + 2, 9].Formula = string.Format(preparedFormula, i + 2); // formula (estimated) values
201
202        string condition = string.Empty;
203        string rowRef = "C" + (i + 2);
204
205        int nClasses = thresholds.Count + 1;
206        for (int j = columnIndex; j < columnIndex + nClasses; ++j) {
207          int idx = j - columnIndex + 5; // row index for the threshold values
208          if (j == columnIndex) {
209            condition = rowRef + " < Model!$B$" + idx;
210          } else if (j > columnIndex && j < columnIndex + thresholds.Count) {
211            condition = "AND(" + rowRef + "> Model!$B$" + (idx - 1) + ", " + rowRef + " < Model!$B$" + idx + ")";
212          } else if (j == columnIndex + thresholds.Count) {
213            condition = rowRef + " > Model!$B$" + (idx - 1);
214          }
215          estimatedWorksheet.Cells[i + 2, j].Formula = "IF(" + condition + ", " + rowRef + ", #N/A)";
216
217          if (j < columnIndex + thresholds.Count)
218            estimatedWorksheet.Cells[i + 2, j + nClasses].Formula = "Model!$B$" + idx;
219        }
220      }
221      estimatedWorksheet.Cells["B2:B" + (rows + 1)].Style.Numberformat.Format = "0.000";
222
223      estimatedWorksheet.Cells["C2:C" + (rows + 1)].Formula = "J2";
224      estimatedWorksheet.Cells["C2:C" + (rows + 1)].Style.Numberformat.Format = "0.000";
225
226      estimatedWorksheet.Cells["D2:D" + (rows + 1)].Formula = thresholdsFormula;
227      estimatedWorksheet.Cells["D2:D" + (rows + 1)].Style.Numberformat.Format = "0.0";
228
229      estimatedWorksheet.Cells["F2:F" + (rows + 1)].Formula = "B2 - C2";
230      estimatedWorksheet.Cells["F2:F" + (rows + 1)].Style.Numberformat.Format = "0.0";
231
232      estimatedWorksheet.Cells["G2:G" + (rows + 1)].Formula = "POWER(F2, 2)";
233      estimatedWorksheet.Cells["G2:G" + (rows + 1)].Style.Numberformat.Format = "0.000";
234
235      estimatedWorksheet.Cells["I2:I" + (rows + 1)].Style.Numberformat.Format = "0.000";
236      estimatedWorksheet.Cells["J2:J" + (rows + 1)].Formula = "IFERROR(IF(I2 > Model!EstimationLimitUpper, Model!EstimationLimitUpper, IF(I2 < Model!EstimationLimitLower, Model!EstimationLimitLower, I2)), AVERAGE(Model!EstimationLimitLower, Model!EstimationLimitUpper))";
237      estimatedWorksheet.Cells["J2:J" + (rows + 1)].Style.Numberformat.Format = "0.000";
238
239      estimatedWorksheet.Cells["K2:K" + (rows + 1)].Formula = "RAND()";
240      estimatedWorksheet.Cells["K2:K" + (rows + 1)].Style.Numberformat.Format = "0.000";
241    }
242
243    private void AddCharts(ExcelWorksheet chartsWorksheet, DiscriminantFunctionClassificationSolution solution) {
244      List<char> columns = new List<char> { 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };
245      var scatterPlot = chartsWorksheet.Drawings.AddChart("scatterPlot", eChartType.XYScatter);
246      scatterPlot.SetSize(800, 400);
247      scatterPlot.SetPosition(0, 0);
248      scatterPlot.Title.Text = "Scatter Plot";
249      var thresholds = solution.Model.Thresholds.Where(x => !double.IsInfinity(x)).ToList();
250      chartsWorksheet.Names.AddFormula("XKey", "OFFSET('Estimated Values'!$K$1,1,0, COUNTA('Estimated Values'!$K:$K)-1)");
251
252      for (int i = 0; i <= thresholds.Count; ++i) {
253        string header = "Class" + i;
254        chartsWorksheet.Names.AddFormula(header, String.Format("OFFSET('Estimated Values'!${0}$1, 1, 0, COUNTA('Estimated Values'!${0}:${0})-1)", columns[i]));
255        var series = scatterPlot.Series.Add(header, "XKey");
256        series.Header = header;
257
258        if (i < thresholds.Count) {
259          chartsWorksheet.Names.AddFormula("Threshold" + i, String.Format("OFFSET('Estimated Values'!${0}$1, 1, 0, COUNTA('Estimated Values'!${0}:${0})-1)", columns[i + thresholds.Count + 1]));
260          var s = scatterPlot.Series.Add("Threshold" + i, "XKey");
261          s.Header = "Threshold" + i;
262        }
263      }
264    }
265
266    // this method assumes that the thresholds list is sorted in ascending order
267    private static string GenerateThresholdsFormula(List<double> thresholds) {
268      if (thresholds.Count == 1) {
269        return String.Format("IF(J2 < {0}, 0, 1)", thresholds[0]);
270      }
271      var sb = new StringBuilder();
272      sb.Append(String.Format("IF(J2 < {0}, 0,", thresholds[0]));
273      for (int i = 1; i < thresholds.Count; ++i) {
274        double v = thresholds[i];
275        sb.Append(String.Format("IF(J2 < {0}, {1},", v, i));
276      }
277      for (int i = 1; i < thresholds.Count; ++i)
278        sb.Append(")");
279      sb.Append(")");
280      return sb.ToString();
281    }
282  }
283}
Note: See TracBrowser for help on using the repository browser.