#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2017 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
|
---|
21 |
|
---|
using System;
using System.Collections.Generic;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
using HeuristicLab.Problems.DataAnalysis;
|
---|
27 |
|
---|
namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// A principal component basis that is fitted once (see <see cref="Create"/>) on a set of
  /// dataset rows and variables and can afterwards be used to project data into, or back out of,
  /// the component space.
  /// </summary>
  [StorableClass]
  public class PrincipleComponentAnalysisStatic : IDeepCloneable {
    #region Properties
    // Basis matrix returned by alglib.pcabuildbasis; used as Matrix[variable index, component index].
    [Storable]
    private double[,] Matrix { get; set; }

    /// <summary>
    /// The variance array returned by alglib.pcabuildbasis
    /// (presumably one value per basis vector; see the ALGLIB documentation).
    /// </summary>
    [Storable]
    public double[] Variances { get; private set; }

    /// <summary>Names of the input variables the basis was built on, in matrix row order.</summary>
    [Storable]
    public string[] Variables { get; private set; }

    // Per-variable divisor applied before projection (all 1.0 when built without normalization).
    [Storable]
    private double[] Deviations { get; set; }

    // Per-variable offset subtracted before projection (all 0.0 when built without normalization).
    [Storable]
    private double[] Means { get; set; }

    /// <summary>Names of the projected columns: "pc0", "pc1", ... (one per input variable).</summary>
    public string[] Names {
      get { return Variables.Select((_, x) => "pc" + x).ToArray(); }
    }
    #endregion

    #region HLConstructors
    [StorableConstructor]
    protected PrincipleComponentAnalysisStatic(bool deserializing) { }

    /// <summary>
    /// Cloning constructor. Copies every stored array so that the clone shares no mutable state
    /// with <paramref name="original"/>.
    /// </summary>
    protected PrincipleComponentAnalysisStatic(PrincipleComponentAnalysisStatic original, Cloner cloner) {
      // Register the clone so that an object graph referencing the original more than once
      // ends up referencing a single clone (same convention as HeuristicLab's DeepCloneable).
      if (cloner != null) cloner.RegisterClonedObject(original, this);

      if (original.Variances != null) Variances = original.Variances.ToArray();
      if (original.Variables != null) Variables = original.Variables.ToArray();
      // Means and Deviations are read by ProjectData/ReverseProjection and must be copied too;
      // otherwise a clone fails with a NullReferenceException on first use.
      if (original.Deviations != null) Deviations = original.Deviations.ToArray();
      if (original.Means != null) Means = original.Means.ToArray();
      if (original.Matrix != null) Matrix = (double[,])original.Matrix.Clone();
    }

    private PrincipleComponentAnalysisStatic() { }

    public IDeepCloneable Clone(Cloner cloner) {
      return new PrincipleComponentAnalysisStatic(this, cloner);
    }

    public object Clone() {
      return new Cloner().Clone(this);
    }
    #endregion

    #region Static Interface
    /// <summary>
    /// Builds a principal component basis from the given rows and variables of a dataset.
    /// </summary>
    /// <param name="dataset">Dataset to read the values from.</param>
    /// <param name="rows">Row indices used to fit the basis.</param>
    /// <param name="variables">Names of the double variables to include.</param>
    /// <param name="normalize">
    /// If true, every variable is centered by its mean and divided by its population standard
    /// deviation (1 is used when the deviation is almost 0) before the basis is computed.
    /// </param>
    /// <exception cref="InvalidOperationException">alglib reports that the basis could not be built.</exception>
    public static PrincipleComponentAnalysisStatic Create(IDataset dataset, IEnumerable<int> rows, IEnumerable<string> variables, bool normalize = false) {
      var res = new PrincipleComponentAnalysisStatic();
      res.BuildPca(dataset, rows, variables, normalize);
      return res;
    }
    #endregion

    /// <summary>
    /// Projects all rows of the problem's dataset and returns a new regression problem that uses
    /// the component columns as inputs and keeps the target variable and partitions.
    /// </summary>
    public IRegressionProblemData ProjectProblem(IRegressionProblemData pd) {
      var projected = ProjectData(pd.Dataset, pd.AllIndices);
      return CreateProblemData(pd, projected);
    }

    /// <summary>
    /// Projects all rows of <paramref name="data"/> and returns a dataset holding the component
    /// columns followed by the double variables that were not part of the projection.
    /// </summary>
    public IDataset ProjectDataset(IDataset data) {
      var projected = ProjectData(data, Enumerable.Range(0, data.Rows));
      return CreateDataset(data, projected);
    }

    /// <summary>
    /// Projects the given rows into component space.
    /// </summary>
    /// <returns>A [rows, components] array; column j belongs to component "pc" + j.</returns>
    public double[,] ProjectData(IDataset dataset, IEnumerable<int> rows) {
      var instances = rows.ToArray();
      var result = new double[instances.Length, Variables.Length];
      for (var r = 0; r < instances.Length; r++)
        for (var i = 0; i < Variables.Length; i++) {
          // Apply the same scaling that was used when the basis was built.
          var val = (dataset.GetDoubleValue(Variables[i], instances[r]) - Means[i]) / Deviations[i];
          for (var j = 0; j < Variables.Length; j++)
            result[r, j] += val * Matrix[i, j];
        }
      return result;
    }

    /// <summary>
    /// Maps component values back to the original variable space (the inverse of
    /// <see cref="ProjectData"/>; exact only if all components are supplied).
    /// </summary>
    /// <param name="dataset">Dataset containing the component columns.</param>
    /// <param name="rows">Row indices to reconstruct.</param>
    /// <param name="pcaComponents">
    /// Names of the component columns in <paramref name="dataset"/>; the i-th name is treated as
    /// the i-th component (column i of the basis matrix).
    /// </param>
    /// <returns>A [rows, variables] array ordered like <see cref="Variables"/>.</returns>
    public double[,] ReverseProjection(IDataset dataset, IEnumerable<int> rows, IEnumerable<string> pcaComponents) {
      var instances = rows.ToArray();
      var components = pcaComponents.ToArray();

      var result = new double[instances.Length, Variables.Length];
      for (var r = 0; r < instances.Length; r++) {
        // First rotate back into the (scaled) variable space ...
        for (var i = 0; i < components.Length; i++) {
          var val = dataset.GetDoubleValue(components[i], instances[r]);
          for (var j = 0; j < Variables.Length; j++)
            result[r, j] += val * Matrix[j, i];
        }
        // ... then undo the scaling exactly once per variable: x = z * deviation + mean.
        // (Adding the mean inside the component loop would add it once per component and
        // multiply it by the basis entry, which is not the inverse of ProjectData.)
        for (var j = 0; j < Variables.Length; j++)
          result[r, j] = result[r, j] * Deviations[j] + Means[j];
      }
      return result;
    }

    #region Helpers
    // Wraps projected values in a new regression problem with the same target and partitions as pd.
    private IRegressionProblemData CreateProblemData(IRegressionProblemData pd, double[,] pcs) {
      var data = CreateDataset(pd.Dataset, pcs);
      var res = new RegressionProblemData(data, Names, pd.TargetVariable);
      res.TestPartition.Start = pd.TestPartition.Start;
      res.TestPartition.End = pd.TestPartition.End;
      res.TrainingPartition.Start = pd.TrainingPartition.Start;
      res.TrainingPartition.End = pd.TrainingPartition.End;
      return res;
    }

    // Builds a dataset from the component columns in pcs plus every double variable of data
    // that was not an input of the projection.
    private IDataset CreateDataset(IDataset data, double[,] pcs) {
      var n = Names;
      // Hash set lookup instead of a linear array scan per dataset variable.
      var projectedVariables = new HashSet<string>(Variables);
      var n2 = data.DoubleVariables.Where(x => !projectedVariables.Contains(x)).ToArray();
      return new Dataset(n.Concat(n2),
        n.Select((_, x) => Enumerable.Range(0, pcs.GetLength(0)).Select(r => pcs[r, x]).ToList())
          .Concat(n2.Select(x => data.GetDoubleValues(x).ToList())));
    }

    // Computes scaling parameters and the component basis and stores them in this instance.
    private void BuildPca(IDataset dataset, IEnumerable<int> rows, IEnumerable<string> variables, bool normalize) {
      var instances = rows.ToArray();
      var attributes = variables.ToArray();
      Means = normalize
        ? attributes.Select(v => dataset.GetDoubleValues(v, instances).Average()).ToArray()
        : attributes.Select(x => 0.0).ToArray();
      // A (nearly) constant variable would cause a division by ~0, so its deviation is replaced by 1.
      Deviations = normalize
        ? attributes.Select(v => dataset.GetDoubleValues(v, instances).StandardDeviationPop()).Select(x => x.IsAlmost(0.0) ? 1 : x).ToArray()
        : attributes.Select(x => 1.0).ToArray();

      var data = new double[instances.Length, attributes.Length];

      for (var j = 0; j < attributes.Length; j++) {
        var i = 0;
        foreach (var v in dataset.GetDoubleValues(attributes[j], instances)) {
          data[i, j] = (v - Means[j]) / Deviations[j];
          i++;
        }
      }

      int info;
      double[] variances;
      double[,] matrix;
      alglib.pcabuildbasis(data, instances.Length, attributes.Length, out info, out variances, out matrix);
      // According to the ALGLIB documentation info == 1 signals success; anything else means the
      // outputs are not usable, so fail here instead of storing a broken basis.
      if (info != 1)
        throw new InvalidOperationException("The principal component basis could not be built (alglib.pcabuildbasis returned info = " + info + ").");

      Matrix = matrix;
      Variances = variances;
      Variables = attributes;
    }
    #endregion
  }
}