Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.LinearRegression/3.2/LinearRegressionOperator.cs @ 2970

Last change on this file since 2970 was 2843, checked in by gkronber, 15 years ago

Removed max. and min. time offset constraints as algorithm parameters and from all engines. The time constraints were added to the relevant terminal symbols (variable & differential) instead. The time offset constraint can be changed by editing the symbols in the function library. #880 (Max and min time offsets for variable symbols are not set correctly by FunctionLibraryInjectors)

File size: 10.4 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21using System;
22using System.Collections.Generic;
23using System.Linq;
24using System.Text;
25using HeuristicLab.Core;
26using HeuristicLab.Common;
27using HeuristicLab.Data;
28using HeuristicLab.DataAnalysis;
29using HeuristicLab.Modeling;
30using HeuristicLab.GP;
31using HeuristicLab.GP.StructureIdentification;
32using HeuristicLab.GP.Interfaces;
33
34namespace HeuristicLab.LinearRegression {
35  public class LinearRegressionOperator : OperatorBase {
36    private static double constant = 1.0;
37
38    public LinearRegressionOperator() {
39      AddVariableInfo(new VariableInfo("Dataset", "Dataset with all samples on which to apply the function", typeof(Dataset), VariableKind.In));
40      AddVariableInfo(new VariableInfo("TargetVariable", "Name of the target variable", typeof(StringData), VariableKind.In));
41      AddVariableInfo(new VariableInfo("InputVariables", "List of allowed input variable names", typeof(ItemList), VariableKind.In));
42      AddVariableInfo(new VariableInfo("SamplesStart", "Start index of samples in dataset to evaluate", typeof(IntData), VariableKind.In));
43      AddVariableInfo(new VariableInfo("SamplesEnd", "End index of samples in dataset to evaluate", typeof(IntData), VariableKind.In));
44      AddVariableInfo(new VariableInfo("MaxTimeOffset", "(optional) Maximal time offset for time-series prognosis", typeof(IntData), VariableKind.In));
45      AddVariableInfo(new VariableInfo("MinTimeOffset", "(optional) Minimal time offset for time-series prognosis", typeof(IntData), VariableKind.In));
46      AddVariableInfo(new VariableInfo("LinearRegressionModel", "Formula that was calculated by linear regression", typeof(IGeneticProgrammingModel), VariableKind.Out | VariableKind.New));
47    }
48
49    public override IOperation Apply(IScope scope) {
50      Dataset dataset = GetVariableValue<Dataset>("Dataset", scope, true);
51      string targetVariable = GetVariableValue<StringData>("TargetVariable", scope, true).Data;
52      int targetVariableIndex = dataset.GetVariableIndex(targetVariable);
53      int start = GetVariableValue<IntData>("SamplesStart", scope, true).Data;
54      int end = GetVariableValue<IntData>("SamplesEnd", scope, true).Data;
55      IntData maxTimeOffsetData = GetVariableValue<IntData>("MaxTimeOffset", scope, true, false);
56      int maxTimeOffset = maxTimeOffsetData == null ? 0 : maxTimeOffsetData.Data;
57      IntData minTimeOffsetData = GetVariableValue<IntData>("MinTimeOffset", scope, true, false);
58      int minTimeOffset = minTimeOffsetData == null ? 0 : minTimeOffsetData.Data;
59      ItemList inputVariables = GetVariableValue<ItemList>("InputVariables", scope, true, false);
60     
61      IFunctionTree tree;
62      if (inputVariables != null) {
63        tree = CreateModel(dataset, targetVariable, inputVariables.Cast<StringData>().Select(x => x.Data), start, end, minTimeOffset, maxTimeOffset);
64      } else {
65        tree = CreateModel(dataset, targetVariable, dataset.VariableNames, start, end, minTimeOffset, maxTimeOffset);
66      }
67      scope.AddVariable(new HeuristicLab.Core.Variable(scope.TranslateName("LinearRegressionModel"), new GeneticProgrammingModel(tree)));
68      return null;
69    }
70
71    public static IFunctionTree CreateModel(Dataset dataset, string targetVariable, IEnumerable<string> inputVariables, int start, int end) {
72      return CreateModel(dataset, targetVariable, inputVariables, start, end, 0, 0);
73    }
74
75    public static IFunctionTree CreateModel(Dataset dataset, string targetVariable, IEnumerable<string> inputVariables,
76        int start, int end,
77        int minTimeOffset, int maxTimeOffset) {
78      int targetVariableIndex = dataset.GetVariableIndex(targetVariable);
79      List<int> allowedColumns = CalculateAllowedColumns(dataset, targetVariableIndex, inputVariables.Select(x => dataset.GetVariableIndex(x)), start, end);
80      List<int> allowedRows = CalculateAllowedRows(dataset, targetVariableIndex, allowedColumns, start, end, minTimeOffset, maxTimeOffset);
81
82      double[,] inputMatrix = PrepareInputMatrix(dataset, allowedColumns, allowedRows, minTimeOffset, maxTimeOffset);
83      double[] targetVector = PrepareTargetVector(dataset, targetVariableIndex, allowedRows);
84      double[] coefficients = CalculateCoefficients(inputMatrix, targetVector);
85      return CreateModel(coefficients, allowedColumns.Select(i => dataset.GetVariableName(i)).ToList(), minTimeOffset, maxTimeOffset);
86    }
87
88    private static IFunctionTree CreateModel(double[] coefficients, List<string> allowedVariables, int minTimeOffset, int maxTimeOffset) {
89      IFunctionTree root = new Addition().GetTreeNode();
90
91      int timeOffsetRange = (maxTimeOffset - minTimeOffset + 1);
92
93      for (int i = 0; i < allowedVariables.Count; i++) {
94        for (int timeOffset = minTimeOffset; timeOffset <= maxTimeOffset; timeOffset++) {
95          var vNode = (VariableFunctionTree)new GP.StructureIdentification.Variable().GetTreeNode();
96          vNode.VariableName = allowedVariables[i];
97          vNode.Weight = coefficients[(i * timeOffsetRange) + (timeOffset - minTimeOffset)];
98          vNode.SampleOffset = timeOffset;
99          root.AddSubTree(vNode);
100        }
101      }
102      var cNode = (ConstantFunctionTree)new Constant().GetTreeNode();
103
104      cNode.Value = coefficients[coefficients.Length - 1];
105      root.AddSubTree(cNode);
106      return root;
107    }
108
109    private static double[] CalculateCoefficients(double[,] inputMatrix, double[] targetVector) {
110      int retVal = 0;
111      alglib.linreg.linearmodel lm = new alglib.linreg.linearmodel();
112      alglib.linreg.lrreport ar = new alglib.linreg.lrreport();
113      int n = targetVector.Length;
114      int p = inputMatrix.GetLength(1);
115      // no features allowed -> return constant offset
116      if (p <= 1) return new double[] { Statistics.Mean(targetVector) };
117      double[,] dataset = new double[n, p];
118      for (int row = 0; row < n; row++) {
119        for (int column = 0; column < p - 1; column++) {
120          dataset[row, column] = inputMatrix[row, column];
121        }
122        dataset[row, p - 1] = targetVector[row];
123      }
124      alglib.linreg.lrbuild(ref dataset, n, p - 1, ref retVal, ref lm, ref ar);
125      if (retVal != 1) throw new ArgumentException("Error in calculation of linear regression model");
126      Console.Out.WriteLine("ALGLIB Linear Regression: Estimated generalization RMS = {0}", ar.cvrmserror);
127
128      double[] coefficients = new double[p];
129      for (int i = 0; i < p; i++) {
130        coefficients[i] = lm.w[i + 4];
131      }
132      return coefficients;
133    }
134
135    //returns list of valid row indexes (rows without NaN values)
136    private static List<int> CalculateAllowedRows(Dataset dataset, int targetVariable, IList<int> allowedColumns, int start, int end, int minTimeOffset, int maxTimeOffset) {
137      List<int> allowedRows = new List<int>();
138      bool add;
139      for (int row = start; row < end; row++) {
140        add = true;
141        for (int colIndex = 0; colIndex < allowedColumns.Count && add == true; colIndex++) {
142          for (int timeOffset = minTimeOffset; timeOffset <= maxTimeOffset; timeOffset++) {
143            if (
144              row + timeOffset < 0 ||
145              row + timeOffset > dataset.Rows ||
146              double.IsNaN(dataset.GetValue(row + timeOffset, allowedColumns[colIndex])) ||
147              double.IsInfinity(dataset.GetValue(row + timeOffset, allowedColumns[colIndex])) ||
148              double.IsNaN(dataset.GetValue(row + timeOffset, targetVariable))) {
149              add = false;
150            }
151          }
152        }
153        if (add)
154          allowedRows.Add(row);
155        add = true;
156      }
157      return allowedRows;
158    }
159
160    //returns list of valid column indexes (columns which contain max. 10% NaN (or infinity) and contain at least two different values)
161    private static List<int> CalculateAllowedColumns(Dataset dataset, int targetVariable, IEnumerable<int> inputVariables, int start, int end) {
162      List<int> allowedColumns = new List<int>();
163      double n = end - start;
164      foreach (int inputVariable in inputVariables) {// = 0; i < dataset.Columns; i++) {
165        double nanRatio = dataset.CountMissingValues(inputVariable, start, end) / n;
166        if (inputVariable != targetVariable && nanRatio < 0.1 && dataset.GetRange(inputVariable, start, end) > 0.0) {
167          allowedColumns.Add(inputVariable);
168        }
169      }
170      return allowedColumns;
171    }
172
173    private static double[,] PrepareInputMatrix(Dataset dataset, List<int> allowedColumns, List<int> allowedRows, int minTimeOffset, int maxTimeOffset) {
174      int rowCount = allowedRows.Count;
175      int timeOffsetRange = (maxTimeOffset - minTimeOffset + 1);
176      double[,] matrix = new double[rowCount, (allowedColumns.Count * timeOffsetRange) + 1];
177      for (int row = 0; row < allowedRows.Count; row++)
178        for (int col = 0; col < allowedColumns.Count; col++) {
179          for (int timeOffset = minTimeOffset; timeOffset <= maxTimeOffset; timeOffset++)
180            matrix[row, (col * timeOffsetRange) + (timeOffset - minTimeOffset)] = dataset.GetValue(allowedRows[row] + timeOffset, allowedColumns[col]);
181        }
182      //add constant 1.0 in last column
183      for (int i = 0; i < rowCount; i++)
184        matrix[i, allowedColumns.Count * timeOffsetRange] = constant;
185      return matrix;
186    }
187
188    private static double[] PrepareTargetVector(Dataset dataset, int targetVariable, List<int> allowedRows) {
189      int rowCount = allowedRows.Count;
190      double[] targetVector = new double[rowCount];
191      double[] samples = dataset.Samples;
192      for (int row = 0; row < rowCount; row++) {
193        targetVector[row] = dataset.GetValue(allowedRows[row], targetVariable);
194      }
195      return targetVector;
196    }
197  }
198}
Note: See TracBrowser for help on using the repository browser.