#region License Information /* HeuristicLab * Copyright (C) 2002-2008 Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections.Generic; using System.Linq; using System.Text; using HeuristicLab.Core; using HeuristicLab.Data; using HeuristicLab.DataAnalysis; using HeuristicLab.GP; using HeuristicLab.GP.StructureIdentification; namespace HeuristicLab.LinearRegression { public class LinearRegressionOperator : OperatorBase { private static double constant = 1.0; public LinearRegressionOperator() { AddVariableInfo(new VariableInfo("TargetVariable", "Index of the column of the dataset that holds the target variable", typeof(IntData), VariableKind.In)); AddVariableInfo(new VariableInfo("Dataset", "Dataset with all samples on which to apply the function", typeof(Dataset), VariableKind.In)); AddVariableInfo(new VariableInfo("SamplesStart", "Start index of samples in dataset to evaluate", typeof(IntData), VariableKind.In)); AddVariableInfo(new VariableInfo("SamplesEnd", "End index of samples in dataset to evaluate", typeof(IntData), VariableKind.In)); AddVariableInfo(new VariableInfo("AllowedFeatures", "List of indexes of allowed features", typeof(ItemList), VariableKind.In)); AddVariableInfo(new VariableInfo("LinearRegressionModel", "Formula that was calculated by linear regression", typeof(IFunctionTree), VariableKind.Out | VariableKind.New)); AddVariableInfo(new VariableInfo("TreeSize", "The size (number of nodes) of the tree", typeof(IntData), VariableKind.New | VariableKind.Out)); AddVariableInfo(new VariableInfo("TreeHeight", "The height of the tree", typeof(IntData), VariableKind.New | VariableKind.Out)); } public override IOperation Apply(IScope scope) { int targetVariable = GetVariableValue("TargetVariable", scope, true).Data; Dataset dataset = GetVariableValue("Dataset", scope, true); int start = GetVariableValue("SamplesStart", scope, true).Data; int end = GetVariableValue("SamplesEnd", scope, true).Data; ItemList allowedFeatures = GetVariableValue>("AllowedFeatures", scope, true); List allowedRows = CalculateAllowedRows(dataset, allowedFeatures, targetVariable, start, end); List disallowedFeatures = new List(); foreach (IntData allowedFeature in allowedFeatures) { if (IsAlmost(dataset.GetMinimum(allowedFeature.Data, start, end), 0.0) && IsAlmost(dataset.GetMaximum(allowedFeature.Data, start, end), 0.0)) disallowedFeatures.Add(allowedFeature); } foreach (IntData disallowedFeature in disallowedFeatures) allowedFeatures.Remove(disallowedFeature); double[,] inputMatrix = PrepareInputMatrix(dataset, allowedFeatures, allowedRows); double[] targetVector = PrepareTargetVector(dataset, targetVariable, allowedRows); double[] coefficients = CalculateCoefficients(inputMatrix, targetVector); IFunctionTree tree = CreateModel(coefficients, allowedFeatures); scope.AddVariable(new HeuristicLab.Core.Variable(scope.TranslateName("LinearRegressionModel"), tree)); scope.AddVariable(new HeuristicLab.Core.Variable(scope.TranslateName("TreeSize"), new IntData(tree.Size))); scope.AddVariable(new HeuristicLab.Core.Variable(scope.TranslateName("TreeHeight"), new IntData(tree.Height))); return null; } private bool IsAlmost(double x, double y) { return Math.Abs(x - y) < 1.0E-12; } private IFunctionTree CreateModel(double[] coefficients, ItemList allowedFeatures) { IFunctionTree root = new Addition().GetTreeNode(); IFunctionTree actNode = root; Queue nodes = new Queue(); GP.StructureIdentification.Variable v; for (int i = 0; i < coefficients.Length - 1; i++) { v = new GP.StructureIdentification.Variable(); v.GetVariable(GP.StructureIdentification.Variable.INDEX).Value = new ConstrainedIntData(allowedFeatures[i].Data); v.GetVariable(GP.StructureIdentification.Variable.WEIGHT).Value = new ConstrainedDoubleData(coefficients[i]); v.GetVariable(GP.StructureIdentification.Variable.OFFSET).Value = new ConstrainedIntData(0); nodes.Enqueue(v.GetTreeNode()); } GP.StructureIdentification.Constant c = new Constant(); c.GetVariable(GP.StructureIdentification.Constant.VALUE).Value = new ConstrainedDoubleData(coefficients[coefficients.Length - 1] * 1.0); nodes.Enqueue(c.GetTreeNode()); IFunctionTree newTree; while (nodes.Count != 1) { newTree = new Addition().GetTreeNode(); newTree.AddSubTree(nodes.Dequeue()); newTree.AddSubTree(nodes.Dequeue()); nodes.Enqueue(newTree); } return nodes.Dequeue(); } private double[] CalculateCoefficients(double[,] inputMatrix, double[] targetVector) { double[] weights = new double[targetVector.Length]; double[] coefficients = new double[inputMatrix.GetLength(1)]; for(int i=0;i CalculateAllowedRows(Dataset dataset, ItemList allowedFeatures, int targetVariable, int start, int end) { List allowedRows = new List(); bool add; for (int row = start; row < end; row++) { add = true; for (int col = 0; col < allowedFeatures.Count && add == true; col++) { if (double.IsNaN(dataset.GetValue(row, allowedFeatures[col].Data)) || double.IsNaN(dataset.GetValue(row, targetVariable))) add = false; } if (add) allowedRows.Add(row); add = true; } return allowedRows; } private double[,] PrepareInputMatrix(Dataset dataset, ItemList allowedFeatures, List allowedRows) { int rowCount = allowedRows.Count; double[,] matrix = new double[rowCount, allowedFeatures.Count + 1]; for (int col = 0; col < allowedFeatures.Count; col++) { for (int row = 0; row < allowedRows.Count; row++) matrix[row, col] = dataset.GetValue(allowedRows[row], allowedFeatures[col].Data); } //add constant 1.0 in last column for (int i = 0; i < rowCount; i++) matrix[i, allowedFeatures.Count] = constant; return matrix; } private double[] PrepareTargetVector(Dataset dataset, int targetVariable, List allowedRows) { int rowCount = allowedRows.Count; double[] targetVector = new double[rowCount]; double[] samples = dataset.Samples; for (int row = 0; row < rowCount; row++) { targetVector[row] = dataset.GetValue(allowedRows[row], targetVariable); } return targetVector; } } }