[5624] | 1 | #region License Information
|
---|
| 2 | /* HeuristicLab
|
---|
[17180] | 3 | * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
|
---|
[5624] | 4 | *
|
---|
| 5 | * This file is part of HeuristicLab.
|
---|
| 6 | *
|
---|
| 7 | * HeuristicLab is free software: you can redistribute it and/or modify
|
---|
| 8 | * it under the terms of the GNU General Public License as published by
|
---|
| 9 | * the Free Software Foundation, either version 3 of the License, or
|
---|
| 10 | * (at your option) any later version.
|
---|
| 11 | *
|
---|
| 12 | * HeuristicLab is distributed in the hope that it will be useful,
|
---|
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 15 | * GNU General Public License for more details.
|
---|
| 16 | *
|
---|
| 17 | * You should have received a copy of the GNU General Public License
|
---|
| 18 | * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
|
---|
| 19 | */
|
---|
| 20 | #endregion
|
---|
| 21 |
|
---|
[11308] | 22 | using System;
|
---|
[5624] | 23 | using System.Collections.Generic;
|
---|
| 24 | using System.Linq;
|
---|
[11308] | 25 | using System.Linq.Expressions;
|
---|
| 26 | using System.Threading.Tasks;
|
---|
| 27 | using HeuristicLab.Common;
|
---|
[11361] | 28 | using HeuristicLab.Core;
|
---|
[11308] | 29 | using HeuristicLab.Data;
|
---|
[5624] | 30 | using HeuristicLab.Problems.DataAnalysis;
|
---|
[11361] | 31 | using HeuristicLab.Random;
|
---|
[8609] | 32 | using LibSVM;
|
---|
[5624] | 33 |
|
---|
| 34 | namespace HeuristicLab.Algorithms.DataAnalysis {
|
---|
| 35 | public class SupportVectorMachineUtil {
|
---|
/// <summary>
/// Transforms <paramref name="dataset"/> into a data structure as needed by libSVM.
/// </summary>
/// <param name="dataset">The source dataset</param>
/// <param name="targetVariable">The target variable. If it is null or empty, a zero vector is used as target.</param>
/// <param name="inputVariables">The selected input variables to include in the svm_problem.</param>
/// <param name="rowIndices">The rows of the dataset that should be contained in the resulting SVM-problem</param>
/// <returns>A problem data type that can be used to train a support vector machine.</returns>
public static svm_problem CreateSvmProblem(IDataset dataset, string targetVariable, IEnumerable<string> inputVariables, IEnumerable<int> rowIndices) {
  // Materialize the rows exactly once. They are needed for the row count, the target values and the
  // feature rows; enumerating a lazy (or non-repeatable) sequence three times is wasteful and can
  // pair target values with the wrong feature rows.
  int[] rows = rowIndices.ToArray();
  List<string> inputVariablesList = inputVariables.ToList();

  double[] targetVector;
  if (string.IsNullOrEmpty(targetVariable)) {
    // if the target variable is not set (e.g. for prediction of a trained model) we just use a zero vector
    targetVector = new double[rows.Length];
  } else {
    targetVector = dataset.GetDoubleValues(targetVariable, rows).ToArray();
  }

  svm_node[][] nodes = new svm_node[rows.Length][];
  for (int i = 0; i < rows.Length; i++) {
    var sparseRow = new List<svm_node>(inputVariablesList.Count);
    int colIndex = 1; // make sure the smallest node index for SVM = 1
    foreach (var inputVariable in inputVariablesList) {
      double value = dataset.GetDoubleValue(inputVariable, rows[i]);
      // SVM also works with missing values
      // => don't add NaN values in the dataset to the sparse SVM matrix representation
      if (!double.IsNaN(value)) {
        // nodes must be sorted in ascending order by column index;
        // this holds because colIndex only ever increases within a row
        sparseRow.Add(new svm_node() { index = colIndex, value = value });
      }
      colIndex++;
    }
    nodes[i] = sparseRow.ToArray();
  }
  return new svm_problem { l = targetVector.Length, y = targetVector, x = nodes };
}
[11308] | 75 |
|
---|
/// <summary>
/// Builds the libSVM representation of <paramref name="dataset"/> for prediction with an already trained model.
/// </summary>
/// <param name="dataset">The problem data to transform</param>
/// <param name="inputVariables">The selected input variables to include in the svm_problem.</param>
/// <param name="rowIndices">The rows of the dataset that should be contained in the resulting SVM-problem</param>
/// <returns>A problem data type that can be used for prediction with a trained support vector machine.</returns>
public static svm_problem CreateSvmProblem(IDataset dataset, IEnumerable<string> inputVariables, IEnumerable<int> rowIndices) {
  // prediction does not need target values, so an empty target variable name is passed on
  string noTargetVariable = string.Empty;
  return CreateSvmProblem(dataset, noTargetVariable, inputVariables, rowIndices);
}
| 87 |
|
---|
/// <summary>
/// Creates a fresh svm_parameter object that is initialized with the default settings
/// (nu-SVR with an RBF kernel).
/// </summary>
/// <returns>A svm_parameter object with default values</returns>
public static svm_parameter DefaultParameters() {
  return new svm_parameter {
    svm_type = svm_parameter.NU_SVR,
    kernel_type = svm_parameter.RBF,
    C = 1,
    nu = 0.5,
    gamma = 1,
    p = 1,
    cache_size = 500,
    probability = 0,
    eps = 0.001,
    degree = 3,
    shrinking = 1,
    coef0 = 0
  };
}
| 109 |
|
---|
/// <summary>
/// Performs a cross-validation of a support vector machine with the given <paramref name="parameters"/>.
/// </summary>
/// <param name="problemData">The problem data whose training rows are split into folds.</param>
/// <param name="parameters">The SVM parameters used to train the model of each fold.</param>
/// <param name="numberOfFolds">The number of folds.</param>
/// <param name="shuffleFolds">Whether the rows are shuffled when the folds are generated.</param>
/// <returns>The mean squared error on the test folds, averaged over all folds.</returns>
public static double CrossValidate(IDataAnalysisProblemData problemData, svm_parameter parameters, int numberOfFolds, bool shuffleFolds = true) {
  Tuple<svm_problem, svm_problem>[] trainingAndTestProblems = GenerateSvmPartitions(problemData, numberOfFolds, shuffleFolds);
  double averageTestMse = CalculateCrossValidationPartitions(trainingAndTestProblems, parameters);
  return averageTestMse;
}
| 114 |
|
---|
/// <summary>
/// Evaluates every combination of the supplied parameter values by cross-validation and returns the
/// combination with the smallest cross-validation mean squared error.
/// </summary>
/// <param name="cvMse">Receives the smallest MSE found. It stays at Double.MaxValue when no combination yields a non-NaN MSE.</param>
/// <param name="problemData">The problem data used for cross-validation.</param>
/// <param name="parameterRanges">Maps svm_parameter field names to the values that should be tried for that field.</param>
/// <param name="numberOfFolds">The number of cross-validation folds.</param>
/// <param name="shuffleFolds">Whether the rows are shuffled when the folds are generated.</param>
/// <param name="maxDegreeOfParallelism">The maximum number of parameter combinations evaluated concurrently.</param>
/// <returns>The best parameters found, or the default parameters when no combination yields a non-NaN MSE.</returns>
public static svm_parameter GridSearch(out double cvMse, IDataAnalysisProblemData problemData, Dictionary<string, IEnumerable<double>> parameterRanges, int numberOfFolds, bool shuffleFolds = true, int maxDegreeOfParallelism = 1) {
  double bestMse = Double.MaxValue;
  svm_parameter bestParam = DefaultParameters();
  var combinations = parameterRanges.Values.CartesianProduct();

  // one compiled setter per parameter name, in the same order as the values of a combination
  var setters = new List<Action<svm_parameter, double>>();
  foreach (string fieldName in parameterRanges.Keys)
    setters.Add(GenerateSetter(fieldName));

  // the folds are generated once and shared by all parameter combinations
  var partitions = GenerateSvmPartitions(problemData, numberOfFolds, shuffleFolds);

  var syncRoot = new object(); // guards bestMse and bestParam
  var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };
  Parallel.ForEach(combinations, options, combination => {
    var candidate = DefaultParameters();
    var candidateValues = combination.ToList();
    int index = 0;
    while (index < candidateValues.Count) {
      setters[index](candidate, candidateValues[index]);
      ++index;
    }

    double candidateMse = CalculateCrossValidationPartitions(partitions, candidate);
    if (double.IsNaN(candidateMse)) return; // combinations without a valid MSE are ignored

    lock (syncRoot) {
      if (candidateMse < bestMse) {
        bestMse = candidateMse;
        bestParam = (svm_parameter)candidate.Clone();
      }
    }
  });

  cvMse = bestMse;
  return bestParam;
}
| 143 |
|
---|
/// <summary>
/// Trains a model on the training problem of every partition, evaluates it on the corresponding
/// test problem and averages the resulting mean squared errors.
/// </summary>
/// <param name="partitions">Pairs of (training problem, test problem), one pair per fold.</param>
/// <param name="parameters">The SVM parameters used for training.</param>
/// <returns>The average test MSE over all partitions. The result is NaN as soon as the MSE of one fold cannot be calculated.</returns>
private static double CalculateCrossValidationPartitions(Tuple<svm_problem, svm_problem>[] partitions, svm_parameter parameters) {
  var mseCalculator = new OnlineMeanSquaredErrorCalculator();
  double mseSum = 0;
  for (int fold = 0; fold < partitions.Length; ++fold) {
    var training = partitions[fold].Item1;
    var test = partitions[fold].Item2;
    var model = svm.svm_train(training, parameters);

    // the calculator is reused, so the state of the previous fold must be cleared
    mseCalculator.Reset();
    int row = 0;
    while (row < test.l) {
      mseCalculator.Add(test.y[row], svm.svm_predict(model, test.x[row]));
      ++row;
    }

    if (mseCalculator.ErrorState == OnlineCalculatorError.None) {
      mseSum += mseCalculator.MeanSquaredError;
    } else {
      mseSum += double.NaN;
    }
  }
  return mseSum / partitions.Length;
}
| 160 |
|
---|
/// <summary>
/// Creates the scaled training and test SVM problems for every cross-validation fold.
/// </summary>
/// <param name="problemData">The problem data that supplies the dataset, the allowed input variables and the training rows.</param>
/// <param name="numberOfFolds">The number of folds.</param>
/// <param name="shuffleFolds">Whether the rows are shuffled when the folds are generated.</param>
/// <returns>
/// One (training problem, test problem) pair per fold. Fold i is the test problem of pair i and all other
/// folds form its training problem. Both are scaled with the range transform computed from the training problem.
/// </returns>
private static Tuple<svm_problem, svm_problem>[] GenerateSvmPartitions(IDataAnalysisProblemData problemData, int numberOfFolds, bool shuffleFolds = true) {
  // Materialize every fold exactly once. The folds are lazy sequences that would otherwise be
  // re-enumerated for each training set and several times inside CreateSvmProblem.
  List<int[]> folds = GenerateFolds(problemData, numberOfFolds, shuffleFolds).Select(fold => fold.ToArray()).ToList();
  var targetVariable = GetTargetVariableName(problemData);
  var dataset = problemData.Dataset;
  var inputVariables = problemData.AllowedInputVariables.ToList();

  var partitions = new Tuple<svm_problem, svm_problem>[numberOfFolds];
  for (int i = 0; i < numberOfFolds; ++i) {
    int testFold = i; // avoid "access to modified closure" warning below
    int[] trainingRows = folds.Where((fold, j) => j != testFold).SelectMany(fold => fold).ToArray();
    int[] testRows = folds[i];

    var trainingSvmProblem = CreateSvmProblem(dataset, targetVariable, inputVariables, trainingRows);
    // the scaling is derived from the training rows only and then applied to both problems
    var rangeTransform = RangeTransform.Compute(trainingSvmProblem);
    var testSvmProblem = rangeTransform.Scale(CreateSvmProblem(dataset, targetVariable, inputVariables, testRows));
    partitions[i] = new Tuple<svm_problem, svm_problem>(rangeTransform.Scale(trainingSvmProblem), testSvmProblem);
  }
  return partitions;
}
[11308] | 176 |
|
---|
/// <summary>
/// Splits the training rows of <paramref name="problemData"/> into <paramref name="numberOfFolds"/> folds.
/// </summary>
/// <param name="problemData">Regression or classification problem data.</param>
/// <param name="numberOfFolds">The number of folds in which to split the training rows.</param>
/// <param name="shuffleFolds">
/// For regression data: whether the rows are shuffled before they are split.
/// For classification data: whether stratified (and shuffled) folds are generated.
/// </param>
/// <returns>A sequence of folds, where a fold is represented by a sequence of row indices.</returns>
/// <exception cref="ArgumentException">Thrown when the problem data is neither regression nor classification problem data.</exception>
public static IEnumerable<IEnumerable<int>> GenerateFolds(IDataAnalysisProblemData problemData, int numberOfFolds, bool shuffleFolds = true) {
  var random = new MersenneTwister((uint)Environment.TickCount);
  if (problemData is IRegressionProblemData) {
    // Shuffle exactly once and materialize the result. A deferred OrderBy with random keys would
    // produce a new permutation on every enumeration, so folds sliced from it with Skip/Take
    // would neither be disjoint nor stable between enumerations.
    List<int> trainingIndices = shuffleFolds
      ? problemData.TrainingIndices.OrderBy(x => random.Next()).ToList()
      : problemData.TrainingIndices.ToList();
    // the fold sizes are derived from the number of indices that are actually split
    return GenerateFolds(trainingIndices, trainingIndices.Count, numberOfFolds);
  }
  if (problemData is IClassificationProblemData) {
    // when shuffle is enabled do stratified folds generation, some folds may have zero elements
    // otherwise, generate folds normally
    if (shuffleFolds)
      return GenerateFoldsStratified(problemData as IClassificationProblemData, numberOfFolds, random);
    List<int> trainingIndices = problemData.TrainingIndices.ToList();
    return GenerateFolds(trainingIndices, trainingIndices.Count, numberOfFolds);
  }
  throw new ArgumentException("Problem data is neither regression or classification problem data.");
}
| 190 |
|
---|
/// <summary>
/// Stratified fold generation from classification data. Stratification means that we ensure the same distribution of class labels for each fold.
/// The samples are grouped by class label and each group is split into <paramref name="numberOfFolds"/> parts. The final folds are formed from the joining of
/// the corresponding parts from each class label.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="numberOfFolds">The number of folds in which to split the data.</param>
/// <param name="random">The random generator used to shuffle the folds.</param>
/// <returns>
/// An enumerable sequence of folds, where a fold is represented by a sequence of row indices.
/// If there are no training samples, <paramref name="numberOfFolds"/> empty folds are returned.
/// </returns>
private static IEnumerable<IEnumerable<int>> GenerateFoldsStratified(IClassificationProblemData problemData, int numberOfFolds, IRandom random) {
  var values = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
  var valuesIndices = problemData.TrainingIndices.Zip(values, (i, v) => new { Index = i, Value = v }).ToList();

  // Without samples there are no class groups. Enumerable.All is vacuously true for an empty list,
  // so the loop below would never terminate; return empty folds instead, like the non-stratified
  // fold generation does when there are fewer values than folds.
  if (valuesIndices.Count == 0) {
    for (int i = 0; i < numberOfFolds; ++i)
      yield return new List<int>();
    yield break;
  }

  // one fold enumerator per class label; all of them are advanced in lock-step
  var enumerators = valuesIndices
    .GroupBy(x => x.Value, x => x.Index)
    .Select(g => GenerateFolds(g, g.Count(), numberOfFolds).GetEnumerator())
    .ToList();
  try {
    while (enumerators.All(e => e.MoveNext())) {
      // join the current part of every class and shuffle the rows within the fold
      yield return enumerators.SelectMany(e => e.Current).OrderBy(x => random.Next()).ToList();
    }
  } finally {
    foreach (var enumerator in enumerators)
      enumerator.Dispose();
  }
}
| 209 |
|
---|
/// <summary>
/// Splits <paramref name="values"/> into <paramref name="numberOfFolds"/> consecutive folds.
/// The first (valuesCount % numberOfFolds) folds contain one element more than the remaining folds.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <param name="values">The sequence to split. Every fold is a lazy Skip/Take view on this sequence.</param>
/// <param name="valuesCount">The number of elements of <paramref name="values"/> used to calculate the fold sizes.</param>
/// <param name="numberOfFolds">The number of folds.</param>
/// <returns>The folds in order. If there are fewer values than folds, the surplus folds are empty.</returns>
private static IEnumerable<IEnumerable<T>> GenerateFolds<T>(IEnumerable<T> values, int valuesCount, int numberOfFolds) {
  if (valuesCount < numberOfFolds) {
    // fewer values than folds: one element per fold as long as elements are left, empty folds afterwards
    int index = 0;
    while (index < numberOfFolds) {
      if (index < valuesCount)
        yield return values.Skip(index).Take(1);
      else
        yield return Enumerable.Empty<T>();
      ++index;
    }
    yield break;
  }

  int baseSize = valuesCount / numberOfFolds;  // minimum number of elements per fold
  int remainder = valuesCount % numberOfFolds; // number of leading folds that get one extra element
  int offset = 0;
  for (int fold = 0; fold < numberOfFolds; ++fold) {
    int size = fold < remainder ? baseSize + 1 : baseSize;
    yield return values.Skip(offset).Take(size);
    offset += size;
  }
}
| 229 |
|
---|
/// <summary>
/// Compiles a delegate that assigns a double value to the svm_parameter field named <paramref name="fieldName"/>.
/// The value is converted to the type of the field before it is assigned.
/// </summary>
/// <param name="fieldName">The name of the svm_parameter field to set.</param>
/// <returns>A setter taking the target svm_parameter and the value to assign.</returns>
private static Action<svm_parameter, double> GenerateSetter(string fieldName) {
  ParameterExpression target = Expression.Parameter(typeof(svm_parameter));
  ParameterExpression value = Expression.Parameter(typeof(double));
  MemberExpression field = Expression.Field(target, fieldName);
  // the grid values are doubles, the field may have a different type (hence the conversion)
  UnaryExpression convertedValue = Expression.Convert(value, field.Type);
  BinaryExpression assignment = Expression.Assign(field, convertedValue);
  return Expression.Lambda<Action<svm_parameter, double>>(assignment, target, value).Compile();
}
| 238 |
|
---|
/// <summary>
/// Returns the name of the target variable of regression or classification problem data.
/// </summary>
/// <param name="problemData">The problem data to inspect.</param>
/// <returns>The target variable of the problem data.</returns>
/// <exception cref="ArgumentException">Thrown when the problem data is neither regression nor classification problem data.</exception>
private static string GetTargetVariableName(IDataAnalysisProblemData problemData) {
  // regression problem data is checked first, then classification problem data
  var asRegression = problemData as IRegressionProblemData;
  if (asRegression != null) {
    return asRegression.TargetVariable;
  }

  var asClassification = problemData as IClassificationProblemData;
  if (asClassification == null) {
    throw new ArgumentException("Problem data is neither regression or classification problem data.");
  }
  return asClassification.TargetVariable;
}
[5624] | 250 | }
|
---|
| 251 | }
|
---|