[5624] | 1 | #region License Information
|
---|
| 2 | /* HeuristicLab
|
---|
[14185] | 3 | * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
|
---|
[5624] | 4 | *
|
---|
| 5 | * This file is part of HeuristicLab.
|
---|
| 6 | *
|
---|
| 7 | * HeuristicLab is free software: you can redistribute it and/or modify
|
---|
| 8 | * it under the terms of the GNU General Public License as published by
|
---|
| 9 | * the Free Software Foundation, either version 3 of the License, or
|
---|
| 10 | * (at your option) any later version.
|
---|
| 11 | *
|
---|
| 12 | * HeuristicLab is distributed in the hope that it will be useful,
|
---|
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 15 | * GNU General Public License for more details.
|
---|
| 16 | *
|
---|
| 17 | * You should have received a copy of the GNU General Public License
|
---|
| 18 | * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
|
---|
| 19 | */
|
---|
| 20 | #endregion
|
---|
| 21 |
|
---|
[11308] | 22 | using System;
|
---|
[5624] | 23 | using System.Collections.Generic;
|
---|
| 24 | using System.Linq;
|
---|
[11308] | 25 | using System.Linq.Expressions;
|
---|
| 26 | using System.Threading.Tasks;
|
---|
| 27 | using HeuristicLab.Common;
|
---|
[11361] | 28 | using HeuristicLab.Core;
|
---|
[11308] | 29 | using HeuristicLab.Data;
|
---|
[5624] | 30 | using HeuristicLab.Problems.DataAnalysis;
|
---|
[11361] | 31 | using HeuristicLab.Random;
|
---|
[8609] | 32 | using LibSVM;
|
---|
[5624] | 33 |
|
---|
| 34 | namespace HeuristicLab.Algorithms.DataAnalysis {
|
---|
| 35 | public class SupportVectorMachineUtil {
|
---|
/// <summary>
/// Transforms the selected rows of <paramref name="dataset"/> into a data structure as needed by libSVM.
/// </summary>
/// <param name="dataset">The dataset that provides the target values and the input values</param>
/// <param name="targetVariable">The name of the variable whose values become the target vector (y) of the SVM-problem</param>
/// <param name="inputVariables">The names of the input variables; the n-th variable (1-based) is stored with node index n</param>
/// <param name="rowIndices">The rows of the dataset that should be contained in the resulting SVM-problem</param>
/// <returns>A problem data type that can be used to train a support vector machine.</returns>
public static svm_problem CreateSvmProblem(IDataset dataset, string targetVariable, IEnumerable<string> inputVariables, IEnumerable<int> rowIndices) {
  // Enumerate rowIndices exactly once. The sequence may be a deferred query (e.g. a randomly
  // ordered one) that yields different rows each time it is enumerated; reading the targets and
  // the features in two separate passes could then pair targets with the wrong feature rows.
  int[] rows = rowIndices.ToArray();
  double[] targetVector = dataset.GetDoubleValues(targetVariable, rows).ToArray();
  List<string> inputVariablesList = inputVariables.ToList();
  svm_node[][] nodes = new svm_node[targetVector.Length][];

  for (int i = 0; i < rows.Length; i++) {
    var tempRow = new List<svm_node>(inputVariablesList.Count);
    int colIndex = 1; // make sure the smallest node index for SVM = 1
    foreach (var inputVariable in inputVariablesList) {
      double value = dataset.GetDoubleValue(inputVariable, rows[i]);
      // SVM also works with missing values
      // => don't add NaN values in the dataset to the sparse SVM matrix representation
      if (!double.IsNaN(value)) {
        // colIndex only ever increases, so the nodes of a row are added in ascending index order
        tempRow.Add(new svm_node() { index = colIndex, value = value });
      }
      colIndex++;
    }
    nodes[i] = tempRow.ToArray();
  }
  return new svm_problem { l = targetVector.Length, y = targetVector, x = nodes };
}
|
---|
[11308] | 66 |
|
---|
/// <summary>
/// Creates a fresh svm_parameter object populated with the default settings used by this utility class
/// (NU_SVR with an RBF kernel).
/// </summary>
/// <returns>A new svm_parameter object with default values</returns>
public static svm_parameter DefaultParameters() {
  return new svm_parameter {
    svm_type = svm_parameter.NU_SVR,
    kernel_type = svm_parameter.RBF,
    C = 1,
    nu = 0.5,
    gamma = 1,
    p = 1,
    cache_size = 500,
    probability = 0,
    eps = 0.001,
    degree = 3,
    shrinking = 1,
    coef0 = 0
  };
}
|
---|
| 88 |
|
---|
/// <summary>
/// Splits the problem data into folds, builds the scaled training/test SVM-problems for every fold and
/// returns the error computed over these partitions for the given parameters.
/// </summary>
/// <param name="problemData">The problem data that is split into folds</param>
/// <param name="parameters">The SVM parameters used for training on every fold</param>
/// <param name="numberOfFolds">The number of folds</param>
/// <param name="shuffleFolds">Forwarded to the fold generation</param>
/// <returns>The value returned by CalculateCrossValidationPartitions for the generated partitions</returns>
public static double CrossValidate(IDataAnalysisProblemData problemData, svm_parameter parameters, int numberOfFolds, bool shuffleFolds = true) {
  return CalculateCrossValidationPartitions(GenerateSvmPartitions(problemData, numberOfFolds, shuffleFolds), parameters);
}
|
---|
| 93 |
|
---|
/// <summary>
/// Evaluates every combination of the supplied parameter values on the same set of cross-validation
/// partitions and returns the combination with the smallest error.
/// </summary>
/// <param name="cvMse">Receives the smallest error found; stays at Double.MaxValue when no combination produced a smaller, non-NaN error</param>
/// <param name="problemData">The problem data that is split into folds</param>
/// <param name="parameterRanges">Maps the name of a svm_parameter field to the values that should be tried for it</param>
/// <param name="numberOfFolds">The number of folds</param>
/// <param name="shuffleFolds">Forwarded to the fold generation</param>
/// <param name="maxDegreeOfParallelism">Passed to ParallelOptions.MaxDegreeOfParallelism</param>
/// <returns>A clone of the best parameter set, or the default parameters if no combination improved on Double.MaxValue</returns>
public static svm_parameter GridSearch(out double cvMse, IDataAnalysisProblemData problemData, Dictionary<string, IEnumerable<double>> parameterRanges, int numberOfFolds, bool shuffleFolds = true, int maxDegreeOfParallelism = 1) {
  double lowestMse = Double.MaxValue;
  svm_parameter bestParam = DefaultParameters();
  var combinations = parameterRanges.Values.CartesianProduct();
  // one setter per dictionary key; setters[i] belongs to the i-th value of every combination
  var setters = parameterRanges.Keys.Select(GenerateSetter).ToList();
  var partitions = GenerateSvmPartitions(problemData, numberOfFolds, shuffleFolds);

  var syncRoot = new object(); // guards lowestMse and bestParam
  var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };
  Parallel.ForEach(combinations, options, combination => {
    var candidate = DefaultParameters();
    var values = combination.ToList();
    int index = 0;
    while (index < values.Count) {
      setters[index](candidate, values[index]);
      ++index;
    }

    double candidateMse = CalculateCrossValidationPartitions(partitions, candidate);
    if (double.IsNaN(candidateMse))
      return; // combinations without a valid error are ignored

    lock (syncRoot) {
      if (candidateMse < lowestMse) {
        lowestMse = candidateMse;
        bestParam = (svm_parameter)candidate.Clone();
      }
    }
  });

  cvMse = lowestMse;
  return bestParam;
}
|
---|
| 122 |
|
---|
/// <summary>
/// Trains a model on the training problem of every partition, predicts the corresponding test problem
/// and averages the resulting mean squared errors over all partitions.
/// </summary>
/// <param name="partitions">Pairs of (training problem, test problem)</param>
/// <param name="parameters">The SVM parameters used for training</param>
/// <returns>The sum of the per-partition errors divided by the number of partitions; a partition whose
/// error calculator reports an error state contributes NaN, which makes the whole result NaN.</returns>
private static double CalculateCrossValidationPartitions(Tuple<svm_problem, svm_problem>[] partitions, svm_parameter parameters) {
  var mseCalculator = new OnlineMeanSquaredErrorCalculator();
  double mseSum = 0;
  for (int k = 0; k < partitions.Length; ++k) {
    svm_problem training = partitions[k].Item1;
    svm_problem test = partitions[k].Item2;
    var model = svm.svm_train(training, parameters);

    // the calculator is reused, so it is cleared before the test rows of this partition are added
    mseCalculator.Reset();
    int row = 0;
    while (row < test.l) {
      mseCalculator.Add(test.y[row], svm.svm_predict(model, test.x[row]));
      ++row;
    }

    if (mseCalculator.ErrorState == OnlineCalculatorError.None)
      mseSum += mseCalculator.MeanSquaredError;
    else
      mseSum += double.NaN;
  }
  return mseSum / partitions.Length;
}
|
---|
| 139 |
|
---|
/// <summary>
/// Creates one (training problem, test problem) pair per fold. Fold i is used as test data and all other
/// folds are joined to form the training data. Both problems are scaled with a range transform that is
/// computed from the (unscaled) training problem only.
/// </summary>
/// <param name="problemData">The problem data that is split into folds</param>
/// <param name="numberOfFolds">The number of folds and thus the number of returned partitions</param>
/// <param name="shuffleFolds">Forwarded to the fold generation</param>
/// <returns>An array with <paramref name="numberOfFolds"/> pairs of scaled training and test problems</returns>
private static Tuple<svm_problem, svm_problem>[] GenerateSvmPartitions(IDataAnalysisProblemData problemData, int numberOfFolds, bool shuffleFolds = true) {
  // Materialize the rows of every fold exactly once. The folds may be deferred queries; handing them
  // on lazily means they are enumerated again for every training set and every test set, which is
  // wasteful and, for a randomly ordered source, can yield different rows on each enumeration.
  var folds = GenerateFolds(problemData, numberOfFolds, shuffleFolds).Select(fold => fold.ToArray()).ToList();
  var targetVariable = GetTargetVariableName(problemData);
  var partitions = new Tuple<svm_problem, svm_problem>[numberOfFolds];
  for (int i = 0; i < numberOfFolds; ++i) {
    int p = i; // avoid "access to modified closure" warning below
    int[] trainingRows = folds.Where((fold, j) => j != p).SelectMany(fold => fold).ToArray();
    int[] testRows = folds[i];
    var trainingSvmProblem = CreateSvmProblem(problemData.Dataset, targetVariable, problemData.AllowedInputVariables, trainingRows);
    // the scaling is derived from the training data only and then applied to both problems
    var rangeTransform = RangeTransform.Compute(trainingSvmProblem);
    var testSvmProblem = rangeTransform.Scale(CreateSvmProblem(problemData.Dataset, targetVariable, problemData.AllowedInputVariables, testRows));
    partitions[i] = new Tuple<svm_problem, svm_problem>(rangeTransform.Scale(trainingSvmProblem), testSvmProblem);
  }
  return partitions;
}
|
---|
[11308] | 155 |
|
---|
/// <summary>
/// Splits the training rows of <paramref name="problemData"/> into <paramref name="numberOfFolds"/> folds.
/// Regression data is split into consecutive slices (of the shuffled rows if <paramref name="shuffleFolds"/>
/// is set). Classification data is split into stratified folds if <paramref name="shuffleFolds"/> is set
/// and into consecutive slices otherwise.
/// </summary>
/// <param name="problemData">Regression or classification problem data</param>
/// <param name="numberOfFolds">The number of folds in which to split the data</param>
/// <param name="shuffleFolds">Whether the rows should be shuffled (regression) or stratified and shuffled (classification)</param>
/// <returns>An enumerable sequence of folds, where a fold is represented by a sequence of row indices.</returns>
/// <exception cref="ArgumentException">Thrown if the problem data is neither regression nor classification problem data.</exception>
public static IEnumerable<IEnumerable<int>> GenerateFolds(IDataAnalysisProblemData problemData, int numberOfFolds, bool shuffleFolds = true) {
  var random = new MersenneTwister((uint)Environment.TickCount);
  if (problemData is IRegressionProblemData) {
    IEnumerable<int> trainingIndices;
    if (shuffleFolds) {
      // OrderBy is deferred and would draw new random keys on every enumeration. The folds are
      // Skip/Take slices of this sequence, so without materializing it each fold would be cut from
      // a different permutation and the folds would no longer form a partition of the rows.
      trainingIndices = problemData.TrainingIndices.OrderBy(x => random.Next()).ToArray();
    } else {
      trainingIndices = problemData.TrainingIndices;
    }
    return GenerateFolds(trainingIndices, problemData.TrainingPartition.Size, numberOfFolds);
  }
  if (problemData is IClassificationProblemData) {
    // when shuffle is enabled do stratified folds generation, some folds may have zero elements
    // otherwise, generate folds normally
    return shuffleFolds ? GenerateFoldsStratified(problemData as IClassificationProblemData, numberOfFolds, random) : GenerateFolds(problemData.TrainingIndices, problemData.TrainingPartition.Size, numberOfFolds);
  }
  throw new ArgumentException("Problem data is neither regression or classification problem data.");
}
|
---|
| 169 |
|
---|
/// <summary>
/// Stratified fold generation from classification data. Stratification means that we ensure the same distribution of class labels for each fold.
/// The samples are grouped by class label and each group is split into @numberOfFolds parts. The final folds are formed from the joining of
/// the corresponding parts from each class label.
/// </summary>
/// <param name="problemData">The classification problem data.</param>
/// <param name="numberOfFolds">The number of folds in which to split the data.</param>
/// <param name="random">The random generator used to shuffle the rows within each fold.</param>
/// <returns>An enumerable sequence of folds, where a fold is represented by a sequence of row indices.
/// If there are no training rows, <paramref name="numberOfFolds"/> empty folds are returned.</returns>
private static IEnumerable<IEnumerable<int>> GenerateFoldsStratified(IClassificationProblemData problemData, int numberOfFolds, IRandom random) {
  var values = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
  var valuesIndices = problemData.TrainingIndices.Zip(values, (i, v) => new { Index = i, Value = v }).ToList();
  // one list of folds per class label; the k-th entry of every list contributes to the k-th final fold
  var foldsByClass = valuesIndices
    .GroupBy(x => x.Value, x => x.Index)
    .Select(g => GenerateFolds(g, g.Count(), numberOfFolds).ToList())
    .ToList();

  // Iterate over the fold index instead of advancing one enumerator per class until one of them is
  // exhausted: with zero classes that condition is vacuously true and the loop would never terminate.
  // It also avoids holding enumerators that are never disposed.
  for (int fold = 0; fold < numberOfFolds; ++fold) {
    int k = fold; // avoid "access to modified closure" warning below
    yield return foldsByClass.SelectMany(classFolds => classFolds[k]).OrderBy(x => random.Next()).ToList();
  }
}
|
---|
| 188 |
|
---|
/// <summary>
/// Cuts <paramref name="values"/> into <paramref name="numberOfFolds"/> consecutive slices. The slices are
/// lazy Skip/Take views of <paramref name="values"/>, so the source is enumerated again whenever a slice is enumerated.
/// </summary>
/// <param name="values">The values to split</param>
/// <param name="valuesCount">The number of elements that should be distributed over the folds</param>
/// <param name="numberOfFolds">The number of folds to generate</param>
/// <returns>The folds in order; the first (valuesCount % numberOfFolds) folds hold one element more than the others.</returns>
private static IEnumerable<IEnumerable<T>> GenerateFolds<T>(IEnumerable<T> values, int valuesCount, int numberOfFolds) {
  if (valuesCount >= numberOfFolds) {
    int baseSize = valuesCount / numberOfFolds; // elements every fold gets at least
    int leftover = valuesCount % numberOfFolds; // the first 'leftover' folds get one extra element
    int offset = 0;
    for (int fold = 0; fold < numberOfFolds; ++fold) {
      int size = fold < leftover ? baseSize + 1 : baseSize;
      yield return values.Skip(offset).Take(size);
      offset += size;
    }
    yield break;
  }

  // fewer values than folds: one element per fold until the values run out, empty folds afterwards
  for (int fold = 0; fold < numberOfFolds; ++fold) {
    if (fold < valuesCount)
      yield return values.Skip(fold).Take(1);
    else
      yield return Enumerable.Empty<T>();
  }
}
|
---|
| 208 |
|
---|
/// <summary>
/// Builds and compiles a delegate of the form (target, value) => target.fieldName = (FieldType)value
/// for the field of svm_parameter with the given name.
/// </summary>
/// <param name="fieldName">The name of the svm_parameter field that the delegate assigns</param>
/// <returns>A compiled setter that converts the supplied double to the field's type and assigns it</returns>
private static Action<svm_parameter, double> GenerateSetter(string fieldName) {
  ParameterExpression instance = Expression.Parameter(typeof(svm_parameter));
  ParameterExpression newValue = Expression.Parameter(typeof(double));
  MemberExpression field = Expression.Field(instance, fieldName);
  // the field may not be a double, hence the explicit conversion to the field's own type
  UnaryExpression converted = Expression.Convert(newValue, field.Type);
  BinaryExpression assignment = Expression.Assign(field, converted);
  return Expression.Lambda<Action<svm_parameter, double>>(assignment, instance, newValue).Compile();
}
|
---|
| 217 |
|
---|
/// <summary>
/// Returns the target variable of the given problem data. Regression problem data is checked first,
/// then classification problem data.
/// </summary>
/// <param name="problemData">Regression or classification problem data</param>
/// <returns>The name of the target variable</returns>
/// <exception cref="ArgumentException">Thrown if the problem data is neither regression nor classification problem data.</exception>
private static string GetTargetVariableName(IDataAnalysisProblemData problemData) {
  var regression = problemData as IRegressionProblemData;
  if (regression != null) {
    return regression.TargetVariable;
  }

  var classification = problemData as IClassificationProblemData;
  if (classification != null) {
    return classification.TargetVariable;
  }

  throw new ArgumentException("Problem data is neither regression or classification problem data.");
}
|
---|
[5624] | 229 | }
|
---|
| 230 | }
|
---|