Context Navigation

source: stable/HeuristicLab.Algorithms.DataAnalysis/3.4/SupportVectorMachine/SupportVectorMachineUtil.cs @ 16189

Visit:

Last change on this file since 16189 was 16160, checked in by gkronber, 6 years ago
#2905: merged r15854 from trunk to stable
File size: 13.0 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Linq;
25	using System.Linq.Expressions;
26	using System.Threading.Tasks;
27	using HeuristicLab.Common;
28	using HeuristicLab.Core;
29	using HeuristicLab.Data;
30	using HeuristicLab.Problems.DataAnalysis;
31	using HeuristicLab.Random;
32	using LibSVM;
33
namespace HeuristicLab.Algorithms.DataAnalysis {
  /// <summary>
  /// Static helpers that convert HeuristicLab datasets into libSVM data structures and that run
  /// cross-validation and grid search on top of libSVM.
  /// </summary>
  public class SupportVectorMachineUtil {
    /// <summary>
    /// Transforms <paramref name="dataset"/> into a data structure as needed by libSVM.
    /// </summary>
    /// <param name="dataset">The source dataset</param>
    /// <param name="targetVariable">The target variable; when null or empty a zero vector is used as target.</param>
    /// <param name="inputVariables">The selected input variables to include in the svm_problem.</param>
    /// <param name="rowIndices">The rows of the dataset that should be contained in the resulting SVM-problem</param>
    /// <returns>A problem data type that can be used to train a support vector machine.</returns>
    public static svm_problem CreateSvmProblem(IDataset dataset, string targetVariable, IEnumerable<string> inputVariables, IEnumerable<int> rowIndices) {
      // Materialize once: the rows are needed for the row count, the target vector and the
      // feature rows. A lazy (possibly non-repeatable) sequence enumerated three times could
      // yield targets that do not belong to the feature rows.
      List<int> rows = rowIndices.ToList();
      List<string> inputVariablesList = inputVariables.ToList();

      double[] targetVector;
      if (string.IsNullOrEmpty(targetVariable)) {
        // if the target variable is not set (e.g. for prediction of a trained model) we just use a zero vector
        targetVector = new double[rows.Count];
      } else {
        targetVector = dataset.GetDoubleValues(targetVariable, rows).ToArray();
      }

      var nodes = new svm_node[rows.Count][];
      for (int r = 0; r < rows.Count; ++r) {
        var sparseRow = new List<svm_node>(inputVariablesList.Count);
        // The smallest node index for SVM is 1. Indices grow with the position of the input
        // variable, so the nodes of a row are added in ascending index order.
        int colIndex = 1;
        foreach (var inputVariable in inputVariablesList) {
          double value = dataset.GetDoubleValue(inputVariable, rows[r]);
          // SVM also works with missing values
          // => don't add NaN values in the dataset to the sparse SVM matrix representation
          if (!double.IsNaN(value)) {
            sparseRow.Add(new svm_node() { index = colIndex, value = value });
          }
          colIndex++;
        }
        nodes[r] = sparseRow.ToArray();
      }
      return new svm_problem { l = targetVector.Length, y = targetVector, x = nodes };
    }

    /// <summary>
    /// Transforms <paramref name="dataset"/> into a data structure as needed by libSVM for prediction.
    /// </summary>
    /// <param name="dataset">The problem data to transform</param>
    /// <param name="inputVariables">The selected input variables to include in the svm_problem.</param>
    /// <param name="rowIndices">The rows of the dataset that should be contained in the resulting SVM-problem</param>
    /// <returns>A problem data type that can be used for prediction with a trained support vector machine.</returns>
    public static svm_problem CreateSvmProblem(IDataset dataset, IEnumerable<string> inputVariables, IEnumerable<int> rowIndices) {
      // for prediction we don't need a target variable
      return CreateSvmProblem(dataset, string.Empty, inputVariables, rowIndices);
    }

    /// <summary>
    /// Instantiate and return a svm_parameter object with default values.
    /// </summary>
    /// <returns>A svm_parameter object with default values</returns>
    public static svm_parameter DefaultParameters() {
      return new svm_parameter {
        svm_type = svm_parameter.NU_SVR,
        kernel_type = svm_parameter.RBF,
        C = 1,
        nu = 0.5,
        gamma = 1,
        p = 1,
        cache_size = 500,
        probability = 0,
        eps = 0.001,
        degree = 3,
        shrinking = 1,
        coef0 = 0
      };
    }

    /// <summary>
    /// Trains and evaluates a support vector machine with the given parameters on each fold of the training data.
    /// </summary>
    /// <param name="problemData">The regression or classification problem data.</param>
    /// <param name="parameters">The libSVM parameters to evaluate.</param>
    /// <param name="numberOfFolds">The number of cross-validation folds.</param>
    /// <param name="shuffleFolds">Whether the folds are shuffled (see <see cref="GenerateFolds(IDataAnalysisProblemData, int, bool)"/>).</param>
    /// <returns>The test mean squared error averaged over all folds; NaN when the error of a fold cannot be calculated.</returns>
    public static double CrossValidate(IDataAnalysisProblemData problemData, svm_parameter parameters, int numberOfFolds, bool shuffleFolds = true) {
      var partitions = GenerateSvmPartitions(problemData, numberOfFolds, shuffleFolds);
      return CalculateCrossValidationPartitions(partitions, parameters);
    }

    /// <summary>
    /// Evaluates every combination of the given parameter values with cross-validation and returns the best one.
    /// </summary>
    /// <param name="cvMse">
    /// The cross-validation mean squared error of the returned parameters. Remains
    /// <see cref="Double.MaxValue"/> when no combination produced a valid (non-NaN) error.
    /// </param>
    /// <param name="problemData">The regression or classification problem data.</param>
    /// <param name="parameterRanges">Maps names of svm_parameter fields to the values that should be tried for that field.</param>
    /// <param name="numberOfFolds">The number of cross-validation folds.</param>
    /// <param name="shuffleFolds">Whether the folds are shuffled.</param>
    /// <param name="maxDegreeOfParallelism">The maximum number of parameter combinations evaluated concurrently.</param>
    /// <returns>
    /// The parameter set with the smallest cross-validation error, or the default parameters
    /// when no combination produced a valid error.
    /// </returns>
    public static svm_parameter GridSearch(out double cvMse, IDataAnalysisProblemData problemData, Dictionary<string, IEnumerable<double>> parameterRanges, int numberOfFolds, bool shuffleFolds = true, int maxDegreeOfParallelism = 1) {
      // An out parameter cannot be captured by a lambda, so the best error is tracked in a local.
      double bestMse = Double.MaxValue;
      var bestParam = DefaultParameters();

      // Keys and Values of an unmodified dictionary enumerate in the same order, therefore
      // setters[i] belongs to the i-th value of each combination.
      var crossProduct = parameterRanges.Values.CartesianProduct();
      var setters = parameterRanges.Keys.Select(GenerateSetter).ToList();

      // the same partitions are used for all combinations so that the errors are comparable
      var partitions = GenerateSvmPartitions(problemData, numberOfFolds, shuffleFolds);

      var locker = new object(); // for thread synchronization
      var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };
      Parallel.ForEach(crossProduct, options, parameterCombination => {
        var parameters = DefaultParameters();
        var parameterValues = parameterCombination.ToList();
        for (int i = 0; i < parameterValues.Count; ++i) {
          setters[i](parameters, parameterValues[i]);
        }

        double testMse = CalculateCrossValidationPartitions(partitions, parameters);
        if (double.IsNaN(testMse)) return;

        lock (locker) {
          if (testMse < bestMse) {
            bestMse = testMse;
            bestParam = (svm_parameter)parameters.Clone();
          }
        }
      });

      cvMse = bestMse;
      return bestParam;
    }

    /// <summary>
    /// Trains a model on the training part of each partition and calculates the mean squared error on its test part.
    /// </summary>
    /// <param name="partitions">Pairs of (training problem, test problem).</param>
    /// <param name="parameters">The libSVM parameters used for training.</param>
    /// <returns>The average of the test errors; NaN when the error of a partition cannot be calculated.</returns>
    private static double CalculateCrossValidationPartitions(Tuple<svm_problem, svm_problem>[] partitions, svm_parameter parameters) {
      double sumTestMse = 0;
      var calc = new OnlineMeanSquaredErrorCalculator();
      foreach (var partition in partitions) {
        var trainingSvmProblem = partition.Item1;
        var testSvmProblem = partition.Item2;
        var model = svm.svm_train(trainingSvmProblem, parameters);

        calc.Reset();
        for (int i = 0; i < testSvmProblem.l; ++i) {
          calc.Add(testSvmProblem.y[i], svm.svm_predict(model, testSvmProblem.x[i]));
        }

        // a failed calculation makes the whole average NaN so that callers can discard it
        sumTestMse += calc.ErrorState == OnlineCalculatorError.None ? calc.MeanSquaredError : double.NaN;
      }
      return sumTestMse / partitions.Length;
    }

    /// <summary>
    /// Creates one (training problem, test problem) pair per fold. Both problems of a pair are
    /// scaled with the range transform computed from the training problem of that pair.
    /// </summary>
    /// <param name="problemData">The regression or classification problem data.</param>
    /// <param name="numberOfFolds">The number of folds.</param>
    /// <param name="shuffleFolds">Whether the folds are shuffled.</param>
    /// <returns>An array with one pair of scaled SVM problems for each fold.</returns>
    private static Tuple<svm_problem, svm_problem>[] GenerateSvmPartitions(IDataAnalysisProblemData problemData, int numberOfFolds, bool shuffleFolds = true) {
      // Materialize every fold once; each fold is used as test set once and as part of the
      // training set of all other partitions.
      List<List<int>> folds = GenerateFolds(problemData, numberOfFolds, shuffleFolds).Select(fold => fold.ToList()).ToList();
      var targetVariable = GetTargetVariableName(problemData);
      var dataset = problemData.Dataset;
      var inputVariables = problemData.AllowedInputVariables.ToList();

      var partitions = new Tuple<svm_problem, svm_problem>[folds.Count];
      for (int i = 0; i < folds.Count; ++i) {
        var testRows = folds[i];
        var trainingRows = new List<int>();
        for (int j = 0; j < folds.Count; ++j) {
          if (j != i) trainingRows.AddRange(folds[j]);
        }

        var trainingSvmProblem = CreateSvmProblem(dataset, targetVariable, inputVariables, trainingRows);
        var rangeTransform = RangeTransform.Compute(trainingSvmProblem);
        var testSvmProblem = rangeTransform.Scale(CreateSvmProblem(dataset, targetVariable, inputVariables, testRows));
        partitions[i] = new Tuple<svm_problem, svm_problem>(rangeTransform.Scale(trainingSvmProblem), testSvmProblem);
      }
      return partitions;
    }

    /// <summary>
    /// Splits the training rows of <paramref name="problemData"/> into <paramref name="numberOfFolds"/> folds.
    /// </summary>
    /// <param name="problemData">Regression or classification problem data.</param>
    /// <param name="numberOfFolds">The number of folds in which to split the data; must be at least 1.</param>
    /// <param name="shuffleFolds">
    /// For regression data the training rows are shuffled before they are split. For classification
    /// data stratified folds are generated. When false the rows are split in their original order.
    /// </param>
    /// <returns>A sequence of folds, where a fold is represented by a sequence of row indices. Folds can be empty.</returns>
    /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="numberOfFolds"/> is smaller than 1.</exception>
    /// <exception cref="ArgumentException">Thrown when the problem data is neither regression nor classification problem data.</exception>
    public static IEnumerable<IEnumerable<int>> GenerateFolds(IDataAnalysisProblemData problemData, int numberOfFolds, bool shuffleFolds = true) {
      if (numberOfFolds < 1) throw new ArgumentOutOfRangeException("numberOfFolds", "The number of folds must be at least 1.");

      // TickCount can be negative; only its bit pattern is needed as a seed
      var random = new MersenneTwister(unchecked((uint)Environment.TickCount));

      if (problemData is IRegressionProblemData) {
        // OrderBy is evaluated lazily and would draw new random keys on every enumeration.
        // Without materializing the shuffled order each fold would be cut from a different
        // permutation and folds could overlap or miss rows.
        List<int> trainingIndices = shuffleFolds
          ? problemData.TrainingIndices.OrderBy(x => random.Next()).ToList()
          : problemData.TrainingIndices.ToList();
        return GenerateFolds(trainingIndices, trainingIndices.Count, numberOfFolds);
      }

      var classificationProblemData = problemData as IClassificationProblemData;
      if (classificationProblemData != null) {
        // when shuffle is enabled do stratified folds generation, some folds may have zero elements
        // otherwise, generate folds normally
        if (shuffleFolds) return GenerateFoldsStratified(classificationProblemData, numberOfFolds, random);
        List<int> trainingIndices = problemData.TrainingIndices.ToList();
        return GenerateFolds(trainingIndices, trainingIndices.Count, numberOfFolds);
      }

      throw new ArgumentException("Problem data is neither regression or classification problem data.");
    }

    /// <summary>
    /// Stratified fold generation from classification data. Stratification means that we ensure the same distribution of class labels for each fold.
    /// The samples are grouped by class label and each group is split into @numberOfFolds parts. The final folds are formed from the joining of
    /// the corresponding parts from each class label.
    /// </summary>
    /// <param name="problemData">The classification problem data.</param>
    /// <param name="numberOfFolds">The number of folds in which to split the data.</param>
    /// <param name="random">The random generator used to shuffle the rows within each fold.</param>
    /// <returns>An enumerable sequence of exactly <paramref name="numberOfFolds"/> folds, where a fold is represented by a sequence of row indices.</returns>
    private static IEnumerable<IEnumerable<int>> GenerateFoldsStratified(IClassificationProblemData problemData, int numberOfFolds, IRandom random) {
      var trainingIndices = problemData.TrainingIndices.ToList();
      var values = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, trainingIndices);

      // one sequence of numberOfFolds parts for each class label
      var enumerators = trainingIndices
        .Zip(values, (i, v) => new { Index = i, Value = v })
        .GroupBy(x => x.Value, x => x.Index)
        .Select(g => {
          var classRows = g.ToList();
          return GenerateFolds(classRows, classRows.Count, numberOfFolds).GetEnumerator();
        })
        .ToList();

      try {
        // A bounded loop (instead of "while all enumerators can move") also terminates when
        // there is no class at all, i.e. when the training set is empty.
        for (int fold = 0; fold < numberOfFolds; ++fold) {
          var rows = new List<int>();
          foreach (var e in enumerators) {
            if (e.MoveNext()) rows.AddRange(e.Current);
          }
          yield return rows.OrderBy(x => random.Next()).ToList();
        }
      } finally {
        foreach (var e in enumerators) e.Dispose();
      }
    }

    /// <summary>
    /// Splits <paramref name="values"/> into <paramref name="numberOfFolds"/> consecutive parts whose sizes differ by at most one.
    /// </summary>
    /// <param name="values">The values to split. The sequence is enumerated again for each fold, callers should pass a materialized collection.</param>
    /// <param name="valuesCount">The number of elements in <paramref name="values"/>.</param>
    /// <param name="numberOfFolds">The number of folds; must be greater than zero.</param>
    /// <returns>Exactly <paramref name="numberOfFolds"/> folds; the first folds receive the surplus elements.</returns>
    private static IEnumerable<IEnumerable<T>> GenerateFolds<T>(IEnumerable<T> values, int valuesCount, int numberOfFolds) {
      // if number of folds is greater than the number of values, some empty folds will be returned
      if (valuesCount < numberOfFolds) {
        for (int i = 0; i < numberOfFolds; ++i) {
          yield return i < valuesCount ? values.Skip(i).Take(1) : Enumerable.Empty<T>();
        }
        yield break;
      }

      int foldSize = valuesCount / numberOfFolds; // minimum number of values per fold
      int surplus = valuesCount % numberOfFolds;  // the first 'surplus' folds get one additional value
      int start = 0;
      for (int i = 0; i < numberOfFolds; ++i) {
        int size = i < surplus ? foldSize + 1 : foldSize;
        yield return values.Skip(start).Take(size);
        start += size;
      }
    }

    /// <summary>
    /// Compiles a delegate that assigns a double value to the field <paramref name="fieldName"/> of a svm_parameter object.
    /// The value is converted to the type of the field before it is assigned.
    /// </summary>
    /// <param name="fieldName">The name of a field of svm_parameter.</param>
    /// <returns>A setter for the given field.</returns>
    private static Action<svm_parameter, double> GenerateSetter(string fieldName) {
      var target = Expression.Parameter(typeof(svm_parameter));
      var value = Expression.Parameter(typeof(double));
      var field = Expression.Field(target, fieldName);
      var assignment = Expression.Assign(field, Expression.Convert(value, field.Type));
      return Expression.Lambda<Action<svm_parameter, double>>(assignment, target, value).Compile();
    }

    /// <summary>
    /// Returns the name of the target variable of regression or classification problem data.
    /// </summary>
    /// <param name="problemData">The regression or classification problem data.</param>
    /// <returns>The name of the target variable.</returns>
    /// <exception cref="ArgumentException">Thrown when the problem data is neither regression nor classification problem data.</exception>
    private static string GetTargetVariableName(IDataAnalysisProblemData problemData) {
      var regressionProblemData = problemData as IRegressionProblemData;
      if (regressionProblemData != null) {
        return regressionProblemData.TargetVariable;
      }

      var classificationProblemData = problemData as IClassificationProblemData;
      if (classificationProblemData != null) {
        return classificationProblemData.TargetVariable;
      }

      throw new ArgumentException("Problem data is neither regression or classification problem data.");
    }
  }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences