Context Navigation

source: stable/HeuristicLab.Algorithms.DataAnalysis/3.4/BaselineClassifiers/OneR.cs @ 15298

Visit:

Last change on this file since 15298 was 15131, checked in by gkronber, 7 years ago
#2650: merged r14826 from trunk to stable. The only remaining conflict is DataTableControl and ScatterPlotControl which have been renamed within r14982 (-> tree conflict).
File size: 10.9 KB

Line
1	#region License Information
2	/* HeuristicLab
3	* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4	*
5	* This file is part of HeuristicLab.
6	*
7	* HeuristicLab is free software: you can redistribute it and/or modify
8	* it under the terms of the GNU General Public License as published by
9	* the Free Software Foundation, either version 3 of the License, or
10	* (at your option) any later version.
11	*
12	* HeuristicLab is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19	*/
20	#endregion
21
22	using System;
23	using System.Collections.Generic;
24	using System.Linq;
25	using System.Threading;
26	using HeuristicLab.Common;
27	using HeuristicLab.Core;
28	using HeuristicLab.Data;
29	using HeuristicLab.Optimization;
30	using HeuristicLab.Parameters;
31	using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
32	using HeuristicLab.Problems.DataAnalysis;
33
34	namespace HeuristicLab.Algorithms.DataAnalysis {
/// <summary>
/// 1R classification algorithm.
/// Builds one candidate model per allowed input variable (threshold splits for
/// double variables, a value-to-class lookup for string variables) and keeps
/// the candidate that classifies the most training samples correctly.
/// </summary>
[Item("OneR Classification", "A simple classification algorithm the searches the best single-variable split (does not support categorical features correctly). See R.C. Holte (1993). Very simple classification rules perform well on most commonly used datasets. Machine Learning. 11:63-91.")]
[StorableClass]
public sealed class OneR : FixedDataAnalysisAlgorithm<IClassificationProblem> {

  /// <summary>
  /// Parameter holding the minimum bucket size used when splitting numerical variables.
  /// </summary>
  public IValueParameter<IntValue> MinBucketSizeParameter {
    get { return (IValueParameter<IntValue>)Parameters["MinBucketSize"]; }
  }

  [StorableConstructor]
  private OneR(bool deserializing) : base(deserializing) { }

  private OneR(OneR original, Cloner cloner)
    : base(original, cloner) { }

  /// <summary>
  /// Creates the algorithm with a default minimum bucket size of 6 and a fresh classification problem.
  /// </summary>
  public OneR()
    : base() {
    Parameters.Add(new ValueParameter<IntValue>("MinBucketSize", "Minimum size of a bucket for numerical values. (Except for the rightmost bucket)", new IntValue(6)));
    Problem = new ClassificationProblem();
  }

  public override IDeepCloneable Clone(Cloner cloner) {
    return new OneR(this, cloner);
  }

  /// <summary>
  /// Creates the 1R solution for the current problem data and adds it to the results.
  /// </summary>
  protected override void Run(CancellationToken cancellationToken) {
    var solution = CreateOneRSolution(Problem.ProblemData, MinBucketSizeParameter.Value.Value);
    Results.Add(new Result("OneR solution", "The 1R classifier.", solution));
  }

  /// <summary>
  /// Creates a 1R classification solution from the training partition of <paramref name="problemData"/>.
  /// </summary>
  /// <param name="problemData">The classification problem data to learn from.</param>
  /// <param name="minBucketSize">Minimum number of majority-class samples a bucket of a numerical variable must contain before a new bucket is started.</param>
  /// <returns>
  /// A solution based on the best double variable or the best string variable; when both exist,
  /// the double-variable model is only chosen if it classifies strictly more training samples correctly.
  /// </returns>
  /// <exception cref="ArgumentNullException">Thrown when <paramref name="problemData"/> is null.</exception>
  /// <exception cref="InvalidProgramException">Thrown when no model could be created for any allowed input variable.</exception>
  public static IClassificationSolution CreateOneRSolution(IClassificationProblemData problemData, int minBucketSize = 6) {
    if (problemData == null) throw new ArgumentNullException("problemData");

    var model1 = FindBestDoubleVariableModel(problemData, minBucketSize);
    var model2 = FindBestFactorModel(problemData);

    if (model1 == null && model2 == null) throw new InvalidProgramException("Could not create OneR solution");
    if (model1 == null) return new OneFactorClassificationSolution(model2, (IClassificationProblemData)problemData.Clone());
    if (model2 == null) return new OneRClassificationSolution(model1, (IClassificationProblemData)problemData.Clone());

    // both model types are available: compare their accuracy on the training partition
    var classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
    var model1NumCorrect = CountMatches(classValues, model1.GetEstimatedClassValues(problemData.Dataset, problemData.TrainingIndices));
    var model2NumCorrect = CountMatches(classValues, model2.GetEstimatedClassValues(problemData.Dataset, problemData.TrainingIndices));

    if (model1NumCorrect > model2NumCorrect) {
      return new OneRClassificationSolution(model1, (IClassificationProblemData)problemData.Clone());
    } else {
      return new OneFactorClassificationSolution(model2, (IClassificationProblemData)problemData.Clone());
    }
  }

  /// <summary>
  /// Searches all allowed double variables for the threshold-split model with the most
  /// correctly classified training samples.
  /// </summary>
  /// <returns>
  /// The best model, or null when there is no allowed double variable or no such variable
  /// has at least one non-missing value that is classified correctly.
  /// </returns>
  private static OneRClassificationModel FindBestDoubleVariableModel(IClassificationProblemData problemData, int minBucketSize = 6) {
    var allowedInputVariables = problemData.AllowedInputVariables.Where(problemData.Dataset.VariableHasType<double>).ToList();
    if (allowedInputVariables.Count == 0) return null;

    // materialize once; these sequences are needed again for every variable
    var classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
    var allClassValues = problemData.ClassValues.ToArray();

    var bestClassified = 0;
    List<Split> bestSplits = null;
    string bestVariable = string.Empty;
    double bestMissingValuesClass = double.NaN;

    foreach (var variable in allowedInputVariables) {
      var inputValues = problemData.Dataset.GetDoubleValues(variable, problemData.TrainingIndices);
      var samples = inputValues.Zip(classValues, (i, v) => new Sample(i, v)).ToList();

      // most frequent class among the samples with a missing input value;
      // stays at the default pair (Value == 0) when nothing is missing
      var missingValuesDistribution = samples
        .Where(s => double.IsNaN(s.inputValue)).GroupBy(s => s.classValue)
        .ToDictionary(s => s.Key, s => s.Count())
        .MaxItems(s => s.Value)
        .FirstOrDefault();

      // OrderBy is a stable sort, so samples with equal input values keep their dataset order
      var sortedSamples = samples.Where(s => !double.IsNaN(s.inputValue)).OrderBy(s => s.inputValue).ToList();

      var splits = CreateSplits(sortedSamples, allClassValues, minBucketSize);
      if (splits == null) continue; // the variable has no non-missing training value, nothing to split

      int correctClassified = CountCorrectlyClassified(sortedSamples, splits) + missingValuesDistribution.Value;

      if (correctClassified > bestClassified) {
        bestClassified = correctClassified;
        bestSplits = splits;
        bestVariable = variable;
        bestMissingValuesClass = missingValuesDistribution.Value == 0 ? double.NaN : missingValuesDistribution.Key;
      }
    }

    if (bestSplits == null) return null;

    //remove neighboring splits with the same class value (the split with the larger threshold is kept)
    int index = 0;
    while (index < bestSplits.Count - 1) {
      if (bestSplits[index].classValue.IsAlmost(bestSplits[index + 1].classValue)) {
        bestSplits.RemoveAt(index);
      } else {
        index++;
      }
    }

    return new OneRClassificationModel(problemData.TargetVariable, bestVariable,
      bestSplits.Select(s => s.thresholdValue).ToArray(),
      bestSplits.Select(s => s.classValue).ToArray(), bestMissingValuesClass);
  }

  /// <summary>
  /// Partitions the samples of one variable into buckets and assigns each bucket its majority class.
  /// </summary>
  /// <param name="sortedSamples">Samples without missing input values, sorted ascending by input value.</param>
  /// <param name="allClassValues">All class values; every class distribution is initialized with a zero count for each of them.</param>
  /// <param name="minBucketSize">Minimum number of majority-class samples in a bucket before the next bucket may start.</param>
  /// <returns>The splits in ascending threshold order (the last threshold is positive infinity), or null when there are no samples.</returns>
  private static List<Split> CreateSplits(IList<Sample> sortedSamples, double[] allClassValues, int minBucketSize) {
    if (sortedSamples.Count == 0) return null;

    //calculate class distributions for all distinct inputValues
    var classDistributions = new List<Dictionary<double, int>>();
    var thresholds = new List<double>();
    Dictionary<double, int> current = null;
    double lastValue = double.NaN;
    foreach (var sample in sortedSamples) {
      if (current == null || sample.inputValue > lastValue) {
        // a new distinct value starts; the threshold lies halfway between the two neighboring values
        if (current != null) thresholds.Add((lastValue + sample.inputValue) / 2);
        lastValue = sample.inputValue;
        current = new Dictionary<double, int>();
        foreach (var classValue in allClassValues)
          current[classValue] = 0;
        classDistributions.Add(current);
      }
      current[sample.classValue]++;
    }
    thresholds.Add(double.PositiveInfinity);

    var distribution = classDistributions[0];
    var threshold = thresholds[0];
    var splits = new List<Split>();

    for (int i = 1; i < classDistributions.Count; i++) {
      var samplesInSplit = distribution.Max(d => d.Value);
      var majorityClass = MajorityClass(distribution);
      //join splits if there are too few samples in the split or the next distribution has the same maximum class value as the current split
      if (samplesInSplit < minBucketSize ||
          classDistributions[i].MaxItems(d => d.Value).Select(d => d.Key).Contains(majorityClass)) {
        foreach (var classCount in classDistributions[i])
          distribution[classCount.Key] += classCount.Value;
      } else {
        splits.Add(new Split(threshold, majorityClass));
        distribution = classDistributions[i];
      }
      threshold = thresholds[i];
    }
    splits.Add(new Split(double.PositiveInfinity, MajorityClass(distribution)));
    return splits;
  }

  /// <summary>
  /// Returns the first class value with the maximum count in <paramref name="distribution"/>.
  /// </summary>
  private static double MajorityClass(Dictionary<double, int> distribution) {
    return distribution.MaxItems(d => d.Value).Select(d => d.Key).First();
  }

  /// <summary>
  /// Counts the samples whose class value matches the class of the split they fall into.
  /// A sample belongs to the first split whose threshold is greater than its input value.
  /// </summary>
  /// <param name="sortedSamples">Samples without missing input values, sorted ascending by input value.</param>
  /// <param name="splits">Non-empty list of splits in ascending threshold order.</param>
  private static int CountCorrectlyClassified(IEnumerable<Sample> sortedSamples, IList<Split> splits) {
    int correctClassified = 0;
    int splitIndex = 0;
    int lastSplitIndex = splits.Count - 1;
    foreach (var sample in sortedSamples) {
      // the bound keeps the index valid for infinite input values, which are >= every threshold
      while (splitIndex < lastSplitIndex && sample.inputValue >= splits[splitIndex].thresholdValue)
        splitIndex++;
      if (sample.classValue.IsAlmost(splits[splitIndex].classValue)) correctClassified++;
    }
    return correctClassified;
  }

  /// <summary>
  /// Searches all allowed string variables for the lookup model (variable value to most frequent class)
  /// with the most correctly classified training samples.
  /// </summary>
  /// <returns>
  /// The best model, or null when there is no allowed string variable, the training partition is empty,
  /// or no model classifies at least one training sample correctly.
  /// </returns>
  private static OneFactorClassificationModel FindBestFactorModel(IClassificationProblemData problemData) {
    // only select string variables
    var allowedInputVariables = problemData.AllowedInputVariables.Where(problemData.Dataset.VariableHasType<string>).ToList();
    if (allowedInputVariables.Count == 0) return null;

    var classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices).ToArray();
    if (classValues.Length == 0) return null; // no training samples, no most frequent class

    var defaultClass = FindMostFrequentClassValue(classValues);

    OneFactorClassificationModel bestModel = null;
    var bestModelNumCorrect = 0;

    foreach (var variable in allowedInputVariables) {
      var variableValues = problemData.Dataset.GetStringValues(variable, problemData.TrainingIndices);
      var groupedClassValues = variableValues
        .Zip(classValues, (v, c) => new KeyValuePair<string, double>(v, c))
        .GroupBy(kvp => kvp.Key)
        .ToDictionary(g => g.Key, g => FindMostFrequentClassValue(g.Select(kvp => kvp.Value)));

      // Keys and Values of a dictionary are enumerated in corresponding order
      var model = new OneFactorClassificationModel(problemData.TargetVariable, variable,
        groupedClassValues.Keys.ToArray(), groupedClassValues.Values.ToArray(), defaultClass);

      var modelEstimatedValues = model.GetEstimatedClassValues(problemData.Dataset, problemData.TrainingIndices);
      var modelNumCorrect = CountMatches(classValues, modelEstimatedValues);
      if (modelNumCorrect > bestModelNumCorrect) {
        bestModelNumCorrect = modelNumCorrect;
        bestModel = model;
      }
    }

    return bestModel;
  }

  /// <summary>
  /// Counts the positions at which the actual and the estimated class value are almost equal.
  /// </summary>
  private static int CountMatches(IEnumerable<double> actualValues, IEnumerable<double> estimatedValues) {
    return actualValues.Zip(estimatedValues, (a, b) => a.IsAlmost(b)).Count(e => e);
  }

  /// <summary>
  /// Returns the class value that occurs most often in <paramref name="classValues"/>.
  /// </summary>
  /// <exception cref="InvalidOperationException">Thrown when <paramref name="classValues"/> is empty.</exception>
  private static double FindMostFrequentClassValue(IEnumerable<double> classValues) {
    return classValues.GroupBy(c => c).OrderByDescending(g => g.Count()).Select(g => g.Key).First();
  }

  #region helper classes
  // A bucket boundary: inputs below thresholdValue (and not below the previous split's threshold) get classValue.
  private class Split {
    public readonly double thresholdValue;
    public readonly double classValue;

    public Split(double thresholdValue, double classValue) {
      this.thresholdValue = thresholdValue;
      this.classValue = classValue;
    }
  }

  // One training row reduced to the value of the examined input variable and its target class.
  private class Sample {
    public readonly double inputValue;
    public readonly double classValue;

    public Sample(double inputValue, double classValue) {
      this.inputValue = inputValue;
      this.classValue = classValue;
    }
  }
  #endregion
}
238	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences