Context Navigation

source: trunk/sources/HeuristicLab.Algorithms.DataAnalysis/3.4/BaselineClassifiers/OneR.cs @ 15529

Visit:

Last change on this file since 15529 was 14826, checked in by gkronber, 8 years ago
#2650: merged the factors branch into trunk
File size: 10.9 KB

Rev	Line
[10569]	1	#region License Information
	2	/* HeuristicLab
[14185]	3	* Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[10569]	4	*
	5	* This file is part of HeuristicLab.
	6	*
	7	* HeuristicLab is free software: you can redistribute it and/or modify
	8	* it under the terms of the GNU General Public License as published by
	9	* the Free Software Foundation, either version 3 of the License, or
	10	* (at your option) any later version.
	11	*
	12	* HeuristicLab is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	* GNU General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU General Public License
	18	* along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
	19	*/
	20	#endregion
	21
[14826]	22	using System;
[10569]	23	using System.Collections.Generic;
	24	using System.Linq;
[14523]	25	using System.Threading;
[10569]	26	using HeuristicLab.Common;
	27	using HeuristicLab.Core;
	28	using HeuristicLab.Data;
	29	using HeuristicLab.Optimization;
	30	using HeuristicLab.Parameters;
	31	using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
	32	using HeuristicLab.Problems.DataAnalysis;
	33
	34	namespace HeuristicLab.Algorithms.DataAnalysis {
	35	/// <summary>
	36	/// 1R classification algorithm.
	37	/// </summary>
[13090]	38	[Item("OneR Classification", "A simple classification algorithm the searches the best single-variable split (does not support categorical features correctly). See R.C. Holte (1993). Very simple classification rules perform well on most commonly used datasets. Machine Learning. 11:63-91.")]
[10569]	39	[StorableClass]
[13090]	40	public sealed class OneR : FixedDataAnalysisAlgorithm<IClassificationProblem> {
[10569]	41
	42	public IValueParameter<IntValue> MinBucketSizeParameter {
	43	get { return (IValueParameter<IntValue>)Parameters["MinBucketSize"]; }
	44	}
	45
	46	[StorableConstructor]
[13090]	47	private OneR(bool deserializing) : base(deserializing) { }
[10569]	48
[13090]	49	private OneR(OneR original, Cloner cloner)
[10569]	50	: base(original, cloner) { }
	51
[13090]	52	public OneR()
[10569]	53	: base() {
	54	Parameters.Add(new ValueParameter<IntValue>("MinBucketSize", "Minimum size of a bucket for numerical values. (Except for the rightmost bucket)", new IntValue(6)));
	55	Problem = new ClassificationProblem();
	56	}
	57
	58	public override IDeepCloneable Clone(Cloner cloner) {
[13090]	59	return new OneR(this, cloner);
[10569]	60	}
	61
[14523]	62	protected override void Run(CancellationToken cancellationToken) {
[10569]	63	var solution = CreateOneRSolution(Problem.ProblemData, MinBucketSizeParameter.Value.Value);
	64	Results.Add(new Result("OneR solution", "The 1R classifier.", solution));
	65	}
	66
[13089]	67	public static IClassificationSolution CreateOneRSolution(IClassificationProblemData problemData, int minBucketSize = 6) {
[14826]	68	var classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
	69	var model1 = FindBestDoubleVariableModel(problemData, minBucketSize);
	70	var model2 = FindBestFactorModel(problemData);
	71
	72	if (model1 == null && model2 == null) throw new InvalidProgramException("Could not create OneR solution");
	73	else if (model1 == null) return new OneFactorClassificationSolution(model2, (IClassificationProblemData)problemData.Clone());
	74	else if (model2 == null) return new OneRClassificationSolution(model1, (IClassificationProblemData)problemData.Clone());
	75	else {
	76	var model1EstimatedValues = model1.GetEstimatedClassValues(problemData.Dataset, problemData.TrainingIndices);
	77	var model1NumCorrect = classValues.Zip(model1EstimatedValues, (a, b) => a.IsAlmost(b)).Count(e => e);
	78
	79	var model2EstimatedValues = model2.GetEstimatedClassValues(problemData.Dataset, problemData.TrainingIndices);
	80	var model2NumCorrect = classValues.Zip(model2EstimatedValues, (a, b) => a.IsAlmost(b)).Count(e => e);
	81
	82	if (model1NumCorrect > model2NumCorrect) {
	83	return new OneRClassificationSolution(model1, (IClassificationProblemData)problemData.Clone());
	84	} else {
	85	return new OneFactorClassificationSolution(model2, (IClassificationProblemData)problemData.Clone());
	86	}
	87	}
	88	}
	89
	90	private static OneRClassificationModel FindBestDoubleVariableModel(IClassificationProblemData problemData, int minBucketSize = 6) {
[10569]	91	var bestClassified = 0;
	92	List<Split> bestSplits = null;
	93	string bestVariable = string.Empty;
[10570]	94	double bestMissingValuesClass = double.NaN;
	95	var classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
[10569]	96
[14826]	97	var allowedInputVariables = problemData.AllowedInputVariables.Where(problemData.Dataset.VariableHasType<double>);
	98
	99	if (!allowedInputVariables.Any()) return null;
	100
	101	foreach (var variable in allowedInputVariables) {
[10569]	102	var inputValues = problemData.Dataset.GetDoubleValues(variable, problemData.TrainingIndices);
	103	var samples = inputValues.Zip(classValues, (i, v) => new Sample(i, v)).OrderBy(s => s.inputValue);
	104
[14826]	105	var missingValuesDistribution = samples
	106	.Where(s => double.IsNaN(s.inputValue)).GroupBy(s => s.classValue)
	107	.ToDictionary(s => s.Key, s => s.Count())
	108	.MaxItems(s => s.Value)
	109	.FirstOrDefault();
[10570]	110
[10569]	111	//calculate class distributions for all distinct inputValues
	112	List<Dictionary<double, int>> classDistributions = new List<Dictionary<double, int>>();
	113	List<double> thresholds = new List<double>();
	114	double lastValue = double.NaN;
[10570]	115	foreach (var sample in samples.Where(s => !double.IsNaN(s.inputValue))) {
[10569]	116	if (sample.inputValue > lastValue \|\| double.IsNaN(lastValue)) {
	117	if (!double.IsNaN(lastValue)) thresholds.Add((lastValue + sample.inputValue) / 2);
	118	lastValue = sample.inputValue;
	119	classDistributions.Add(new Dictionary<double, int>());
	120	foreach (var classValue in problemData.ClassValues)
	121	classDistributions[classDistributions.Count - 1][classValue] = 0;
	122
	123	}
	124	classDistributions[classDistributions.Count - 1][sample.classValue]++;
	125	}
	126	thresholds.Add(double.PositiveInfinity);
	127
	128	var distribution = classDistributions[0];
	129	var threshold = thresholds[0];
	130	var splits = new List<Split>();
	131
	132	for (int i = 1; i < classDistributions.Count; i++) {
	133	var samplesInSplit = distribution.Max(d => d.Value);
[10570]	134	//join splits if there are too few samples in the split or the distributions has the same maximum class value as the current split
[10569]	135	if (samplesInSplit < minBucketSize \|\|
	136	classDistributions[i].MaxItems(d => d.Value).Select(d => d.Key).Contains(
	137	distribution.MaxItems(d => d.Value).Select(d => d.Key).First())) {
	138	foreach (var classValue in classDistributions[i])
	139	distribution[classValue.Key] += classValue.Value;
	140	threshold = thresholds[i];
	141	} else {
	142	splits.Add(new Split(threshold, distribution.MaxItems(d => d.Value).Select(d => d.Key).First()));
	143	distribution = classDistributions[i];
	144	threshold = thresholds[i];
	145	}
	146	}
	147	splits.Add(new Split(double.PositiveInfinity, distribution.MaxItems(d => d.Value).Select(d => d.Key).First()));
	148
	149	int correctClassified = 0;
	150	int splitIndex = 0;
[10570]	151	foreach (var sample in samples.Where(s => !double.IsNaN(s.inputValue))) {
[10569]	152	while (sample.inputValue >= splits[splitIndex].thresholdValue)
	153	splitIndex++;
[14826]	154	correctClassified += sample.classValue.IsAlmost(splits[splitIndex].classValue) ? 1 : 0;
[10569]	155	}
[10570]	156	correctClassified += missingValuesDistribution.Value;
[10569]	157
	158	if (correctClassified > bestClassified) {
	159	bestClassified = correctClassified;
	160	bestSplits = splits;
	161	bestVariable = variable;
[10570]	162	bestMissingValuesClass = missingValuesDistribution.Value == 0 ? double.NaN : missingValuesDistribution.Key;
[10569]	163	}
	164	}
	165
	166	//remove neighboring splits with the same class value
	167	for (int i = 0; i < bestSplits.Count - 1; i++) {
[14826]	168	if (bestSplits[i].classValue.IsAlmost(bestSplits[i + 1].classValue)) {
[10569]	169	bestSplits.Remove(bestSplits[i]);
	170	i--;
	171	}
	172	}
	173
[14826]	174	var model = new OneRClassificationModel(problemData.TargetVariable, bestVariable,
	175	bestSplits.Select(s => s.thresholdValue).ToArray(),
	176	bestSplits.Select(s => s.classValue).ToArray(), bestMissingValuesClass);
[10569]	177
[14826]	178	return model;
[10569]	179	}
[14826]	180	private static OneFactorClassificationModel FindBestFactorModel(IClassificationProblemData problemData) {
	181	var classValues = problemData.Dataset.GetDoubleValues(problemData.TargetVariable, problemData.TrainingIndices);
	182	var defaultClass = FindMostFrequentClassValue(classValues);
	183	// only select string variables
	184	var allowedInputVariables = problemData.AllowedInputVariables.Where(problemData.Dataset.VariableHasType<string>);
[10569]	185
[14826]	186	if (!allowedInputVariables.Any()) return null;
	187
	188	OneFactorClassificationModel bestModel = null;
	189	var bestModelNumCorrect = 0;
	190
	191	foreach (var variable in allowedInputVariables) {
	192	var variableValues = problemData.Dataset.GetStringValues(variable, problemData.TrainingIndices);
	193	var groupedClassValues = variableValues
	194	.Zip(classValues, (v, c) => new KeyValuePair<string, double>(v, c))
	195	.GroupBy(kvp => kvp.Key)
	196	.ToDictionary(g => g.Key, g => FindMostFrequentClassValue(g.Select(kvp => kvp.Value)));
	197
	198	var model = new OneFactorClassificationModel(problemData.TargetVariable, variable,
	199	groupedClassValues.Select(kvp => kvp.Key).ToArray(), groupedClassValues.Select(kvp => kvp.Value).ToArray(), defaultClass);
	200
	201	var modelEstimatedValues = model.GetEstimatedClassValues(problemData.Dataset, problemData.TrainingIndices);
	202	var modelNumCorrect = classValues.Zip(modelEstimatedValues, (a, b) => a.IsAlmost(b)).Count(e => e);
	203	if (modelNumCorrect > bestModelNumCorrect) {
	204	bestModelNumCorrect = modelNumCorrect;
	205	bestModel = model;
	206	}
	207	}
	208
	209	return bestModel;
	210	}
	211
	212	private static double FindMostFrequentClassValue(IEnumerable<double> classValues) {
	213	return classValues.GroupBy(c => c).OrderByDescending(g => g.Count()).Select(g => g.Key).First();
	214	}
	215
[10569]	216	#region helper classes
	217	private class Split {
	218	public double thresholdValue;
	219	public double classValue;
	220
	221	public Split(double thresholdValue, double classValue) {
	222	this.thresholdValue = thresholdValue;
	223	this.classValue = classValue;
	224	}
	225	}
	226
	227	private class Sample {
	228	public double inputValue;
	229	public double classValue;
	230
	231	public Sample(double inputValue, double classValue) {
	232	this.inputValue = inputValue;
	233	this.classValue = classValue;
	234	}
	235	}
	236	#endregion
	237	}
	238	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Update cookies preferences