#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2011 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;

namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// Represents an item containing all data defining a classification problem:
  /// the target variable, the class names and the misclassification penalty matrix.
  /// </summary>
  // NOTE(review): the generic type arguments in this file were missing from the reviewed text
  // and have been reconstructed from how the values are used. Spots where the type could not be
  // derived from usage carry their own NOTE(review).
  [StorableClass]
  [Item("ClassificationProblemData", "Represents an item containing all data defining a classification problem.")]
  public class ClassificationProblemData : DataAnalysisProblemData, IClassificationProblemData {
    protected const string TargetVariableParameterName = "TargetVariable";
    protected const string ClassNamesParameterName = "ClassNames";
    protected const string ClassificationPenaltiesParameterName = "ClassificationPenalties";

    // Upper bound on the number of distinct values a variable may have to qualify as target.
    protected const int MaximumNumberOfClasses = 20;
    // Number of leading rows that are inspected when counting the distinct values of a variable.
    protected const int InspectedRowsToDetermineTargets = 500;

    #region default data
    private static string[] defaultVariableNames = new string[] {
      "sample",
      "clump thickness",
      "cell size",
      "cell shape",
      "marginal adhesion",
      "epithelial cell size",
      "bare nuclei",
      "chromatin",
      "nucleoli",
      "mitoses",
      "class"
    };
    private static double[,] defaultData = new double[,]{
      {1000025,5,1,1,1,2,1,3,1,1,2 },
      {1002945,5,4,4,5,7,10,3,2,1,2 },
      {1015425,3,1,1,1,2,2,3,1,1,2 },
      {1016277,6,8,8,1,3,4,3,7,1,2 },
      {1017023,4,1,1,3,2,1,3,1,1,2 },
      {1017122,8,10,10,8,7,10,9,7,1,4 },
      {1018099,1,1,1,1,2,10,3,1,1,2 },
      {1018561,2,1,2,1,2,1,3,1,1,2 },
      {1033078,2,1,1,1,2,1,1,1,5,2 },
      {1033078,4,2,1,1,2,1,2,1,1,2 },
      {1035283,1,1,1,1,1,1,3,1,1,2 },
      {1036172,2,1,1,1,2,1,2,1,1,2 },
      {1041801,5,3,3,3,2,3,4,4,1,4 },
      {1043999,1,1,1,1,2,3,3,1,1,2 },
      {1044572,8,7,5,10,7,9,5,5,4,4 },
      {1047630,7,4,6,4,6,1,4,3,1,4 },
      {1048672,4,1,1,1,2,1,2,1,1,2 },
      {1049815,4,1,1,1,2,1,3,1,1,2 },
      {1050670,10,7,7,6,4,10,4,1,2,4 },
      {1050718,6,1,1,1,2,1,3,1,1,2 },
      {1054590,7,3,2,10,5,10,5,4,4,4 },
      {1054593,10,5,5,3,6,7,7,10,1,4 },
      {1056784,3,1,1,1,2,1,2,1,1,2 },
      {1057013,8,4,5,1,2,2,7,3,1,4 },
      {1059552,1,1,1,1,2,1,3,1,1,2 },
      {1065726,5,2,3,4,2,7,3,6,1,4 },
      {1066373,3,2,1,1,1,1,2,1,1,2 },
      {1066979,5,1,1,1,2,1,2,1,1,2 },
      {1067444,2,1,1,1,2,1,2,1,1,2 },
      {1070935,1,1,3,1,2,1,1,1,1,2 },
      {1070935,3,1,1,1,1,1,2,1,1,2 },
      {1071760,2,1,1,1,2,1,3,1,1,2 },
      {1072179,10,7,7,3,8,5,7,4,3,4 },
      {1074610,2,1,1,2,2,1,3,1,1,2 },
      {1075123,3,1,2,1,2,1,2,1,1,2 },
      {1079304,2,1,1,1,2,1,2,1,1,2 },
      {1080185,10,10,10,8,6,1,8,9,1,4 },
      {1081791,6,2,1,1,1,1,7,1,1,2 },
      {1084584,5,4,4,9,2,10,5,6,1,4 },
      {1091262,2,5,3,3,6,7,7,5,1,4 },
      {1096800,6,6,6,9,6,4,7,8,1,2 },
      {1099510,10,4,3,1,3,3,6,5,2,4 },
      {1100524,6,10,10,2,8,10,7,3,3,4 },
      {1102573,5,6,5,6,10,1,3,1,1,4 },
      {1103608,10,10,10,4,8,1,8,10,1,4 },
      {1103722,1,1,1,1,2,1,2,1,2,2 },
      {1105257,3,7,7,4,4,9,4,8,1,4 },
      {1105524,1,1,1,1,2,1,2,1,1,2 },
      {1106095,4,1,1,3,2,1,3,1,1,2 },
      {1106829,7,8,7,2,4,8,3,8,2,4 },
      {1108370,9,5,8,1,2,3,2,1,5,4 },
      {1108449,5,3,3,4,2,4,3,4,1,4 },
      {1110102,10,3,6,2,3,5,4,10,2,4 },
      {1110503,5,5,5,8,10,8,7,3,7,4 },
      {1110524,10,5,5,6,8,8,7,1,1,4 },
      {1111249,10,6,6,3,4,5,3,6,1,4 },
      {1112209,8,10,10,1,3,6,3,9,1,4 },
      {1113038,8,2,4,1,5,1,5,4,4,4 },
      {1113483,5,2,3,1,6,10,5,1,1,4 },
      {1113906,9,5,5,2,2,2,5,1,1,4 },
      {1115282,5,3,5,5,3,3,4,10,1,4 },
      {1115293,1,1,1,1,2,2,2,1,1,2 },
      {1116116,9,10,10,1,10,8,3,3,1,4 },
      {1116132,6,3,4,1,5,2,3,9,1,4 },
      {1116192,1,1,1,1,2,1,2,1,1,2 },
      {1116998,10,4,2,1,3,2,4,3,10,4 },
      {1117152,4,1,1,1,2,1,3,1,1,2 },
      {1118039,5,3,4,1,8,10,4,9,1,4 },
      {1120559,8,3,8,3,4,9,8,9,8,4 },
      {1121732,1,1,1,1,2,1,3,2,1,2 },
      {1121919,5,1,3,1,2,1,2,1,1,2 },
      {1123061,6,10,2,8,10,2,7,8,10,4 },
      {1124651,1,3,3,2,2,1,7,2,1,2 },
      {1125035,9,4,5,10,6,10,4,8,1,4 },
      {1126417,10,6,4,1,3,4,3,2,3,4 },
      {1131294,1,1,2,1,2,2,4,2,1,2 },
      {1132347,1,1,4,1,2,1,2,1,1,2 },
      {1133041,5,3,1,2,2,1,2,1,1,2 },
      {1133136,3,1,1,1,2,3,3,1,1,2 },
      {1136142,2,1,1,1,3,1,2,1,1,2 },
      {1137156,2,2,2,1,1,1,7,1,1,2 },
      {1143978,4,1,1,2,2,1,2,1,1,2 },
      {1143978,5,2,1,1,2,1,3,1,1,2 },
      {1147044,3,1,1,1,2,2,7,1,1,2 },
      {1147699,3,5,7,8,8,9,7,10,7,4 },
      {1147748,5,10,6,1,10,4,4,10,10,4 },
      {1148278,3,3,6,4,5,8,4,4,1,4 },
      {1148873,3,6,6,6,5,10,6,8,3,4 },
      {1152331,4,1,1,1,2,1,3,1,1,2 },
      {1155546,2,1,1,2,3,1,2,1,1,2 },
      {1156272,1,1,1,1,2,1,3,1,1,2 },
      {1156948,3,1,1,2,2,1,1,1,1,2 },
      {1157734,4,1,1,1,2,1,3,1,1,2 },
      {1158247,1,1,1,1,2,1,2,1,1,2 },
      {1160476,2,1,1,1,2,1,3,1,1,2 },
      {1164066,1,1,1,1,2,1,3,1,1,2 },
      {1165297,2,1,1,2,2,1,1,1,1,2 },
      {1165790,5,1,1,1,2,1,3,1,1,2 },
      {1165926,9,6,9,2,10,6,2,9,10,4 },
      {1166630,7,5,6,10,5,10,7,9,4,4 },
      {1166654,10,3,5,1,10,5,3,10,2,4 },
      {1167439,2,3,4,4,2,5,2,5,1,4 },
      {1167471,4,1,2,1,2,1,3,1,1,2 },
      {1168359,8,2,3,1,6,3,7,1,1,4 },
      {1168736,10,10,10,10,10,1,8,8,8,4 },
      {1169049,7,3,4,4,3,3,3,2,7,4 },
      {1170419,10,10,10,8,2,10,4,1,1,4 },
      {1170420,1,6,8,10,8,10,5,7,1,4 },
      {1171710,1,1,1,1,2,1,2,3,1,2 },
      {1171710,6,5,4,4,3,9,7,8,3,4 },
      {1171795,1,3,1,2,2,2,5,3,2,2 },
      {1171845,8,6,4,3,5,9,3,1,1,4 },
      {1172152,10,3,3,10,2,10,7,3,3,4 },
      {1173216,10,10,10,3,10,8,8,1,1,4 },
      {1173235,3,3,2,1,2,3,3,1,1,2 },
      {1173347,1,1,1,1,2,5,1,1,1,2 },
      {1173347,8,3,3,1,2,2,3,2,1,2 },
      {1173509,4,5,5,10,4,10,7,5,8,4 },
      {1173514,1,1,1,1,4,3,1,1,1,2 },
      {1173681,3,2,1,1,2,2,3,1,1,2 },
      {1174057,1,1,2,2,2,1,3,1,1,2 },
      {1174057,4,2,1,1,2,2,3,1,1,2 },
      {1174131,10,10,10,2,10,10,5,3,3,4 },
      {1174428,5,3,5,1,8,10,5,3,1,4 },
      {1175937,5,4,6,7,9,7,8,10,1,4 },
      {1176406,1,1,1,1,2,1,2,1,1,2 },
      {1176881,7,5,3,7,4,10,7,5,5,4 }
    };
    private static readonly Dataset defaultDataset;
    private static readonly IEnumerable<string> defaultAllowedInputVariables;
    private static readonly string defaultTargetVariable;

    private static readonly ClassificationProblemData emptyProblemData;
    /// <summary>
    /// Gets the shared placeholder instance that is used before real problem data is loaded.
    /// </summary>
    public static ClassificationProblemData EmptyProblemData {
      // Must return the backing field; returning the property itself recurses endlessly.
      get { return emptyProblemData; }
    }

    /// <summary>
    /// Builds the default dataset and the shared empty placeholder instance.
    /// </summary>
    static ClassificationProblemData() {
      defaultDataset = new Dataset(defaultVariableNames, defaultData);
      defaultDataset.Name = "Wisconsin classification problem";
      defaultDataset.Description = "subset from to ..";

      defaultAllowedInputVariables = defaultVariableNames.Except(new List<string>() { "sample", "class" });
      defaultTargetVariable = "class";

      // The placeholder starts as a default instance whose parameters are then replaced by
      // empty counterparts.
      var problemData = new ClassificationProblemData();
      problemData.Parameters.Clear();
      problemData.Name = "Empty Classification ProblemData";
      problemData.Description = "This ProblemData acts as place holder before the correct problem data is loaded.";
      problemData.isEmpty = true;

      problemData.Parameters.Add(new FixedValueParameter<Dataset>(DatasetParameterName, "", new Dataset()));
      // NOTE(review): the generic argument of this parameter could not be derived from this file;
      // ReadOnlyCheckedItemList<StringValue> is assumed - confirm against DataAnalysisProblemData.
      problemData.Parameters.Add(new FixedValueParameter<ReadOnlyCheckedItemList<StringValue>>(InputVariablesParameterName, ""));
      problemData.Parameters.Add(new FixedValueParameter<IntRange>(TrainingPartitionParameterName, "", (IntRange)new IntRange(0, 0).AsReadOnly()));
      problemData.Parameters.Add(new FixedValueParameter<IntRange>(TestPartitionParameterName, "", (IntRange)new IntRange(0, 0).AsReadOnly()));
      problemData.Parameters.Add(new ConstrainedValueParameter<StringValue>(TargetVariableParameterName, new ItemSet<StringValue>()));
      problemData.Parameters.Add(new FixedValueParameter<StringMatrix>(ClassNamesParameterName, "", new StringMatrix(0, 0).AsReadOnly()));
      problemData.Parameters.Add(new FixedValueParameter<DoubleMatrix>(ClassificationPenaltiesParameterName, "", (DoubleMatrix)new DoubleMatrix(0, 0).AsReadOnly()));
      emptyProblemData = problemData;
    }
    #endregion

    #region parameter properties
    /// <summary>Gets the parameter that holds the selected target variable.</summary>
    public ConstrainedValueParameter<StringValue> TargetVariableParameter {
      get { return (ConstrainedValueParameter<StringValue>)Parameters[TargetVariableParameterName]; }
    }
    /// <summary>Gets the parameter that holds the one-column matrix of class names.</summary>
    public IFixedValueParameter<StringMatrix> ClassNamesParameter {
      get { return (IFixedValueParameter<StringMatrix>)Parameters[ClassNamesParameterName]; }
    }
    /// <summary>Gets the parameter that holds the penalty matrix (rows: actual class, columns: estimated class).</summary>
    public IFixedValueParameter<DoubleMatrix> ClassificationPenaltiesParameter {
      get { return (IFixedValueParameter<DoubleMatrix>)Parameters[ClassificationPenaltiesParameterName]; }
    }
    #endregion

    #region properties
    /// <summary>Gets the name of the currently selected target variable.</summary>
    public string TargetVariable {
      get { return TargetVariableParameter.Value.Value; }
    }

    // Lazily computed; reset to null whenever the target variable changes.
    private List<double> classValues;
    /// <summary>
    /// Gets the sorted distinct values of the target variable. The list is computed on first access.
    /// </summary>
    public List<double> ClassValues {
      get {
        if (classValues == null) {
          classValues = Dataset.GetDoubleValues(TargetVariableParameter.Value.Value).Distinct().ToList();
          classValues.Sort();
        }
        return classValues;
      }
    }
    IEnumerable<double> IClassificationProblemData.ClassValues {
      get { return ClassValues; }
    }

    /// <summary>Gets the number of distinct class values.</summary>
    public int Classes {
      get { return ClassValues.Count; }
    }

    // Lazily computed copy of the first column of the class names matrix.
    private List<string> classNames;
    /// <summary>
    /// Gets the class names, read from the first column of <see cref="ClassNamesParameter"/> on first access.
    /// </summary>
    public List<string> ClassNames {
      get {
        if (classNames == null) {
          classNames = new List<string>();
          for (int i = 0; i < ClassNamesParameter.Value.Rows; i++)
            classNames.Add(ClassNamesParameter.Value[i, 0]);
        }
        return classNames;
      }
    }
    IEnumerable<string> IClassificationProblemData.ClassNames {
      get { return ClassNames; }
    }

    // Caches penalties by (correct class value, estimated class value).
    private Dictionary<Tuple<double, double>, double> classificationPenaltiesCache = new Dictionary<Tuple<double, double>, double>();
    #endregion

    [StorableConstructor]
    protected ClassificationProblemData(bool deserializing) : base(deserializing) { }
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      RegisterParameterEvents();
    }

    protected ClassificationProblemData(ClassificationProblemData original, Cloner cloner)
      : base(original, cloner) {
      RegisterParameterEvents();
    }
    /// <summary>
    /// Clones this instance. The shared empty placeholder is never copied and is returned as is.
    /// </summary>
    public override IDeepCloneable Clone(Cloner cloner) {
      if (this == emptyProblemData) return emptyProblemData;
      return new ClassificationProblemData(this, cloner);
    }

    /// <summary>Creates problem data based on the built-in default dataset.</summary>
    public ClassificationProblemData() : this(defaultDataset, defaultAllowedInputVariables, defaultTargetVariable) { }

    /// <summary>
    /// Creates problem data for the given dataset.
    /// </summary>
    /// <param name="dataset">The dataset to use.</param>
    /// <param name="allowedInputVariables">The variables passed on to the base class as allowed inputs.</param>
    /// <param name="targetVariable">
    /// The preferred target variable. If it is not among the valid target variables,
    /// the first valid one is selected instead.
    /// </param>
    /// <exception cref="ArgumentException">Thrown if the dataset contains no valid target variable.</exception>
    public ClassificationProblemData(Dataset dataset, IEnumerable<string> allowedInputVariables, string targetVariable)
      : base(dataset, allowedInputVariables) {
      var validTargetVariableValues = CheckVariablesForPossibleTargetVariables(dataset).Select(x => new StringValue(x).AsReadOnly()).ToList();
      // The list cannot be empty here because the check above throws in that case.
      var target = validTargetVariableValues.FirstOrDefault(x => x.Value == targetVariable) ?? validTargetVariableValues.First();

      Parameters.Add(new ConstrainedValueParameter<StringValue>(TargetVariableParameterName, new ItemSet<StringValue>(validTargetVariableValues), target));
      Parameters.Add(new FixedValueParameter<StringMatrix>(ClassNamesParameterName, ""));
      Parameters.Add(new FixedValueParameter<DoubleMatrix>(ClassificationPenaltiesParameterName, ""));

      ResetTargetVariableDependentMembers();
      RegisterParameterEvents();
    }

    /// <summary>
    /// Returns the double variables of the dataset that have at most
    /// <see cref="MaximumNumberOfClasses"/> distinct values within the first
    /// <see cref="InspectedRowsToDetermineTargets"/> rows.
    /// </summary>
    /// <exception cref="ArgumentException">Thrown if no variable qualifies.</exception>
    private static IEnumerable<string> CheckVariablesForPossibleTargetVariables(Dataset dataset) {
      int maxSamples = Math.Min(InspectedRowsToDetermineTargets, dataset.Rows);
      // "<=" so that the filter agrees with the constant's name and with the error message
      // below, which both promise "at most" MaximumNumberOfClasses distinct values.
      var validTargetVariables = (from v in dataset.DoubleVariables
                                  let distinctValues = dataset.GetDoubleValues(v)
                                    .Take(maxSamples)
                                    .Distinct()
                                    .Count()
                                  where distinctValues <= MaximumNumberOfClasses
                                  select v).ToArray();

      if (validTargetVariables.Length == 0)
        throw new ArgumentException("Import of classification problem data was not successful, because no target variable was found." +
          " A target variable must have at most " + MaximumNumberOfClasses + " distinct values to be applicable to classification.");
      return validTargetVariables;
    }

    /// <summary>
    /// Rebuilds the class names matrix and the penalty matrix for the current target variable.
    /// Class names default to "Class " plus the class value; penalties default to 0 on the
    /// diagonal and 1 elsewhere.
    /// </summary>
    private void ResetTargetVariableDependentMembers() {
      // Parameter events stay detached while the matrices are rebuilt.
      DeregisterParameterEvents();

      classNames = null;
      ((IStringConvertibleMatrix)ClassNamesParameter.Value).Columns = 1;
      ((IStringConvertibleMatrix)ClassNamesParameter.Value).Rows = ClassValues.Count;
      for (int i = 0; i < Classes; i++)
        ClassNamesParameter.Value[i, 0] = "Class " + ClassValues[i];
      ClassNamesParameter.Value.ColumnNames = new List<string>() { "ClassNames" };
      ClassNamesParameter.Value.RowNames = ClassValues.Select(s => "ClassValue: " + s);

      classificationPenaltiesCache.Clear();
      ((ValueParameter<DoubleMatrix>)ClassificationPenaltiesParameter).ReactOnValueToStringChangedAndValueItemImageChanged = false;
      ((IStringConvertibleMatrix)ClassificationPenaltiesParameter.Value).Rows = Classes;
      ((IStringConvertibleMatrix)ClassificationPenaltiesParameter.Value).Columns = Classes;
      ClassificationPenaltiesParameter.Value.RowNames = ClassNames.Select(name => "Actual " + name);
      ClassificationPenaltiesParameter.Value.ColumnNames = ClassNames.Select(name => "Estimated " + name);
      for (int i = 0; i < Classes; i++) {
        for (int j = 0; j < Classes; j++) {
          ClassificationPenaltiesParameter.Value[i, j] = (i != j) ? 1 : 0;
        }
      }
      ((ValueParameter<DoubleMatrix>)ClassificationPenaltiesParameter).ReactOnValueToStringChangedAndValueItemImageChanged = true;

      RegisterParameterEvents();
    }

    /// <summary>Returns the name of the class with the given value.</summary>
    /// <exception cref="ArgumentException">Thrown if <paramref name="classValue"/> is not a known class value.</exception>
    public string GetClassName(double classValue) {
      int index = ClassValues.IndexOf(classValue);
      if (index < 0) throw new ArgumentException("Unknown class value " + classValue + ".", "classValue");
      return ClassNames[index];
    }

    /// <summary>Returns the value of the class with the given name.</summary>
    /// <exception cref="ArgumentException">Thrown if <paramref name="className"/> is not a known class name.</exception>
    public double GetClassValue(string className) {
      int index = ClassNames.IndexOf(className);
      if (index < 0) throw new ArgumentException("Unknown class name \"" + className + "\".", "className");
      return ClassValues[index];
    }

    /// <summary>Renames the class with the given value, both in the cached list and in the parameter matrix.</summary>
    /// <exception cref="ArgumentException">Thrown if <paramref name="classValue"/> is not a known class value.</exception>
    public void SetClassName(double classValue, string className) {
      // Uses the ClassValues property rather than the backing field, which is null
      // until the property has been read for the first time.
      int index = ClassValues.IndexOf(classValue);
      if (index < 0) throw new ArgumentException("Unknown class value " + classValue + ".", "classValue");
      ClassNames[index] = className;
      ClassNamesParameter.Value[index, 0] = className;
    }

    /// <summary>Returns the penalty for estimating <paramref name="estimatedClassName"/> when <paramref name="correctClassName"/> is correct.</summary>
    public double GetClassificationPenalty(string correctClassName, string estimatedClassName) {
      return GetClassificationPenalty(GetClassValue(correctClassName), GetClassValue(estimatedClassName));
    }
    /// <summary>
    /// Returns the penalty for estimating <paramref name="estimatedClassValue"/> when
    /// <paramref name="correctClassValue"/> is correct. Results are cached per value pair.
    /// </summary>
    public double GetClassificationPenalty(double correctClassValue, double estimatedClassValue) {
      var key = Tuple.Create(correctClassValue, estimatedClassValue);
      double penalty;
      if (!classificationPenaltiesCache.TryGetValue(key, out penalty)) {
        int correctClassIndex = ClassValues.IndexOf(correctClassValue);
        int estimatedClassIndex = ClassValues.IndexOf(estimatedClassValue);
        penalty = ClassificationPenaltiesParameter.Value[correctClassIndex, estimatedClassIndex];
        classificationPenaltiesCache[key] = penalty;
      }
      return penalty;
    }

    /// <summary>Sets the penalty for estimating <paramref name="estimatedClassName"/> when <paramref name="correctClassName"/> is correct.</summary>
    public void SetClassificationPenalty(string correctClassName, string estimatedClassName, double penalty) {
      SetClassificationPenalty(GetClassValue(correctClassName), GetClassValue(estimatedClassName), penalty);
    }
    /// <summary>
    /// Sets the penalty for estimating <paramref name="estimatedClassValue"/> when
    /// <paramref name="correctClassValue"/> is correct.
    /// </summary>
    public void SetClassificationPenalty(double correctClassValue, double estimatedClassValue, double penalty) {
      var key = Tuple.Create(correctClassValue, estimatedClassValue);
      int correctClassIndex = ClassValues.IndexOf(correctClassValue);
      int estimatedClassIndex = ClassValues.IndexOf(estimatedClassValue);

      ClassificationPenaltiesParameter.Value[correctClassIndex, estimatedClassIndex] = penalty;
      // Drop a possibly cached old value so that the next lookup reads the matrix again.
      classificationPenaltiesCache.Remove(key);
    }

    #region events
    // The handlers are attached via method-group conversion. NOTE(review): the delegate type of
    // ItemChanged is not visible in this file; this assumes its event-args type derives from
    // System.EventArgs - confirm.
    private void RegisterParameterEvents() {
      TargetVariableParameter.ValueChanged += TargetVariableParameter_ValueChanged;
      ClassNamesParameter.Value.Reset += Parameter_ValueChanged;
      ClassNamesParameter.Value.ItemChanged += MatrixParameter_ItemChanged;
      ClassificationPenaltiesParameter.Value.Reset += Parameter_ValueChanged;
      ClassificationPenaltiesParameter.Value.ItemChanged += MatrixParameter_ItemChanged;
    }
    private void DeregisterParameterEvents() {
      TargetVariableParameter.ValueChanged -= TargetVariableParameter_ValueChanged;
      ClassNamesParameter.Value.Reset -= Parameter_ValueChanged;
      ClassNamesParameter.Value.ItemChanged -= MatrixParameter_ItemChanged;
      ClassificationPenaltiesParameter.Value.Reset -= Parameter_ValueChanged;
      ClassificationPenaltiesParameter.Value.ItemChanged -= MatrixParameter_ItemChanged;
    }

    private void TargetVariableParameter_ValueChanged(object sender, EventArgs e) {
      // Class values depend on the target variable and have to be recomputed.
      classValues = null;
      ResetTargetVariableDependentMembers();
      OnChanged();
    }
    private void Parameter_ValueChanged(object sender, EventArgs e) {
      // A matrix was reset; cached penalties may no longer match its content.
      classificationPenaltiesCache.Clear();
      OnChanged();
    }
    private void MatrixParameter_ItemChanged(object sender, EventArgs e) {
      // A matrix cell was edited; cached penalties may no longer match its content.
      classificationPenaltiesCache.Clear();
      OnChanged();
    }
    #endregion

    #region Import from file
    /// <summary>
    /// Parses the given table file and creates problem data from it. The first double variable
    /// is used as target variable, all remaining double variables as allowed inputs.
    /// </summary>
    /// <param name="fileName">Path of the file to import.</param>
    public static ClassificationProblemData ImportFromFile(string fileName) {
      TableFileParser csvFileParser = new TableFileParser();
      csvFileParser.Parse(fileName);

      Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
      dataset.Name = Path.GetFileName(fileName);

      ClassificationProblemData problemData = new ClassificationProblemData(dataset, dataset.DoubleVariables.Skip(1), dataset.DoubleVariables.First());
      problemData.Name = "Data imported from " + Path.GetFileName(fileName);
      return problemData;
    }
    #endregion
  }
}