Changeset 15614
- Timestamp:
- 01/15/18 08:21:48 (6 years ago)
- Location:
- branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis
- Files:
-
- 11 added
- 12 deleted
- 32 edited
- 1 copied
Legend:
- Unmodified
- Added
- Removed
-
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis
- Property svn:mergeinfo changed:
/trunk/sources/HeuristicLab.Algorithms.DataAnalysis (added) merged: 15464,15499,15502,15505,15532,15545,15548,15551,15556
-
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4
-
Property
svn:mergeinfo
set to
(toggle deleted branches)
/stable/HeuristicLab.Algorithms.DataAnalysis/3.4 merged eligible /trunk/sources/HeuristicLab.Algorithms.DataAnalysis/3.4 merged eligible /branches/1721-RandomForestPersistence/HeuristicLab.Algorithms.DataAnalysis/3.4 10321-10322 /branches/Async/HeuristicLab.Algorithms.DataAnalysis/3.4 13329-15286 /branches/Benchmarking/sources/HeuristicLab.Algorithms.DataAnalysis/3.4 6917-7005 /branches/ClassificationModelComparison/HeuristicLab.Algorithms.DataAnalysis/3.4 9070-13099 /branches/CloningRefactoring/HeuristicLab.Algorithms.DataAnalysis/3.4 4656-4721 /branches/DataAnalysis Refactoring/HeuristicLab.Algorithms.DataAnalysis/3.4 5471-5808 /branches/DataAnalysis SolutionEnsembles/HeuristicLab.Algorithms.DataAnalysis/3.4 5815-6180 /branches/DataAnalysis/HeuristicLab.Algorithms.DataAnalysis/3.4 4458-4459,4462,4464 /branches/DataPreprocessing/HeuristicLab.Algorithms.DataAnalysis/3.4 10085-11101 /branches/GP.Grammar.Editor/HeuristicLab.Algorithms.DataAnalysis/3.4 6284-6795 /branches/GP.Symbols (TimeLag, Diff, Integral)/HeuristicLab.Algorithms.DataAnalysis/3.4 5060 /branches/HeuristicLab.DatasetRefactor/sources/HeuristicLab.Algorithms.DataAnalysis/3.4 11570-12508 /branches/HeuristicLab.Problems.Orienteering/HeuristicLab.Algorithms.DataAnalysis/3.4 11130-12721 /branches/HeuristicLab.RegressionSolutionGradientView/HeuristicLab.Algorithms.DataAnalysis/3.4 13819-14091 /branches/HeuristicLab.TimeSeries/HeuristicLab.Algorithms.DataAnalysis/3.4 8116-8789 /branches/LogResidualEvaluator/HeuristicLab.Algorithms.DataAnalysis/3.4 10202-10483 /branches/NET40/sources/HeuristicLab.Algorithms.DataAnalysis/3.4 5138-5162 /branches/ParallelEngine/HeuristicLab.Algorithms.DataAnalysis/3.4 5175-5192 /branches/ProblemInstancesRegressionAndClassification/HeuristicLab.Algorithms.DataAnalysis/3.4 7773-7810 /branches/QAPAlgorithms/HeuristicLab.Algorithms.DataAnalysis/3.4 6350-6627 /branches/Restructure trunk solution/HeuristicLab.Algorithms.DataAnalysis/3.4 6828 
/branches/SpectralKernelForGaussianProcesses/HeuristicLab.Algorithms.DataAnalysis/3.4 10204-10479 /branches/SuccessProgressAnalysis/HeuristicLab.Algorithms.DataAnalysis/3.4 5370-5682 /branches/Trunk/HeuristicLab.Algorithms.DataAnalysis/3.4 6829-6865 /branches/VNS/HeuristicLab.Algorithms.DataAnalysis/3.4 5594-5752 /branches/Weighted TSNE/3.4 15451-15531 /branches/histogram/HeuristicLab.Algorithms.DataAnalysis/3.4 5959-6341 /branches/symbreg-factors-2650/HeuristicLab.Algorithms.DataAnalysis/3.4 14232-14825
-
Property
svn:mergeinfo
set to
(toggle deleted branches)
-
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/GaussianProcess/GaussianProcessRegression.cs
r15430 r15614 37 37 [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 160)] 38 38 [StorableClass] 39 public sealed class GaussianProcessRegression : GaussianProcessBase, IStorableContent {39 public sealed class GaussianProcessRegression : GaussianProcessBase, IStorableContent, IDataAnalysisAlgorithm<IRegressionProblem> { 40 40 public string Filename { get; set; } 41 41 -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/HeuristicLab.Algorithms.DataAnalysis-3.4.csproj
r15470 r15614 143 143 <SpecificVersion>False</SpecificVersion> 144 144 <HintPath>..\..\..\..\trunk\sources\bin\HeuristicLab.Data-3.3.dll</HintPath> 145 </Reference> 146 <Reference Include="HeuristicLab.Encodings.PermutationEncoding-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL"> 147 <SpecificVersion>False</SpecificVersion> 148 <HintPath>..\..\..\..\trunk\sources\bin\HeuristicLab.Encodings.PermutationEncoding-3.3.dll</HintPath> 145 149 </Reference> 146 150 <Reference Include="HeuristicLab.Encodings.RealVectorEncoding-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL"> … … 361 365 <Compile Include="Linear\MultinomialLogitModel.cs" /> 362 366 <Compile Include="Linear\Scaling.cs" /> 363 <Compile Include="M5Regression\Interfaces\ISplit Type.cs" />364 <Compile Include="M5Regression\Interfaces\IM5M etaModel.cs" />365 <Compile Include="M5Regression\Interfaces\ILeaf Type.cs" />366 <Compile Include="M5Regression\Interfaces\IPruning Type.cs" />367 <Compile Include="M5Regression\Interfaces\ISpliter.cs" /> 368 <Compile Include="M5Regression\Interfaces\IM5Model.cs" /> 369 <Compile Include="M5Regression\Interfaces\ILeafModel.cs" /> 370 <Compile Include="M5Regression\Interfaces\IPruning.cs" /> 367 371 <Compile Include="M5Regression\LeafTypes\ComplexLeaf.cs" /> 368 372 <Compile Include="M5Regression\LeafTypes\ComponentReductionLinearLeaf.cs" /> … … 374 378 <Compile Include="M5Regression\M5Utilities\M5StaticUtilities.cs" /> 375 379 <Compile Include="M5Regression\M5Utilities\M5Analyzer.cs" /> 376 <Compile Include="M5Regression\M5Utilities\M5CreationParameters.cs" /> 377 <Compile Include="M5Regression\M5Utilities\M5UpdateParameters.cs" /> 380 <Compile Include="M5Regression\M5Utilities\M5Parameters.cs" /> 378 381 <Compile Include="M5Regression\MetaModels\ComponentReducedLinearModel.cs" /> 379 382 <Compile Include="M5Regression\MetaModels\M5NodeModel.cs" /> … … 383 386 <Compile 
Include="M5Regression\MetaModels\DampenedLinearModel.cs" /> 384 387 <Compile Include="M5Regression\MetaModels\PreconstructedLinearModel.cs" /> 385 <Compile Include="M5Regression\Pruning\HoldoutLinearPruning.cs" /> 386 <Compile Include="M5Regression\Pruning\HoldoutLeafPruning.cs" /> 387 <Compile Include="M5Regression\Pruning\M5LinearPruning.cs" /> 388 <Compile Include="M5Regression\Pruning\PruningBase.cs" /> 388 <Compile Include="M5Regression\Pruning\M5LinearBottomUpPruning.cs" /> 389 <Compile Include="M5Regression\Pruning\BottomUpPruningBase.cs" /> 389 390 <Compile Include="M5Regression\Pruning\NoPruning.cs" /> 390 <Compile Include="M5Regression\Pruning\M5Leaf Pruning.cs" />391 <Compile Include="M5Regression\Pruning\M5LeafBottomUpPruning.cs" /> 391 392 <Compile Include="M5Regression\Spliting\OrderImpurityCalculator.cs" /> 392 <Compile Include="M5Regression\Spliting\OrderSplitType.cs" /> 393 <Compile Include="M5Regression\Spliting\OptimumSearchingSpliter.cs" /> 394 <Compile Include="M5Regression\Spliting\M5Spliter.cs" /> 393 395 <Compile Include="Nca\Initialization\INcaInitializer.cs" /> 394 396 <Compile Include="Nca\Initialization\LdaInitializer.cs" /> … … 448 450 <Compile Include="TSNE\Distances\IndexedItemDistance.cs" /> 449 451 <Compile Include="TSNE\Distances\ManhattanDistance.cs" /> 452 <Compile Include="TSNE\Distances\WeightedEuclideanDistance.cs" /> 450 453 <Compile Include="TSNE\Distances\IDistance.cs" /> 451 454 <Compile Include="TSNE\PriorityQueue.cs" /> -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafTypes/ComplexLeaf.cs
r15430 r15614 32 32 [StorableClass] 33 33 [Item("ComplexLeaf", "A leaf type that uses an arbitriary RegressionAlgorithm to create leaf models")] 34 public class ComplexLeaf : ParameterizedNamedItem, ILeaf Type<IRegressionModel>{34 public class ComplexLeaf : ParameterizedNamedItem, ILeafModel { 35 35 public const string RegressionParameterName = "Regression"; 36 36 public IValueParameter<IDataAnalysisAlgorithm<IRegressionProblem>> RegressionParameter { … … 55 55 56 56 #region IModelType 57 public IRegressionModel BuildModel(IRegressionProblemData pd, IRandom random, CancellationToken cancellation, out int noParameters) { 57 public bool ProvidesConfidence { 58 get { return false; } 59 } 60 public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) { 58 61 if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model"); 59 62 noParameters = pd.Dataset.Rows + 1; 60 63 Regression.Problem = new RegressionProblem {ProblemData = pd}; 61 var res = M5StaticUtilities.RunSubAlgorithm(Regression, random.Next(), cancellation );64 var res = M5StaticUtilities.RunSubAlgorithm(Regression, random.Next(), cancellationToken); 62 65 var t = res.Select(x => x.Value).OfType<IRegressionSolution>().FirstOrDefault(); 63 66 if (t == null) throw new ArgumentException("No RegressionSolution was provided by the algorithm"); -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafTypes/ComponentReductionLinearLeaf.cs
r15470 r15614 34 34 [StorableClass] 35 35 [Item("ComponentReductionLinearLeaf", "A leaf type that uses principle component analysis to create smaller linear models as leaf models")] 36 public class ComponentReductionLinearLeaf : ParameterizedNamedItem, ILeaf Type<IConfidenceRegressionModel>{36 public class ComponentReductionLinearLeaf : ParameterizedNamedItem, ILeafModel { 37 37 public const string NoComponentsParameterName = "NoComponents"; 38 38 public IFixedValueParameter<IntValue> NoComponentsParameter { … … 56 56 57 57 #region IModelType 58 public IConfidenceRegressionModel BuildModel(IRegressionProblemData pd, IRandom random, 59 CancellationToken cancellation, out int noParameters) { 58 public bool ProvidesConfidence { 59 get { return true; } 60 } 61 public IRegressionModel Build(IRegressionProblemData pd, IRandom random, 62 CancellationToken cancellationToken, out int noParameters) { 60 63 var pca = PrincipleComponentTransformation.CreateProjection(pd.Dataset, pd.TrainingIndices, pd.AllowedInputVariables, true); 61 64 var pcdata = pca.TransformProblemData(pd); … … 64 67 noParameters = 1; 65 68 for (var i = 1; i <= Math.Min(NoComponents, pd.AllowedInputVariables.Count()); i++) { 66 var pd2 = (IRegressionProblemData) 69 var pd2 = (IRegressionProblemData)pcdata.Clone(); 67 70 var inputs = new HashSet<string>(pca.ComponentNames.Take(i)); 68 71 foreach (var v in pd2.InputVariables.CheckedItems.ToArray()) -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafTypes/ConstantLeaf.cs
r15430 r15614 31 31 [StorableClass] 32 32 [Item("ConstantLeaf", "A leaf type that uses constant models as leaf models")] 33 public class ConstantLeaf : ParameterizedNamedItem, ILeaf Type<IRegressionModel>{33 public class ConstantLeaf : ParameterizedNamedItem, ILeafModel { 34 34 #region Constructors & Cloning 35 35 [StorableConstructor] … … 43 43 44 44 #region IModelType 45 public IRegressionModel BuildModel(IRegressionProblemData pd, IRandom random, CancellationToken cancellation, out int noParameters) { 45 public bool ProvidesConfidence { 46 get { return false; } 47 } 48 public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) { 46 49 if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model"); 47 50 noParameters = 1; -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafTypes/GaussianProcessLeaf.cs
r15430 r15614 33 33 [StorableClass] 34 34 [Item("GaussianProcessLeaf", "A leaf type that uses gaussian process models as leaf models.")] 35 public class GaussianProcessLeaf : ParameterizedNamedItem, ILeaf Type<IGaussianProcessModel>{35 public class GaussianProcessLeaf : ParameterizedNamedItem, ILeafModel { 36 36 #region ParameterNames 37 37 public const string TriesParameterName = "Tries"; … … 75 75 76 76 #region IModelType 77 public IGaussianProcessModel BuildModel(IRegressionProblemData pd, IRandom random, CancellationToken cancellation, out int noParameters) { 78 if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model"); 77 public bool ProvidesConfidence { 78 get { return true; } 79 } 80 public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) { 81 if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a gaussian process model"); 79 82 Regression.Problem = new RegressionProblem {ProblemData = pd}; 80 83 var cvscore = double.MaxValue; … … 82 85 83 86 for (var i = 0; i < Tries; i++) { 84 var res = M5StaticUtilities.RunSubAlgorithm(Regression, random.Next(), cancellation );87 var res = M5StaticUtilities.RunSubAlgorithm(Regression, random.Next(), cancellationToken); 85 88 var t = res.Select(x => x.Value).OfType<GaussianProcessRegressionSolution>().FirstOrDefault(); 86 var score = ((DoubleValue) 89 var score = ((DoubleValue)res["Negative log pseudo-likelihood (LOO-CV)"].Value).Value; 87 90 if (score >= cvscore || t == null || double.IsNaN(t.TrainingRSquared)) continue; 88 91 cvscore = score; 89 92 sol = t; 90 93 } 91 94 Regression.Runs.Clear(); 92 95 if (sol == null) throw new ArgumentException("Could not create Gaussian Process model"); 93 96 -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafTypes/LinearLeaf.cs
r15430 r15614 31 31 [StorableClass] 32 32 [Item("LinearLeaf", "A leaf type that uses linear models as leaf models. This is the standard for M5' regression")] 33 public class LinearLeaf : ParameterizedNamedItem, ILeaf Type<IConfidenceRegressionModel>{33 public class LinearLeaf : ParameterizedNamedItem, ILeafModel { 34 34 #region Constructors & Cloning 35 35 [StorableConstructor] … … 43 43 44 44 #region IModelType 45 public IConfidenceRegressionModel BuildModel(IRegressionProblemData pd, IRandom random, CancellationToken cancellation, out int noParameters) { 45 public bool ProvidesConfidence { 46 get { return true; } 47 } 48 public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) { 46 49 if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model"); 47 50 double rmse, cvRmse; -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/LeafTypes/LogisticLeaf.cs
r15430 r15614 33 33 [StorableClass] 34 34 [Item("LogisticLeaf", "A leaf type that uses linear models with a logistic dampening as leaf models. Dampening reduces prediction values far outside the observed target values.")] 35 public class LogisticLeaf : ParameterizedNamedItem, ILeaf Type<IConfidenceRegressionModel>{35 public class LogisticLeaf : ParameterizedNamedItem, ILeafModel { 36 36 private const string DampeningParameterName = "Dampening"; 37 37 public IFixedValueParameter<DoubleValue> DampeningParameter { … … 55 55 56 56 #region IModelType 57 public IConfidenceRegressionModel BuildModel(IRegressionProblemData pd, IRandom random, CancellationToken cancellation, out int noParameters) { 57 public bool ProvidesConfidence { 58 get { return true; } 59 } 60 public IRegressionModel Build(IRegressionProblemData pd, IRandom random, CancellationToken cancellationToken, out int noParameters) { 58 61 if (pd.Dataset.Rows < MinLeafSize(pd)) throw new ArgumentException("The number of training instances is too small to create a linear model"); 59 62 double rmse, cvRmse; -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/M5Regression.cs
r15470 r15614 6 6 using HeuristicLab.Core; 7 7 using HeuristicLab.Data; 8 using HeuristicLab.Encodings.PermutationEncoding; 8 9 using HeuristicLab.Optimization; 9 10 using HeuristicLab.Parameters; … … 16 17 [StorableClass] 17 18 [Creatable(CreatableAttribute.Categories.DataAnalysisRegression, Priority = 95)] 18 [Item("M5RegressionTree", "A M5 regression tree / rule set classifier")]19 [Item("M5RegressionTree", "A M5 regression tree / rule set")] 19 20 public sealed class M5Regression : FixedDataAnalysisAlgorithm<IRegressionProblem> { 20 21 #region Parametername 21 22 private const string GenerateRulesParameterName = "GenerateRules"; 22 private const string ImpurityParameterName = "Split"; 23 private const string HoldoutSizeParameterName = "HoldoutSize"; 24 private const string SpliterParameterName = "Spliter"; 23 25 private const string MinimalNodeSizeParameterName = "MinimalNodeSize"; 24 private const string ModelTypeParameterName = "ModelType";26 private const string LeafModelParameterName = "LeafModel"; 25 27 private const string PruningTypeParameterName = "PruningType"; 26 28 private const string SeedParameterName = "Seed"; 27 29 private const string SetSeedRandomlyParameterName = "SetSeedRandomly"; 30 private const string UseHoldoutParameterName = "UseHoldout"; 28 31 #endregion 29 32 30 33 #region Parameter properties 31 34 public IFixedValueParameter<BoolValue> GenerateRulesParameter { 32 get { return Parameters[GenerateRulesParameterName] as IFixedValueParameter<BoolValue>; } 33 } 34 public IConstrainedValueParameter<ISplitType> ImpurityParameter { 35 get { return Parameters[ImpurityParameterName] as IConstrainedValueParameter<ISplitType>; } 35 get { return (IFixedValueParameter<BoolValue>)Parameters[GenerateRulesParameterName]; } 36 } 37 public IFixedValueParameter<PercentValue> HoldoutSizeParameter { 38 get { return (IFixedValueParameter<PercentValue>)Parameters[HoldoutSizeParameterName]; } 39 } 40 public IConstrainedValueParameter<ISpliter> 
ImpurityParameter { 41 get { return (IConstrainedValueParameter<ISpliter>)Parameters[SpliterParameterName]; } 36 42 } 37 43 public IFixedValueParameter<IntValue> MinimalNodeSizeParameter { 38 get { return (IFixedValueParameter<IntValue>) 39 } 40 public IConstrainedValueParameter<ILeaf Type<IRegressionModel>> ModelTypeParameter {41 get { return Parameters[ModelTypeParameterName] as IConstrainedValueParameter<ILeafType<IRegressionModel>>; }42 } 43 public IConstrainedValueParameter<IPruning Type> PruningTypeParameter {44 get { return Parameters[PruningTypeParameterName] as IConstrainedValueParameter<IPruningType>; }44 get { return (IFixedValueParameter<IntValue>)Parameters[MinimalNodeSizeParameterName]; } 45 } 46 public IConstrainedValueParameter<ILeafModel> LeafModelParameter { 47 get { return (IConstrainedValueParameter<ILeafModel>)Parameters[LeafModelParameterName]; } 48 } 49 public IConstrainedValueParameter<IPruning> PruningTypeParameter { 50 get { return (IConstrainedValueParameter<IPruning>)Parameters[PruningTypeParameterName]; } 45 51 } 46 52 public IFixedValueParameter<IntValue> SeedParameter { 47 get { return Parameters[SeedParameterName] as IFixedValueParameter<IntValue>; }53 get { return (IFixedValueParameter<IntValue>)Parameters[SeedParameterName]; } 48 54 } 49 55 public IFixedValueParameter<BoolValue> SetSeedRandomlyParameter { 50 get { return Parameters[SetSeedRandomlyParameterName] as IFixedValueParameter<BoolValue>; } 56 get { return (IFixedValueParameter<BoolValue>)Parameters[SetSeedRandomlyParameterName]; } 57 } 58 public IFixedValueParameter<BoolValue> UseHoldoutParameter { 59 get { return (IFixedValueParameter<BoolValue>)Parameters[UseHoldoutParameterName]; } 51 60 } 52 61 #endregion … … 56 65 get { return GenerateRulesParameter.Value.Value; } 57 66 } 58 public ISplitType Split { 67 public double HoldoutSize { 68 get { return HoldoutSizeParameter.Value.Value; } 69 } 70 public ISpliter Split { 59 71 get { return ImpurityParameter.Value; } 60 72 } … 
… 62 74 get { return MinimalNodeSizeParameter.Value.Value; } 63 75 } 64 public ILeaf Type<IRegressionModel> LeafType{65 get { return ModelTypeParameter.Value; }66 } 67 public IPruning Type PruningType{76 public ILeafModel LeafModel { 77 get { return LeafModelParameter.Value; } 78 } 79 public IPruning Pruning { 68 80 get { return PruningTypeParameter.Value; } 69 81 } … … 73 85 public bool SetSeedRandomly { 74 86 get { return SetSeedRandomlyParameter.Value.Value; } 87 } 88 public bool UseHoldout { 89 get { return UseHoldoutParameter.Value.Value; } 75 90 } 76 91 #endregion … … 81 96 private M5Regression(M5Regression original, Cloner cloner) : base(original, cloner) { } 82 97 public M5Regression() { 83 var modelSet = new ItemSet<ILeafType<IRegressionModel>>(ApplicationManager.Manager.GetInstances<ILeafType<IRegressionModel>>()); 84 var pruningSet = new ItemSet<IPruningType>(ApplicationManager.Manager.GetInstances<IPruningType>()); 85 var impuritySet = new ItemSet<ISplitType>(ApplicationManager.Manager.GetInstances<ISplitType>()); 86 Parameters.Add(new FixedValueParameter<BoolValue>(GenerateRulesParameterName, "Whether a set of rules or a decision tree shall be created", new BoolValue(true))); 87 Parameters.Add(new ConstrainedValueParameter<ISplitType>(ImpurityParameterName, "The type of split function used to create node splits", impuritySet, impuritySet.OfType<OrderSplitType>().First())); 98 var modelSet = new ItemSet<ILeafModel>(ApplicationManager.Manager.GetInstances<ILeafModel>()); 99 var pruningSet = new ItemSet<IPruning>(ApplicationManager.Manager.GetInstances<IPruning>()); 100 var impuritySet = new ItemSet<ISpliter>(ApplicationManager.Manager.GetInstances<ISpliter>()); 101 Parameters.Add(new FixedValueParameter<BoolValue>(GenerateRulesParameterName, "Whether a set of rules or a decision tree shall be created", new BoolValue(false))); 102 Parameters.Add(new FixedValueParameter<PercentValue>(HoldoutSizeParameterName, "How much of the training set shall be reserved 
for pruning", new PercentValue(0.2))); 103 Parameters.Add(new ConstrainedValueParameter<ISpliter>(SpliterParameterName, "The type of split function used to create node splits", impuritySet, impuritySet.OfType<M5Spliter>().First())); 88 104 Parameters.Add(new FixedValueParameter<IntValue>(MinimalNodeSizeParameterName, "The minimal number of samples in a leaf node", new IntValue(1))); 89 Parameters.Add(new ConstrainedValueParameter<ILeaf Type<IRegressionModel>>(ModelTypeParameterName, "The type of model used for the nodes", modelSet, modelSet.OfType<LinearLeaf>().First()));90 Parameters.Add(new ConstrainedValueParameter<IPruning Type>(PruningTypeParameterName, "The type of pruning used", pruningSet, pruningSet.OfType<M5LeafPruning>().First()));105 Parameters.Add(new ConstrainedValueParameter<ILeafModel>(LeafModelParameterName, "The type of model used for the nodes", modelSet, modelSet.OfType<LinearLeaf>().First())); 106 Parameters.Add(new ConstrainedValueParameter<IPruning>(PruningTypeParameterName, "The type of pruning used", pruningSet, pruningSet.OfType<M5LinearBottomUpPruning>().First())); 91 107 Parameters.Add(new FixedValueParameter<IntValue>(SeedParameterName, "The random seed used to initialize the new pseudo random number generator.", new IntValue(0))); 92 108 Parameters.Add(new FixedValueParameter<BoolValue>(SetSeedRandomlyParameterName, "True if the random seed should be set to a random value, otherwise false.", new BoolValue(true))); 109 Parameters.Add(new FixedValueParameter<BoolValue>(UseHoldoutParameterName, "True if a holdout set should be generated, false if splitting and pruning shall be performed on the same data ", new BoolValue(false))); 93 110 Problem = new RegressionProblem(); 94 111 } … … 102 119 if (SetSeedRandomly) SeedParameter.Value.Value = new System.Random().Next(); 103 120 random.Reset(Seed); 104 var solution = CreateM5RegressionSolution(Problem.ProblemData, random, Leaf Type, Split, PruningType, cancellationToken, MinimalNodeSize, 
GenerateRules, Results);121 var solution = CreateM5RegressionSolution(Problem.ProblemData, random, LeafModel, Split, Pruning, UseHoldout, HoldoutSize, MinimalNodeSize, GenerateRules, Results, cancellationToken); 105 122 AnalyzeSolution(solution); 106 123 } … … 108 125 #region Static Interface 109 126 public static IRegressionSolution CreateM5RegressionSolution(IRegressionProblemData problemData, IRandom random, 110 ILeaf Type<IRegressionModel> leafType = null, ISplitType splitType = null, IPruningType pruningType= null,111 CancellationToken? cancellationToken = null, int minNumInstances = 4, bool generateRules = false, ResultCollection results= null) {127 ILeafModel leafModel = null, ISpliter spliter = null, IPruning pruning = null, 128 bool useHoldout = false, double holdoutSize = 0.2, int minNumInstances = 4, bool generateRules = false, ResultCollection results = null, CancellationToken? cancellationToken = null) { 112 129 //set default values 113 if (leaf Type == null) leafType= new LinearLeaf();114 if (split Type == null) splitType = new OrderSplitType();130 if (leafModel == null) leafModel = new LinearLeaf(); 131 if (spliter == null) spliter = new M5Spliter(); 115 132 if (cancellationToken == null) cancellationToken = CancellationToken.None; 116 if (pruningType == null) pruningType = new M5LeafPruning(); 117 133 if (pruning == null) pruning = new M5LeafBottomUpPruning(); 118 134 119 135 var doubleVars = new HashSet<string>(problemData.Dataset.DoubleVariables); 120 136 var vars = problemData.AllowedInputVariables.Concat(new[] {problemData.TargetVariable}).ToArray(); 121 if (vars.Any(v => !doubleVars.Contains(v))) throw new NotSupportedException("M5 regression does not support non-double valued input or output features.");137 if (vars.Any(v => !doubleVars.Contains(v))) throw new NotSupportedException("M5 regression supports only double valued input or output features."); 122 138 123 139 var values = vars.Select(v => problemData.Dataset.GetDoubleValues(v, 
problemData.TrainingIndices).ToArray()).ToArray(); 124 140 if (values.Any(v => v.Any(x => double.IsNaN(x) || double.IsInfinity(x)))) 125 141 throw new NotSupportedException("M5 regression does not support NaN or infinity values in the input dataset."); 142 126 143 var trainingData = new Dataset(vars, values); 127 144 var pd = new RegressionProblemData(trainingData, problemData.AllowedInputVariables, problemData.TargetVariable); … … 130 147 131 148 //create & build Model 132 var m5Params = new M5CreationParameters(pruningType, minNumInstances, leafType, pd, random, splitType, results); 133 134 IReadOnlyList<int> t, h; 135 pruningType.GenerateHoldOutSet(problemData.TrainingIndices.ToArray(), random, out t, out h); 136 137 if (generateRules) { 138 IM5MetaModel model = M5RuleSetModel.CreateRuleModel(problemData.TargetVariable, m5Params); 139 model.BuildClassifier(t, h, m5Params, cancellationToken.Value); 140 return model.CreateRegressionSolution(problemData); 149 var m5Params = new M5Parameters(pruning, minNumInstances, leafModel, pd, random, spliter, results); 150 151 IReadOnlyList<int> trainingRows, pruningRows; 152 GeneratePruningSet(problemData.TrainingIndices.ToArray(), random, useHoldout, holdoutSize, out trainingRows, out pruningRows); 153 154 IM5Model model; 155 if (generateRules) 156 model = M5RuleSetModel.CreateRuleModel(problemData.TargetVariable, m5Params); 157 else 158 model = M5TreeModel.CreateTreeModel(problemData.TargetVariable, m5Params); 159 160 model.Build(trainingRows, pruningRows, m5Params, cancellationToken.Value); 161 return model.CreateRegressionSolution(problemData); 162 } 163 164 public static void UpdateM5Model(IRegressionModel model, IRegressionProblemData problemData, IRandom random, 165 ILeafModel leafModel, CancellationToken? 
cancellationToken = null) { 166 var m5Model = model as IM5Model; 167 if (m5Model == null) throw new ArgumentException("This type of model can not be updated"); 168 UpdateM5Model(m5Model, problemData, random, leafModel, cancellationToken); 169 } 170 171 private static void UpdateM5Model(IM5Model model, IRegressionProblemData problemData, IRandom random, 172 ILeafModel leafModel = null, CancellationToken? cancellationToken = null) { 173 if (cancellationToken == null) cancellationToken = CancellationToken.None; 174 var m5Params = new M5Parameters(leafModel, problemData, random); 175 model.Update(problemData.TrainingIndices.ToList(), m5Params, cancellationToken.Value); 176 } 177 #endregion 178 179 #region Helpers 180 private static void GeneratePruningSet(IReadOnlyList<int> allrows, IRandom random, bool useHoldout, double holdoutSize, out IReadOnlyList<int> training, out IReadOnlyList<int> pruning) { 181 if (!useHoldout) { 182 training = allrows; 183 pruning = allrows; 184 return; 185 } 186 var perm = new Permutation(PermutationTypes.Absolute, allrows.Count, random); 187 var cut = (int)(holdoutSize * allrows.Count); 188 pruning = perm.Take(cut).Select(i => allrows[i]).ToArray(); 189 training = perm.Take(cut).Select(i => allrows[i]).ToArray(); 190 } 191 192 private void AnalyzeSolution(IRegressionSolution solution) { 193 Results.Add(new Result("RegressionSolution", (IItem)solution.Clone())); 194 195 Dictionary<string, int> frequencies; 196 if (!GenerateRules) { 197 Results.Add(M5Analyzer.CreateLeafDepthHistogram((M5TreeModel)solution.Model)); 198 frequencies = M5Analyzer.GetTreeVariableFrequences((M5TreeModel)solution.Model); 141 199 } 142 200 else { 143 IM5MetaModel model = M5TreeModel.CreateTreeModel(problemData.TargetVariable, m5Params); 144 model.BuildClassifier(t, h, m5Params, cancellationToken.Value); 145 return model.CreateRegressionSolution(problemData); 146 } 147 } 148 149 public static void UpdateM5Model(M5TreeModel model, IRegressionProblemData problemData, 
IRandom random, 150 ILeafType<IRegressionModel> leafType = null, CancellationToken? cancellationToken = null) { 151 UpdateM5Model(model as IM5MetaModel, problemData, random, leafType, cancellationToken); 152 } 153 154 public static void UpdateM5Model(M5RuleSetModel model, IRegressionProblemData problemData, IRandom random, 155 ILeafType<IRegressionModel> leafType = null, CancellationToken? cancellationToken = null) { 156 UpdateM5Model(model as IM5MetaModel, problemData, random, leafType, cancellationToken); 157 } 158 159 private static void UpdateM5Model(IM5MetaModel model, IRegressionProblemData problemData, IRandom random, 160 ILeafType<IRegressionModel> leafType = null, CancellationToken? cancellationToken = null) { 161 if (cancellationToken == null) cancellationToken = CancellationToken.None; 162 var m5Params = new M5UpdateParameters(leafType, problemData, random); 163 model.UpdateModel(problemData.TrainingIndices.ToList(), m5Params, cancellationToken.Value); 164 } 165 #endregion 166 167 #region Helpers 168 private void AnalyzeSolution(IRegressionSolution solution) { 169 Results.Add(new Result("RegressionSolution", (IItem) solution.Clone())); 170 171 Dictionary<string, int> frequencies; 172 if (!GenerateRules) { 173 Results.Add(M5Analyzer.CreateLeafDepthHistogram((M5TreeModel) solution.Model)); 174 frequencies = M5Analyzer.GetTreeVariableFrequences((M5TreeModel) solution.Model); 175 } 176 else { 177 Results.Add(M5Analyzer.CreateRulesResult((M5RuleSetModel) solution.Model, Problem.ProblemData, "M5TreeResult", true)); 178 frequencies = M5Analyzer.GetRuleVariableFrequences((M5RuleSetModel) solution.Model); 179 Results.Add(M5Analyzer.CreateCoverageDiagram((M5RuleSetModel) solution.Model, Problem.ProblemData)); 201 Results.Add(M5Analyzer.CreateRulesResult((M5RuleSetModel)solution.Model, Problem.ProblemData, "M5TreeResult", true)); 202 frequencies = M5Analyzer.GetRuleVariableFrequences((M5RuleSetModel)solution.Model); 203 
Results.Add(M5Analyzer.CreateCoverageDiagram((M5RuleSetModel)solution.Model, Problem.ProblemData)); 180 204 } 181 205 … … 183 207 var sum = frequencies.Values.Sum(); 184 208 sum = sum == 0 ? 1 : sum; 185 var impactArray = new DoubleArray(frequencies.Select(i => (double) 209 var impactArray = new DoubleArray(frequencies.Select(i => (double)i.Value / sum).ToArray()) { 186 210 ElementNames = frequencies.Select(i => i.Key) 187 211 }; -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/M5Utilities/M5Analyzer.cs
r15470 r15614 37 37 var res = ruleSetModel.VariablesUsedForPrediction.ToDictionary(x => x, x => 0); 38 38 foreach (var rule in ruleSetModel.Rules) 39 foreach (var att in rule.SplitAtt s)39 foreach (var att in rule.SplitAttributes) 40 40 res[att]++; 41 41 return res; … … 46 46 var root = treeModel.Root; 47 47 foreach (var cur in root.EnumerateNodes().Where(x => !x.IsLeaf)) 48 res[cur.SplitAttr ]++;48 res[cur.SplitAttribute]++; 49 49 return res; 50 50 } -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/M5Utilities/M5StaticUtilities.cs
r15549 r15614 21 21 22 22 using System; 23 using System.Collections.Generic; 24 using System.Linq; 23 25 using System.Threading; 26 using HeuristicLab.Common; 24 27 using HeuristicLab.Core; 25 28 using HeuristicLab.Data; 26 29 using HeuristicLab.Optimization; 30 using HeuristicLab.Problems.DataAnalysis; 27 31 28 32 namespace HeuristicLab.Algorithms.DataAnalysis { 29 33 internal static class M5StaticUtilities { 30 public static ResultCollection RunSubAlgorithm(IAlgorithm alg, int random, CancellationToken cancellation ) {34 public static ResultCollection RunSubAlgorithm(IAlgorithm alg, int random, CancellationToken cancellationToken) { 31 35 if (alg.Parameters.ContainsKey("SetSeedRandomly") && alg.Parameters.ContainsKey("Seed")) { 32 36 var seed = alg.Parameters["Seed"].ActualValue as IntValue; … … 38 42 } 39 43 if (alg.ExecutionState != ExecutionState.Paused) alg.Prepare(); 40 alg.Start(cancellation );44 alg.Start(cancellationToken); 41 45 return alg.Results; 46 } 47 48 public static void SplitRows(IReadOnlyList<int> rows, IDataset data, string splitAttr, double splitValue, out IReadOnlyList<int> leftRows, out IReadOnlyList<int> rightRows) { 49 //TODO check and revert points at borders are now used multipe times 50 var assignment = data.GetDoubleValues(splitAttr, rows).Select(x => x.IsAlmost(splitValue) ? 2 : x < splitValue ? 
0 : 1).ToArray(); 51 leftRows = rows.Zip(assignment, (i, b) => new {i, b}).Where(x => x.b == 0 || x.b == 2).Select(x => x.i).ToList(); 52 rightRows = rows.Zip(assignment, (i, b) => new {i, b}).Where(x => x.b > 0).Select(x => x.i).ToList(); 53 } 54 55 public static IRegressionModel BuildModel(IReadOnlyList<int> rows, M5Parameters parameters, ILeafModel leafModel, CancellationToken cancellation, out int numParams) { 56 var reducedData = ReduceDataset(parameters.Data, rows, parameters.AllowedInputVariables.ToArray(), parameters.TargetVariable); 57 var pd = new RegressionProblemData(reducedData, parameters.AllowedInputVariables.ToArray(), parameters.TargetVariable); 58 pd.TrainingPartition.Start = 0; 59 pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = reducedData.Rows; 60 61 int numP; 62 var model = leafModel.Build(pd, parameters.Random, cancellation, out numP); 63 numParams = numP; 64 cancellation.ThrowIfCancellationRequested(); 65 return model; 66 } 67 68 public static IDataset ReduceDataset(IDataset data, IReadOnlyList<int> rows, IReadOnlyList<string> inputVariables, string target) { 69 return new Dataset(inputVariables.Concat(new[] {target}), inputVariables.Concat(new[] {target}).Select(x => data.GetDoubleValues(x, rows).ToList())); 42 70 } 43 71 } -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/MetaModels/M5NodeModel.cs
r15470 r15614 37 37 internal bool IsLeaf { get; private set; } 38 38 [Storable] 39 internal IRegressionModel NodeModel { get; privateset; }39 internal IRegressionModel Model { get; set; } 40 40 [Storable] 41 internal string SplitAttr { get; private set; }41 internal string SplitAttribute { get; private set; } 42 42 [Storable] 43 43 internal double SplitValue { get; private set; } … … 47 47 internal M5NodeModel Right { get; private set; } 48 48 [Storable] 49 internal M5NodeModel Parent { get; set; }49 internal M5NodeModel Parent { get; private set; } 50 50 [Storable] 51 51 internal int NumSamples { get; private set; } 52 52 [Storable] 53 internal int NumParam { get; set; } 54 [Storable] 55 internal int NodeModelParams { get; set; } 56 [Storable] 57 private IReadOnlyList<string> Variables { get; set; } 53 private IReadOnlyList<string> variables; 58 54 #endregion 59 55 … … 63 59 protected M5NodeModel(M5NodeModel original, Cloner cloner) : base(original, cloner) { 64 60 IsLeaf = original.IsLeaf; 65 NodeModel = cloner.Clone(original.NodeModel);61 Model = cloner.Clone(original.Model); 66 62 SplitValue = original.SplitValue; 67 SplitAttr = original.SplitAttr;63 SplitAttribute = original.SplitAttribute; 68 64 Left = cloner.Clone(original.Left); 69 65 Right = cloner.Clone(original.Right); 70 66 Parent = cloner.Clone(original.Parent); 71 NumParam = original.NumParam;72 67 NumSamples = original.NumSamples; 73 Variables = original.Variables != null ? original.Variables.ToList() : null;68 variables = original.variables != null ? 
original.variables.ToList() : null; 74 69 } 75 pr otectedM5NodeModel(string targetAttr) : base(targetAttr) { }76 pr otected M5NodeModel(M5NodeModel parent) : base(parent.TargetVariable) {70 private M5NodeModel(string targetAttr) : base(targetAttr) { } 71 private M5NodeModel(M5NodeModel parent) : this(parent.TargetVariable) { 77 72 Parent = parent; 78 73 } … … 80 75 return new M5NodeModel(this, cloner); 81 76 } 82 public static M5NodeModel CreateNode(string targetAttr, M5 CreationParameters m5CreationParams) {83 return m5 CreationParams.LeafType is ILeafType<IConfidenceRegressionModel>? new ConfidenceM5NodeModel(targetAttr) : new M5NodeModel(targetAttr);77 public static M5NodeModel CreateNode(string targetAttr, M5Parameters m5Params) { 78 return m5Params.LeafModel.ProvidesConfidence ? new ConfidenceM5NodeModel(targetAttr) : new M5NodeModel(targetAttr); 84 79 } 85 private static M5NodeModel CreateNode(M5NodeModel parent, M5 CreationParameters m5CreationParams) {86 return m5 CreationParams.LeafType is ILeafType<IConfidenceRegressionModel>? new ConfidenceM5NodeModel(parent) : new M5NodeModel(parent);80 private static M5NodeModel CreateNode(M5NodeModel parent, M5Parameters m5Params) { 81 return m5Params.LeafModel.ProvidesConfidence ? 
new ConfidenceM5NodeModel(parent) : new M5NodeModel(parent); 87 82 } 88 83 #endregion … … 90 85 #region RegressionModel 91 86 public override IEnumerable<string> VariablesUsedForPrediction { 92 get { return Variables; }87 get { return variables; } 93 88 } 94 89 public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) { 95 90 if (!IsLeaf) return rows.Select(row => GetEstimatedValue(dataset, row)); 96 if ( NodeModel == null) throw new NotSupportedException("M5Phas not been built correctly");97 return NodeModel.GetEstimatedValues(dataset, rows);91 if (Model == null) throw new NotSupportedException("The model has not been built correctly"); 92 return Model.GetEstimatedValues(dataset, rows); 98 93 } 99 94 public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) { … … 102 97 #endregion 103 98 104 internal void Split(IReadOnlyList<int> rows, M5 CreationParameters m5CreationParams, double globalStdDev) {105 Variables = m5CreationParams.AllowedInputVariables.ToArray();99 internal void Split(IReadOnlyList<int> rows, M5Parameters m5Params) { 100 variables = m5Params.AllowedInputVariables.ToArray(); 106 101 NumSamples = rows.Count; 107 102 Right = null; 108 103 Left = null; 109 NodeModel = null;110 SplitAttr = null;104 Model = null; 105 SplitAttribute = null; 111 106 SplitValue = double.NaN; 112 107 string attr; 113 108 double splitValue; 114 //IsLeaf = m5CreationParams.Data.GetDoubleValues(TargetVariable, rows).StandardDeviation() < globalStdDev * DevFraction; 115 //if (IsLeaf) return; 116 IsLeaf = !m5CreationParams.Split.Split(new RegressionProblemData(ReduceDataset(m5CreationParams.Data, rows), Variables, TargetVariable), m5CreationParams.MinLeafSize, out attr, out splitValue); 109 IsLeaf = !m5Params.Spliter.Split(new RegressionProblemData(M5StaticUtilities.ReduceDataset(m5Params.Data, rows, variables, TargetVariable), variables, TargetVariable), m5Params.MinLeafSize, out attr, out splitValue); 
117 110 if (IsLeaf) return; 118 111 119 112 //split Dataset 120 113 IReadOnlyList<int> leftRows, rightRows; 121 SplitRows(rows, m5CreationParams.Data, attr, splitValue, out leftRows, out rightRows);114 M5StaticUtilities.SplitRows(rows, m5Params.Data, attr, splitValue, out leftRows, out rightRows); 122 115 123 if (leftRows.Count < m5 CreationParams.MinLeafSize || rightRows.Count < m5CreationParams.MinLeafSize) {116 if (leftRows.Count < m5Params.MinLeafSize || rightRows.Count < m5Params.MinLeafSize) { 124 117 IsLeaf = true; 125 118 return; 126 119 } 127 SplitAttr = attr;120 SplitAttribute = attr; 128 121 SplitValue = splitValue; 129 122 130 123 //create subtrees 131 Left = CreateNode(this, m5 CreationParams);132 Left.Split(leftRows, m5 CreationParams, globalStdDev);133 Right = CreateNode(this, m5 CreationParams);134 Right.Split(rightRows, m5 CreationParams, globalStdDev);124 Left = CreateNode(this, m5Params); 125 Left.Split(leftRows, m5Params); 126 Right = CreateNode(this, m5Params); 127 Right.Split(rightRows, m5Params); 135 128 } 136 129 137 internal bool Prune(IReadOnlyList<int> trainingRows, IReadOnlyList<int> testRows, M5CreationParameters m5CreationParams, CancellationToken cancellation, double globalStdDev) { 138 if (IsLeaf) { 139 BuildModel(trainingRows, m5CreationParams.Data, m5CreationParams.Random, m5CreationParams.PruningLeaf, cancellation); 140 NumParam = NodeModelParams; 141 return true; 142 } 143 //split training & holdout data 144 IReadOnlyList<int> leftTest, rightTest; 145 SplitRows(testRows, m5CreationParams.Data, SplitAttr, SplitValue, out leftTest, out rightTest); 146 IReadOnlyList<int> leftTraining, rightTraining; 147 SplitRows(trainingRows, m5CreationParams.Data, SplitAttr, SplitValue, out leftTraining, out rightTraining); 148 149 //prune children frist 150 var lpruned = Left.Prune(leftTraining, leftTest, m5CreationParams, cancellation, globalStdDev); 151 var rpruned = Right.Prune(rightTraining, rightTest, m5CreationParams, cancellation, 
globalStdDev); 152 NumParam = Left.NumParam + Right.NumParam + 1; 153 154 //TODO check if this reduces quality. It reduces training effort (consideraby for some pruningTypes) 155 if (!lpruned && !rpruned) return false; 156 157 BuildModel(trainingRows, m5CreationParams.Data, m5CreationParams.Random, m5CreationParams.PruningLeaf, cancellation); 158 159 //check if children will be pruned 160 if (!((PruningBase) m5CreationParams.Pruningtype).Prune(this, m5CreationParams, testRows, globalStdDev)) return false; 161 162 //convert to leafNode 163 ((IntValue) m5CreationParams.Results[M5RuleModel.NoCurrentLeafesResultName].Value).Value -= EnumerateNodes().Count(x => x.IsLeaf) - 1; 130 internal void ToLeaf() { 164 131 IsLeaf = true; 165 132 Right = null; 166 133 Left = null; 167 NumParam = NodeModelParams;168 return true;169 134 } 170 135 171 internal void InstallModels(IReadOnlyList<int> rows, IRandom random, IDataset data, ILeafType<IRegressionModel> leafType, CancellationToken cancellation) {136 internal void BuildLeafModels(IReadOnlyList<int> rows, M5Parameters parameters, CancellationToken cancellationToken) { 172 137 if (!IsLeaf) { 173 138 IReadOnlyList<int> leftRows, rightRows; 174 SplitRows(rows, data, SplitAttr, SplitValue, out leftRows, out rightRows);175 Left. InstallModels(leftRows, random, data, leafType, cancellation);176 Right. 
InstallModels(rightRows, random, data, leafType, cancellation);139 M5StaticUtilities.SplitRows(rows, parameters.Data, SplitAttribute, SplitValue, out leftRows, out rightRows); 140 Left.BuildLeafModels(leftRows, parameters, cancellationToken); 141 Right.BuildLeafModels(rightRows, parameters, cancellationToken); 177 142 return; 178 143 } 179 BuildModel(rows, data, random, leafType, cancellation); 144 int numP; 145 Model = M5StaticUtilities.BuildModel(rows, parameters, parameters.LeafModel, cancellationToken, out numP); 180 146 } 181 147 … … 192 158 } 193 159 194 internal void ToRuleNode() {195 Parent = null;196 }197 198 160 #region Helpers 199 161 private double GetEstimatedValue(IDataset dataset, int row) { 200 if (!IsLeaf) return (dataset.GetDoubleValue(SplitAttr, row) <= SplitValue ? Left : Right).GetEstimatedValue(dataset, row); 201 if (NodeModel == null) throw new NotSupportedException("M5P has not been built correctly"); 202 return NodeModel.GetEstimatedValues(dataset, new[] {row}).First(); 203 } 204 205 private void BuildModel(IReadOnlyList<int> rows, IDataset data, IRandom random, ILeafType<IRegressionModel> leafType, CancellationToken cancellation) { 206 var reducedData = ReduceDataset(data, rows); 207 var pd = new RegressionProblemData(reducedData, VariablesUsedForPrediction, TargetVariable); 208 pd.TrainingPartition.Start = 0; 209 pd.TrainingPartition.End = pd.TestPartition.Start = pd.TestPartition.End = reducedData.Rows; 210 211 int noparams; 212 NodeModel = leafType.BuildModel(pd, random, cancellation, out noparams); 213 NodeModelParams = noparams; 214 cancellation.ThrowIfCancellationRequested(); 215 } 216 217 private IDataset ReduceDataset(IDataset data, IReadOnlyList<int> rows) { 218 return new Dataset(VariablesUsedForPrediction.Concat(new[] {TargetVariable}), VariablesUsedForPrediction.Concat(new[] {TargetVariable}).Select(x => data.GetDoubleValues(x, rows).ToList())); 219 } 220 221 private static void SplitRows(IReadOnlyList<int> rows, IDataset data, 
string splitAttr, double splitValue, out IReadOnlyList<int> leftRows, out IReadOnlyList<int> rightRows) { 222 var assignment = data.GetDoubleValues(splitAttr, rows).Select(x => x <= splitValue).ToArray(); 223 leftRows = rows.Zip(assignment, (i, b) => new {i, b}).Where(x => x.b).Select(x => x.i).ToList(); 224 rightRows = rows.Zip(assignment, (i, b) => new {i, b}).Where(x => !x.b).Select(x => x.i).ToList(); 162 if (!IsLeaf) return (dataset.GetDoubleValue(SplitAttribute, row) <= SplitValue ? Left : Right).GetEstimatedValue(dataset, row); 163 if (Model == null) throw new NotSupportedException("The model has not been built correctly"); 164 return Model.GetEstimatedValues(dataset, new[] {row}).First(); 225 165 } 226 166 #endregion … … 240 180 241 181 public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) { 242 return IsLeaf ? ((IConfidenceRegressionModel) NodeModel).GetEstimatedVariances(dataset, rows) : rows.Select(row => GetEstimatedVariance(dataset, row));182 return IsLeaf ? ((IConfidenceRegressionModel)Model).GetEstimatedVariances(dataset, rows) : rows.Select(row => GetEstimatedVariance(dataset, row)); 243 183 } 244 184 245 185 private double GetEstimatedVariance(IDataset dataset, int row) { 246 186 if (!IsLeaf) 247 return ((IConfidenceRegressionModel) (dataset.GetDoubleValue(SplitAttr, row) <= SplitValue ? Left : Right)).GetEstimatedVariances(dataset, row.ToEnumerable()).Single();248 return ((IConfidenceRegressionModel) NodeModel).GetEstimatedVariances(dataset, new[] {row}).First();187 return ((IConfidenceRegressionModel)(dataset.GetDoubleValue(SplitAttribute, row) <= SplitValue ? Left : Right)).GetEstimatedVariances(dataset, row.ToEnumerable()).Single(); 188 return ((IConfidenceRegressionModel)Model).GetEstimatedVariances(dataset, new[] {row}).First(); 249 189 } 250 190 -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/MetaModels/M5RuleModel.cs
r15430 r15614 32 32 namespace HeuristicLab.Algorithms.DataAnalysis { 33 33 [StorableClass] 34 internal class M5RuleModel : RegressionModel, IM5MetaModel { 35 internal const string NoCurrentLeafesResultName = "Number of current Leafs"; 36 34 internal class M5RuleModel : RegressionModel { 37 35 #region Properties 38 36 [Storable] 39 internal string[] SplitAtt s { get; private set; }37 internal string[] SplitAttributes { get; private set; } 40 38 [Storable] 41 private double[] SplitVals { get; set; }39 private double[] splitValues; 42 40 [Storable] 43 private RelOp[] RelOps { get; set; }41 private Comparison[] comparisons; 44 42 [Storable] 45 43 protected IRegressionModel RuleModel { get; set; } 46 44 [Storable] 47 private IReadOnlyList<string> Variables { get; set; }45 private IReadOnlyList<string> variables; 48 46 #endregion 49 47 … … 52 50 protected M5RuleModel(bool deserializing) : base(deserializing) { } 53 51 protected M5RuleModel(M5RuleModel original, Cloner cloner) : base(original, cloner) { 54 if (original.SplitAtt s != null) SplitAtts = original.SplitAtts.ToArray();55 if (original. SplitVals != null) SplitVals = original.SplitVals.ToArray();56 if (original. RelOps != null) RelOps = original.RelOps.ToArray();52 if (original.SplitAttributes != null) SplitAttributes = original.SplitAttributes.ToArray(); 53 if (original.splitValues != null) splitValues = original.splitValues.ToArray(); 54 if (original.comparisons != null) comparisons = original.comparisons.ToArray(); 57 55 RuleModel = cloner.Clone(original.RuleModel); 58 if (original. Variables != null) Variables = original.Variables.ToList();56 if (original.variables != null) variables = original.variables.ToList(); 59 57 } 60 58 private M5RuleModel(string target) : base(target) { } … … 64 62 #endregion 65 63 66 internal static M5RuleModel CreateRuleModel(string target, M5 CreationParameters m5CreationParams) {67 return m5 CreationParams.LeafType is ILeafType<IConfidenceRegressionModel>? 
new ConfidenceM5RuleModel(target) : new M5RuleModel(target);64 internal static M5RuleModel CreateRuleModel(string target, M5Parameters m5Params) { 65 return m5Params.LeafModel.ProvidesConfidence ? new ConfidenceM5RuleModel(target) : new M5RuleModel(target); 68 66 } 69 67 70 68 #region IRegressionModel 71 69 public override IEnumerable<string> VariablesUsedForPrediction { 72 get { return Variables; }70 get { return variables; } 73 71 } 74 72 75 73 public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) { 76 if (RuleModel == null) throw new NotSupportedException(" M5Phas not been built correctly");74 if (RuleModel == null) throw new NotSupportedException("The model has not been built correctly"); 77 75 return RuleModel.GetEstimatedValues(dataset, rows); 78 76 } … … 83 81 #endregion 84 82 85 #region IM5Component 86 public void Build Classifier(IReadOnlyList<int> trainingRows, IReadOnlyList<int> holdoutRows, M5CreationParameters m5CreationParams, CancellationToken cancellation) {87 Variables = m5CreationParams.AllowedInputVariables.ToList();88 var tree = M5TreeModel.CreateTreeModel(m5 CreationParams.TargetVariable, m5CreationParams);89 ((IM5MetaModel) tree).BuildClassifier(trainingRows, holdoutRows, m5CreationParams, cancellation);83 84 public void Build(IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) { 85 variables = m5Params.AllowedInputVariables.ToList(); 86 var tree = M5TreeModel.CreateTreeModel(m5Params.TargetVariable, m5Params); 87 tree.Build(trainingRows, pruningRows, m5Params, cancellationToken); 90 88 var nodeModel = tree.Root.EnumerateNodes().Where(x => x.IsLeaf).MaxItems(x => x.NumSamples).First(); 91 89 92 90 var satts = new List<string>(); 93 91 var svals = new List<double>(); 94 var reops = new List< RelOp>();92 var reops = new List<Comparison>(); 95 93 96 94 //extract Splits 97 95 for (var temp = nodeModel; temp.Parent != null; temp = 
temp.Parent) { 98 satts.Add(temp.Parent.SplitAttr );96 satts.Add(temp.Parent.SplitAttribute); 99 97 svals.Add(temp.Parent.SplitValue); 100 reops.Add(temp.Parent.Left == temp ? RelOp.Lessequal : RelOp.Greater);98 reops.Add(temp.Parent.Left == temp ? Comparison.LessEqual : Comparison.Greater); 101 99 } 102 nodeModel.ToRuleNode(); 103 RuleModel = nodeModel.NodeModel; 104 RelOps = reops.ToArray(); 105 SplitAtts = satts.ToArray(); 106 SplitVals = svals.ToArray(); 100 RuleModel = nodeModel.Model; 101 comparisons = reops.ToArray(); 102 SplitAttributes = satts.ToArray(); 103 splitValues = svals.ToArray(); 107 104 } 108 105 109 public void Update Model(IReadOnlyList<int> rows, M5UpdateParameters m5UpdateParameters, CancellationToken cancellation) {110 BuildModel(rows, m5 UpdateParameters.Random, m5UpdateParameters.Data, m5UpdateParameters.LeafType, cancellation);106 public void Update(IReadOnlyList<int> rows, M5Parameters m5Parameters, CancellationToken cancellationToken) { 107 BuildModel(rows, m5Parameters.Random, m5Parameters.Data, m5Parameters.LeafModel, cancellationToken); 111 108 } 112 #endregion113 109 114 110 public bool Covers(IDataset dataset, int row) { 115 return !SplitAtt s.Where((t, i) => !RelOps[i].Compare(dataset.GetDoubleValue(t, row), SplitVals[i])).Any();111 return !SplitAttributes.Where((t, i) => !comparisons[i].Compare(dataset.GetDoubleValue(t, row), splitValues[i])).Any(); 116 112 } 117 113 … … 119 115 var mins = new Dictionary<string, double>(); 120 116 var maxs = new Dictionary<string, double>(); 121 for (var i = 0; i < SplitAtt s.Length; i++) {122 var n = SplitAtt s[i];123 var v = SplitVals[i];117 for (var i = 0; i < SplitAttributes.Length; i++) { 118 var n = SplitAttributes[i]; 119 var v = splitValues[i]; 124 120 if (!mins.ContainsKey(n)) mins.Add(n, double.NegativeInfinity); 125 121 if (!maxs.ContainsKey(n)) maxs.Add(n, double.PositiveInfinity); 126 if ( RelOps[i] == RelOp.Lessequal) maxs[n] = Math.Min(maxs[n], v);122 if (comparisons[i] == 
Comparison.LessEqual) maxs[n] = Math.Min(maxs[n], v); 127 123 else mins[n] = Math.Max(mins[n], v); 128 124 } … … 136 132 137 133 #region Helpers 138 private void BuildModel(IReadOnlyList<int> rows, IRandom random, IDataset data, ILeaf Type<IRegressionModel> leafType, CancellationToken cancellation) {134 private void BuildModel(IReadOnlyList<int> rows, IRandom random, IDataset data, ILeafModel leafModel, CancellationToken cancellationToken) { 139 135 var reducedData = new Dataset(VariablesUsedForPrediction.Concat(new[] {TargetVariable}), VariablesUsedForPrediction.Concat(new[] {TargetVariable}).Select(x => data.GetDoubleValues(x, rows).ToList())); 140 136 var pd = new RegressionProblemData(reducedData, VariablesUsedForPrediction, TargetVariable); … … 143 139 144 140 int noparams; 145 RuleModel = leaf Type.BuildModel(pd, random, cancellation, out noparams);146 cancellation .ThrowIfCancellationRequested();141 RuleModel = leafModel.Build(pd, random, cancellationToken, out noparams); 142 cancellationToken.ThrowIfCancellationRequested(); 147 143 } 148 144 #endregion … … 161 157 162 158 public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) { 163 return ((IConfidenceRegressionModel) 159 return ((IConfidenceRegressionModel)RuleModel).GetEstimatedVariances(dataset, rows); 164 160 } 165 161 … … 170 166 } 171 167 172 internal enum RelOp{173 Less equal,168 internal enum Comparison { 169 LessEqual, 174 170 Greater 175 171 } 176 172 177 internal static class RelOpExtentions {178 public static bool Compare(this RelOpop, double x, double y) {173 internal static class ComparisonExtentions { 174 public static bool Compare(this Comparison op, double x, double y) { 179 175 switch (op) { 180 case RelOp.Greater:176 case Comparison.Greater: 181 177 return x > y; 182 case RelOp.Lessequal:178 case Comparison.LessEqual: 183 179 return x <= y; 184 180 default: -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/MetaModels/M5RuleSetModel.cs
r15430 r15614 32 32 namespace HeuristicLab.Algorithms.DataAnalysis { 33 33 [StorableClass] 34 public class M5RuleSetModel : RegressionModel, IM5MetaModel {35 private const string N oRulesResultName = "Number of Rules";36 private const string CoveredInstancesResultName = "Covered Instances";34 internal class M5RuleSetModel : RegressionModel, IM5Model { 35 private const string NumRulesResultName = "Number of rules"; 36 private const string CoveredInstancesResultName = "Covered instances"; 37 37 38 38 #region Properties … … 53 53 #endregion 54 54 55 internal static M5RuleSetModel CreateRuleModel(string targetAttr, M5 CreationParameters m5CreationParams) {56 return m5 CreationParams.LeafType is ILeafType<IConfidenceRegressionModel>? new ConfidenceM5RuleSetModel(targetAttr) : new M5RuleSetModel(targetAttr);55 internal static M5RuleSetModel CreateRuleModel(string targetAttr, M5Parameters m5Params) { 56 return m5Params.LeafModel.ProvidesConfidence ? new ConfidenceM5RuleSetModel(targetAttr) : new M5RuleSetModel(targetAttr); 57 57 } 58 58 … … 65 65 } 66 66 public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) { 67 if (Rules == null) throw new NotSupportedException("The classifierhas not been built yet");67 if (Rules == null) throw new NotSupportedException("The model has not been built yet"); 68 68 return rows.Select(row => GetEstimatedValue(dataset, row)); 69 69 } … … 73 73 #endregion 74 74 75 #region IM5 Component76 void IM5MetaModel.BuildClassifier(IReadOnlyList<int> trainingRows, IReadOnlyList<int> holdoutRows, M5CreationParameters m5CreationParams, CancellationToken cancellation) {75 #region IM5Model 76 public void Build(IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) { 77 77 Rules = new List<M5RuleModel>(); 78 78 var tempTraining = trainingRows; 79 var temp Holdout = holdoutRows;79 var tempPruning = pruningRows; 80 80 do { 81 var tempRule = 
M5RuleModel.CreateRuleModel(m5 CreationParams.TargetVariable, m5CreationParams);82 cancellation .ThrowIfCancellationRequested();81 var tempRule = M5RuleModel.CreateRuleModel(m5Params.TargetVariable, m5Params); 82 cancellationToken.ThrowIfCancellationRequested(); 83 83 84 if (!m5 CreationParams.Results.ContainsKey(NoRulesResultName)) m5CreationParams.Results.Add(new Result(NoRulesResultName, new IntValue(0)));85 if (!m5 CreationParams.Results.ContainsKey(CoveredInstancesResultName)) m5CreationParams.Results.Add(new Result(CoveredInstancesResultName, new IntValue(0)));84 if (!m5Params.Results.ContainsKey(NumRulesResultName)) m5Params.Results.Add(new Result(NumRulesResultName, new IntValue(0))); 85 if (!m5Params.Results.ContainsKey(CoveredInstancesResultName)) m5Params.Results.Add(new Result(CoveredInstancesResultName, new IntValue(0))); 86 86 87 87 var t1 = tempTraining.Count; 88 tempRule.Build Classifier(tempTraining, tempHoldout, m5CreationParams, cancellation);89 tempTraining = tempTraining.Where(i => !tempRule.Covers(m5 CreationParams.Data, i)).ToArray();90 temp Holdout = tempHoldout.Where(i => !tempRule.Covers(m5CreationParams.Data, i)).ToArray();88 tempRule.Build(tempTraining, tempPruning, m5Params, cancellationToken); 89 tempTraining = tempTraining.Where(i => !tempRule.Covers(m5Params.Data, i)).ToArray(); 90 tempPruning = tempPruning.Where(i => !tempRule.Covers(m5Params.Data, i)).ToArray(); 91 91 Rules.Add(tempRule); 92 ((IntValue) m5CreationParams.Results[NoRulesResultName].Value).Value++;93 ((IntValue) m5CreationParams.Results[CoveredInstancesResultName].Value).Value += t1 - tempTraining.Count;92 ((IntValue)m5Params.Results[NumRulesResultName].Value).Value++; 93 ((IntValue)m5Params.Results[CoveredInstancesResultName].Value).Value += t1 - tempTraining.Count; 94 94 } 95 95 while (tempTraining.Count > 0); 96 96 } 97 97 98 void IM5MetaModel.UpdateModel(IReadOnlyList<int> rows, M5UpdateParameters m5UpdateParameters, CancellationToken cancellation) {99 foreach 
(var rule in Rules) rule.Update Model(rows, m5UpdateParameters, cancellation);98 public void Update(IReadOnlyList<int> rows, M5Parameters m5Parameters, CancellationToken cancellationToken) { 99 foreach (var rule in Rules) rule.Update(rows, m5Parameters, cancellationToken); 100 100 } 101 101 #endregion … … 104 104 private double GetEstimatedValue(IDataset dataset, int row) { 105 105 foreach (var rule in Rules) { 106 var prediction = rule.GetEstimatedValues(dataset, row.ToEnumerable()).Single();107 if (rule.Covers(dataset, row)) return prediction;106 if (rule.Covers(dataset, row)) 107 return rule.GetEstimatedValues(dataset, row.ToEnumerable()).Single(); 108 108 } 109 109 throw new ArgumentException("Instance is not covered by any rule"); … … 125 125 #region IConfidenceRegressionModel 126 126 public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) { 127 if (Rules == null) throw new NotSupportedException("The classifierhas not been built yet");127 if (Rules == null) throw new NotSupportedException("The model has not been built yet"); 128 128 return rows.Select(row => GetEstimatedVariance(dataset, row)); 129 129 } … … 133 133 private double GetEstimatedVariance(IDataset dataset, int row) { 134 134 foreach (var rule in Rules) { 135 var prediction = ((IConfidenceRegressionModel) rule).GetEstimatedVariances(dataset, row.ToEnumerable()).Single(); 136 if (rule.Covers(dataset, row)) return prediction; 135 if (rule.Covers(dataset, row)) return ((IConfidenceRegressionModel)rule).GetEstimatedVariances(dataset, row.ToEnumerable()).Single(); 137 136 } 138 137 throw new ArgumentException("Instance is not covered by any rule"); -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/MetaModels/M5TreeModel.cs
r15430 r15614 32 32 namespace HeuristicLab.Algorithms.DataAnalysis { 33 33 [StorableClass] 34 public class M5TreeModel : RegressionModel, IM5MetaModel {35 p rivate const string NoCurrentLeafesResultName = "Number of current Leafs";34 internal class M5TreeModel : RegressionModel, IM5Model { 35 public const string NumCurrentLeafsResultName = "Number of current leafs"; 36 36 #region Properties 37 37 [Storable] 38 38 internal M5NodeModel Root { get; private set; } 39 //[Storable]40 //private M5Parameters M5Params { get; set; }41 39 #endregion 42 40 … … 53 51 #endregion 54 52 55 internal static M5TreeModel CreateTreeModel(string targetAttr, M5 CreationParameters m5CreationParams) {56 return m5 CreationParams.LeafType is ILeafType<IConfidenceRegressionModel>? new ConfidenceM5TreeModel(targetAttr) : new M5TreeModel(targetAttr);53 internal static M5TreeModel CreateTreeModel(string targetAttr, M5Parameters m5Params) { 54 return m5Params.LeafModel.ProvidesConfidence ? new ConfidenceM5TreeModel(targetAttr) : new M5TreeModel(targetAttr); 57 55 } 58 56 … … 62 60 } 63 61 public override IEnumerable<double> GetEstimatedValues(IDataset dataset, IEnumerable<int> rows) { 64 if (Root == null) throw new NotSupportedException("The classifierhas not been built yet");62 if (Root == null) throw new NotSupportedException("The model has not been built yet"); 65 63 return Root.GetEstimatedValues(dataset, rows); 66 64 } … … 70 68 #endregion 71 69 72 #region IM5Component 73 void IM5MetaModel.BuildClassifier(IReadOnlyList<int> trainingRows, IReadOnlyList<int> holdoutRows, M5CreationParameters m5CreationParams, CancellationToken cancellation) { 74 Root = null; 75 var globalStdDev = m5CreationParams.Data.GetDoubleValues(m5CreationParams.TargetVariable, trainingRows).StandardDeviationPop(); 76 Root = M5NodeModel.CreateNode(m5CreationParams.TargetVariable, m5CreationParams); 77 Root.Split(trainingRows, m5CreationParams, globalStdDev); 78 InitializeLeafCounter(m5CreationParams); 79 if 
(!(m5CreationParams.Pruningtype is NoPruning)) Root.Prune(trainingRows, holdoutRows, m5CreationParams, cancellation, globalStdDev); 80 Root.InstallModels(trainingRows.Union(holdoutRows).ToArray(), m5CreationParams.Random, m5CreationParams.Data, m5CreationParams.LeafType, cancellation); 70 #region IM5Model 71 public void Build(IReadOnlyList<int> trainingRows, IReadOnlyList<int> pruningRows, M5Parameters m5Params, CancellationToken cancellationToken) { 72 Root = M5NodeModel.CreateNode(m5Params.TargetVariable, m5Params); 73 Root.Split(trainingRows, m5Params); 74 75 InitializeLeafCounter(m5Params); 76 77 var buPruner = m5Params.Pruning as BottomUpPruningBase; 78 if (buPruner != null) buPruner.Prune(this, trainingRows, pruningRows, m5Params, cancellationToken); 79 80 Root.BuildLeafModels(trainingRows.Union(pruningRows).ToArray(), m5Params, cancellationToken); 81 81 } 82 82 83 void IM5MetaModel.UpdateModel(IReadOnlyList<int> rows, M5UpdateParameters m5UpdateParameters, CancellationToken cancellation) {84 Root. 
InstallModels(rows, m5UpdateParameters.Random, m5UpdateParameters.Data, m5UpdateParameters.LeafType, cancellation);83 public void Update(IReadOnlyList<int> rows, M5Parameters m5Parameters, CancellationToken cancellationToken) { 84 Root.BuildLeafModels(rows, m5Parameters, cancellationToken); 85 85 } 86 86 #endregion 87 87 88 88 #region Helpers 89 private void InitializeLeafCounter(M5 CreationParameters m5CreationParams) {90 if (!m5 CreationParams.Results.ContainsKey(NoCurrentLeafesResultName))91 m5 CreationParams.Results.Add(new Result(NoCurrentLeafesResultName, new IntValue(Root.EnumerateNodes().Count(x => x.IsLeaf))));92 else ((IntValue) m5CreationParams.Results[NoCurrentLeafesResultName].Value).Value = Root.EnumerateNodes().Count(x => x.IsLeaf);89 private void InitializeLeafCounter(M5Parameters m5Params) { 90 if (!m5Params.Results.ContainsKey(NumCurrentLeafsResultName)) 91 m5Params.Results.Add(new Result(NumCurrentLeafsResultName, new IntValue(Root.EnumerateNodes().Count(x => x.IsLeaf)))); 92 else ((IntValue)m5Params.Results[NumCurrentLeafsResultName].Value).Value = Root.EnumerateNodes().Count(x => x.IsLeaf); 93 93 } 94 94 #endregion … … 107 107 108 108 public IEnumerable<double> GetEstimatedVariances(IDataset dataset, IEnumerable<int> rows) { 109 if (Root == null) throw new NotSupportedException("The classifierhas not been built yet");110 return ((IConfidenceRegressionModel) 109 if (Root == null) throw new NotSupportedException("The model has not been built yet"); 110 return ((IConfidenceRegressionModel)Root).GetEstimatedVariances(dataset, rows); 111 111 } 112 112 public override IRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData) { -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/Pruning/NoPruning.cs
r15470 r15614 29 29 [StorableClass] 30 30 [Item("NoPruning", "No pruning")] 31 public class NoPruning : P runingBase{31 public class NoPruning : ParameterizedNamedItem, IPruning { 32 32 #region Constructors & Cloning 33 33 [StorableConstructor] 34 34 private NoPruning(bool deserializing) : base(deserializing) { } 35 35 private NoPruning(NoPruning original, Cloner cloner) : base(original, cloner) { } 36 public NoPruning() { 37 PruningStrengthParameter.Hidden = true; 38 } 36 public NoPruning() { } 39 37 public override IDeepCloneable Clone(Cloner cloner) { 40 38 return new NoPruning(this, cloner); 41 39 } 42 #endregion 43 44 #region IPruningType 45 public override ILeafType<IRegressionModel> ModelType(ILeafType<IRegressionModel> leafType) { 46 return null; 47 } 48 49 public override void GenerateHoldOutSet(IReadOnlyList<int> allrows, IRandom random, out IReadOnlyList<int> training, out IReadOnlyList<int> holdout) { 50 training = allrows; 51 holdout = allrows; 52 } 53 internal override bool Prune(M5NodeModel node, M5CreationParameters m5CreationParams, IReadOnlyList<int> testRows, double globalStdDev) { 54 return false; 40 public int MinLeafSize(IRegressionProblemData pd, ILeafModel leafModel) { 41 return 0; 55 42 } 56 43 #endregion -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/M5Regression/Spliting/OrderImpurityCalculator.cs
r15470 r15614 28 28 /// <summary> 29 29 /// Helper class for incremental split calculation. 30 /// Used while moving a potential Split along the ordered training Instances30 /// Used while moving a potential Spliter along the ordered training Instances 31 31 /// </summary> 32 32 internal class OrderImpurityCalculator { … … 105 105 VarRight = NoRight <= 0 ? 0 : Math.Abs(NoRight * SqSumRight - SumRight * SumRight) / (NoRight * NoRight); 106 106 107 if (Order <= 0) throw new ArgumentException("Split order must be larger than 0");107 if (Order <= 0) throw new ArgumentException("Spliter order must be larger than 0"); 108 108 if (Order.IsAlmost(1)) { 109 109 y = VarTotal; … … 117 117 } 118 118 var t = NoRight + NoLeft; 119 if (NoLeft <= 0.0 || NoRight <= 0.0) Impurity = double.MinValue; //Split = 0;120 else Impurity = y - NoLeft / t * yl - NoRight / t * yr; // Split = y - NoLeft / NoRight * yl - NoRight / NoLeft * yr119 if (NoLeft <= 0.0 || NoRight <= 0.0) Impurity = double.MinValue; //Spliter = 0; 120 else Impurity = y - NoLeft / t * yl - NoRight / t * yr; // Spliter = y - NoLeft / NoRight * yl - NoRight / NoLeft * yr 121 121 } 122 122 #endregion -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/NonlinearRegression/NonlinearRegression.cs
r14826 r15614 51 51 private const string SeedParameterName = "Seed"; 52 52 private const string InitParamsRandomlyParameterName = "InitializeParametersRandomly"; 53 private const string ApplyLinearScalingParameterName = "Apply linear scaling"; 53 54 54 55 public IFixedValueParameter<StringValue> ModelStructureParameter { … … 73 74 public IFixedValueParameter<BoolValue> InitParametersRandomlyParameter { 74 75 get { return (IFixedValueParameter<BoolValue>)Parameters[InitParamsRandomlyParameterName]; } 76 } 77 78 public IFixedValueParameter<BoolValue> ApplyLinearScalingParameter { 79 get { return (IFixedValueParameter<BoolValue>)Parameters[ApplyLinearScalingParameterName]; } 75 80 } 76 81 … … 103 108 get { return InitParametersRandomlyParameter.Value.Value; } 104 109 set { InitParametersRandomlyParameter.Value.Value = value; } 110 } 111 112 public bool ApplyLinearScaling { 113 get { return ApplyLinearScalingParameter.Value.Value; } 114 set { ApplyLinearScalingParameter.Value.Value = value; } 105 115 } 106 116 … … 119 129 Parameters.Add(new FixedValueParameter<BoolValue>(SetSeedRandomlyParameterName, "Switch to determine if the random number seed should be initialized randomly.", new BoolValue(true))); 120 130 Parameters.Add(new FixedValueParameter<BoolValue>(InitParamsRandomlyParameterName, "Switch to determine if the real-valued model parameters should be initialized randomly in each restart.", new BoolValue(false))); 131 Parameters.Add(new FixedValueParameter<BoolValue>(ApplyLinearScalingParameterName, "Switch to determine if linear scaling terms should be added to the model", new BoolValue(true))); 121 132 122 133 SetParameterHiddenState(); … … 146 157 if (!Parameters.ContainsKey(InitParamsRandomlyParameterName)) 147 158 Parameters.Add(new FixedValueParameter<BoolValue>(InitParamsRandomlyParameterName, "Switch to determine if the numeric parameters of the model should be initialized randomly.", new BoolValue(false))); 159 if 
(!Parameters.ContainsKey(ApplyLinearScalingParameterName)) 160 Parameters.Add(new FixedValueParameter<BoolValue>(ApplyLinearScalingParameterName, "Switch to determine if linear scaling terms should be added to the model", new BoolValue(true))); 161 148 162 149 163 SetParameterHiddenState(); … … 174 188 if (SetSeedRandomly) Seed = (new System.Random()).Next(); 175 189 var rand = new MersenneTwister((uint)Seed); 176 bestSolution = CreateRegressionSolution(Problem.ProblemData, ModelStructure, Iterations, rand);190 bestSolution = CreateRegressionSolution(Problem.ProblemData, ModelStructure, Iterations, ApplyLinearScaling, rand); 177 191 trainRMSERow.Values.Add(bestSolution.TrainingRootMeanSquaredError); 178 192 testRMSERow.Values.Add(bestSolution.TestRootMeanSquaredError); 179 193 for (int r = 0; r < Restarts; r++) { 180 var solution = CreateRegressionSolution(Problem.ProblemData, ModelStructure, Iterations, rand);194 var solution = CreateRegressionSolution(Problem.ProblemData, ModelStructure, Iterations, ApplyLinearScaling, rand); 181 195 trainRMSERow.Values.Add(solution.TrainingRootMeanSquaredError); 182 196 testRMSERow.Values.Add(solution.TestRootMeanSquaredError); … … 186 200 } 187 201 } else { 188 bestSolution = CreateRegressionSolution(Problem.ProblemData, ModelStructure, Iterations );202 bestSolution = CreateRegressionSolution(Problem.ProblemData, ModelStructure, Iterations, ApplyLinearScaling); 189 203 } 190 204 … … 206 220 /// <param name="random">Optional random number generator for random initialization of numeric constants.</param> 207 221 /// <returns></returns> 208 public static ISymbolicRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, string modelStructure, int maxIterations, IRandom rand = null) {222 public static ISymbolicRegressionSolution CreateRegressionSolution(IRegressionProblemData problemData, string modelStructure, int maxIterations, bool applyLinearScaling, IRandom rand = null) { 209 223 var parser = new 
InfixExpressionParser(); 210 224 var tree = parser.Parse(modelStructure); … … 262 276 263 277 SymbolicRegressionConstantOptimizationEvaluator.OptimizeConstants(interpreter, tree, problemData, problemData.TrainingIndices, 264 applyLinearScaling: false, maxIterations: maxIterations,278 applyLinearScaling: applyLinearScaling, maxIterations: maxIterations, 265 279 updateVariableWeights: false, updateConstantsInTree: true); 266 280 267 var scaledModel = new SymbolicRegressionModel(problemData.TargetVariable, tree, (ISymbolicDataAnalysisExpressionTreeInterpreter)interpreter.Clone()); 268 scaledModel.Scale(problemData); 269 SymbolicRegressionSolution solution = new SymbolicRegressionSolution(scaledModel, (IRegressionProblemData)problemData.Clone()); 281 var model = new SymbolicRegressionModel(problemData.TargetVariable, tree, (ISymbolicDataAnalysisExpressionTreeInterpreter)interpreter.Clone()); 282 if (applyLinearScaling) 283 model.Scale(problemData); 284 285 SymbolicRegressionSolution solution = new SymbolicRegressionSolution(model, (IRegressionProblemData)problemData.Clone()); 270 286 solution.Model.Name = "Regression Model"; 271 287 solution.Name = "Regression Solution"; -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/Plugin.cs.frame
r14195 r15614 37 37 [PluginDependency("HeuristicLab.Core", "3.3")] 38 38 [PluginDependency("HeuristicLab.Data", "3.3")] 39 [PluginDependency("HeuristicLab.Encodings.PermutationEncoding", "3.3")] 39 40 [PluginDependency("HeuristicLab.Encodings.RealVectorEncoding", "3.3")] 40 41 [PluginDependency("HeuristicLab.Encodings.SymbolicExpressionTreeEncoding", "3.4")] -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/RandomForest/RandomForestClassification.cs
r14523 r15614 152 152 public static RandomForestClassificationSolution CreateRandomForestClassificationSolution(IClassificationProblemData problemData, int nTrees, double r, double m, int seed, 153 153 out double rmsError, out double relClassificationError, out double outOfBagRmsError, out double outOfBagRelClassificationError) { 154 var model = CreateRandomForestClassificationModel(problemData, nTrees, r, m, seed, out rmsError, out relClassificationError, out outOfBagRmsError, out outOfBagRelClassificationError); 154 var model = CreateRandomForestClassificationModel(problemData, nTrees, r, m, seed, 155 out rmsError, out relClassificationError, out outOfBagRmsError, out outOfBagRelClassificationError); 155 156 return new RandomForestClassificationSolution(model, (IClassificationProblemData)problemData.Clone()); 156 157 } … … 158 159 public static RandomForestModel CreateRandomForestClassificationModel(IClassificationProblemData problemData, int nTrees, double r, double m, int seed, 159 160 out double rmsError, out double relClassificationError, out double outOfBagRmsError, out double outOfBagRelClassificationError) { 160 return RandomForestModel.CreateClassificationModel(problemData, nTrees, r, m, seed, out rmsError, out relClassificationError, out outOfBagRmsError, out outOfBagRelClassificationError); 161 return RandomForestModel.CreateClassificationModel(problemData, nTrees, r, m, seed, 162 rmsError: out rmsError, relClassificationError: out relClassificationError, outOfBagRmsError: out outOfBagRmsError, outOfBagRelClassificationError: out outOfBagRelClassificationError); 161 163 } 162 164 #endregion -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/RandomForest/RandomForestModel.cs
r14843 r15614 288 288 public static RandomForestModel CreateRegressionModel(IRegressionProblemData problemData, int nTrees, double r, double m, int seed, 289 289 out double rmsError, out double outOfBagRmsError, out double avgRelError, out double outOfBagAvgRelError) { 290 return CreateRegressionModel(problemData, problemData.TrainingIndices, nTrees, r, m, seed, out rmsError, out avgRelError, out outOfBagAvgRelError, out outOfBagRmsError); 290 return CreateRegressionModel(problemData, problemData.TrainingIndices, nTrees, r, m, seed, 291 rmsError: out rmsError, outOfBagRmsError: out outOfBagRmsError, avgRelError: out avgRelError, outOfBagAvgRelError: out outOfBagAvgRelError); 291 292 } 292 293 … … 300 301 301 302 rmsError = rep.rmserror; 303 outOfBagRmsError = rep.oobrmserror; 302 304 avgRelError = rep.avgrelerror; 303 305 outOfBagAvgRelError = rep.oobavgrelerror; 304 outOfBagRmsError = rep.oobrmserror;305 306 306 307 return new RandomForestModel(problemData.TargetVariable, dForest, seed, problemData, nTrees, r, m); … … 309 310 public static RandomForestModel CreateClassificationModel(IClassificationProblemData problemData, int nTrees, double r, double m, int seed, 310 311 out double rmsError, out double outOfBagRmsError, out double relClassificationError, out double outOfBagRelClassificationError) { 311 return CreateClassificationModel(problemData, problemData.TrainingIndices, nTrees, r, m, seed, out rmsError, out outOfBagRmsError, out relClassificationError, out outOfBagRelClassificationError); 312 return CreateClassificationModel(problemData, problemData.TrainingIndices, nTrees, r, m, seed, 313 out rmsError, out outOfBagRmsError, out relClassificationError, out outOfBagRelClassificationError); 312 314 } 313 315 -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/RandomForest/RandomForestRegression.cs
r14523 r15614 160 160 double r, double m, int seed, 161 161 out double rmsError, out double avgRelError, out double outOfBagRmsError, out double outOfBagAvgRelError) { 162 return RandomForestModel.CreateRegressionModel(problemData, nTrees, r, m, seed, out rmsError, out avgRelError, out outOfBagRmsError, out outOfBagAvgRelError); 162 return RandomForestModel.CreateRegressionModel(problemData, nTrees, r, m, seed, 163 rmsError: out rmsError, avgRelError: out avgRelError, outOfBagRmsError: out outOfBagRmsError, outOfBagAvgRelError: out outOfBagAvgRelError); 163 164 } 164 165 -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/Distances/CosineDistance.cs
r15234 r15614 22 22 using System; 23 23 using System.Collections.Generic; 24 using System.Linq;25 24 using HeuristicLab.Common; 26 25 using HeuristicLab.Core; … … 28 27 29 28 namespace HeuristicLab.Algorithms.DataAnalysis { 30 31 29 /// <summary> 32 30 /// The angular distance as defined as a normalized distance measure dependent on the angle between two vectors. … … 35 33 [Item("CosineDistance", "The angular distance as defined as a normalized distance measure dependent on the angle between two vectors.")] 36 34 public class CosineDistance : DistanceBase<IEnumerable<double>> { 37 38 35 #region HLConstructors & Cloning 39 36 [StorableConstructor] … … 48 45 49 46 #region statics 50 public static double GetDistance(IReadOnlyList<double> point1, IReadOnlyList<double> point2) { 51 if (point1.Count != point2.Count) throw new ArgumentException("Cosine distance not defined on vectors of different length"); 52 var innerprod = 0.0; 53 var length1 = 0.0; 54 var length2 = 0.0; 55 56 for (var i = 0; i < point1.Count; i++) { 57 double d1 = point1[i], d2 = point2[i]; 58 innerprod += d1 * d2; 59 length1 += d1 * d1; 60 length2 += d2 * d2; 47 public static double GetDistance(IEnumerable<double> point1, IEnumerable<double> point2) { 48 using (IEnumerator<double> p1Enum = point1.GetEnumerator(), p2Enum = point2.GetEnumerator()) { 49 var innerprod = 0.0; 50 var length1 = 0.0; 51 var length2 = 0.0; 52 while (p1Enum.MoveNext() & p2Enum.MoveNext()) { 53 double d1 = p1Enum.Current, d2 = p2Enum.Current; 54 innerprod += d1 * d2; 55 length1 += d1 * d1; 56 length2 += d2 * d2; 57 } 58 var divisor = Math.Sqrt(length1 * length2); 59 if (divisor.IsAlmost(0)) throw new ArgumentException("Cosine distance is not defined on vectors of length 0"); 60 if (p1Enum.MoveNext() || p2Enum.MoveNext()) throw new ArgumentException("Cosine distance not defined on vectors of different length"); 61 return 1 - innerprod / divisor; 61 62 } 62 var l = Math.Sqrt(length1 * length2);63 if (l.IsAlmost(0)) throw new 
ArgumentException("Cosine distance is not defined on vectors of length 0");64 return 1 - innerprod / l;65 63 } 66 64 #endregion 67 65 public override double Get(IEnumerable<double> a, IEnumerable<double> b) { 68 return GetDistance(a .ToArray(), b.ToArray());66 return GetDistance(a, b); 69 67 } 70 68 } -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/Distances/DistanceBase.cs
r15207 r15614 29 29 [StorableClass] 30 30 public abstract class DistanceBase<T> : Item, IDistance<T> { 31 32 31 #region HLConstructors & Cloning 33 32 [StorableConstructor] … … 44 43 45 44 public double Get(object x, object y) { 46 return Get((T) x, (T)y);45 return Get((T) x, (T) y); 47 46 } 48 47 49 48 public IComparer GetDistanceComparer(object item) { 50 return new DistanceComparer((T) item, this);49 return new DistanceComparer((T) item, this); 51 50 } 52 51 53 privateclass DistanceComparer : IComparer<T>, IComparer {52 internal class DistanceComparer : IComparer<T>, IComparer { 54 53 private readonly T item; 55 54 private readonly IDistance<T> dist; … … 65 64 66 65 public int Compare(object x, object y) { 67 return Compare((T) x, (T)y);66 return Compare((T) x, (T) y); 68 67 } 69 68 } -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/Distances/EuclideanDistance.cs
r15207 r15614 31 31 [Item("EuclideanDistance", "A norm function that uses Euclidean distance")] 32 32 public class EuclideanDistance : DistanceBase<IEnumerable<double>> { 33 34 33 #region HLConstructors & Cloning 35 34 [StorableConstructor] 36 35 protected EuclideanDistance(bool deserializing) : base(deserializing) { } 37 36 protected EuclideanDistance(EuclideanDistance original, Cloner cloner) : base(original, cloner) { } 38 public override IDeepCloneable Clone(Cloner cloner) { return new EuclideanDistance(this, cloner); } 37 public override IDeepCloneable Clone(Cloner cloner) { 38 return new EuclideanDistance(this, cloner); 39 } 39 40 public EuclideanDistance() { } 40 41 #endregion 41 42 42 public static double GetDistance(IReadOnlyList<double> point1, IReadOnlyList<double> point2) { 43 if (point1.Count != point2.Count) throw new ArgumentException("Euclidean distance not defined on vectors of different length"); 44 var sum = 0.0; 45 for (var i = 0; i < point1.Count; i++) { 46 var d = point1[i] - point2[i]; 47 sum += d * d; 43 public static double GetDistance(IEnumerable<double> point1, IEnumerable<double> point2) { 44 using (IEnumerator<double> p1Enum = point1.GetEnumerator(), p2Enum = point2.GetEnumerator()) { 45 var sum = 0.0; 46 while (p1Enum.MoveNext() & p2Enum.MoveNext()) { 47 var d = p1Enum.Current - p2Enum.Current; 48 sum += d * d; 49 } 50 if (p1Enum.MoveNext() || p2Enum.MoveNext()) throw new ArgumentException("Euclidean distance not defined on vectors of different length"); 51 return Math.Sqrt(sum); 48 52 } 49 50 return Math.Sqrt(sum);51 53 } 52 54 53 55 public override double Get(IEnumerable<double> a, IEnumerable<double> b) { 54 return GetDistance(a .ToArray(), b.ToArray());56 return GetDistance(a, b); 55 57 } 56 58 } -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/Distances/ManhattanDistance.cs
r15207 r15614 31 31 [Item("ManhattanDistance", "A distance function that uses block distance")] 32 32 public class ManhattanDistance : DistanceBase<IEnumerable<double>> { 33 34 33 #region HLConstructors & Cloning 35 34 [StorableConstructor] … … 45 44 #endregion 46 45 47 public static double GetDistance(double[] point1, double[] point2) { 48 if (point1.Length != point2.Length) throw new ArgumentException("Manhattan distance not defined on vectors of different length"); 49 var sum = 0.0; 50 for (var i = 0; i < point1.Length; i++) 51 sum += Math.Abs(point1[i] + point2[i]); 52 return sum; 46 public static double GetDistance(IEnumerable<double> point1, IEnumerable<double> point2) { 47 using (IEnumerator<double> p1Enum = point1.GetEnumerator(), p2Enum = point2.GetEnumerator()) { 48 var sum = 0.0; 49 while (p1Enum.MoveNext() & p2Enum.MoveNext()) 50 sum += Math.Abs(p1Enum.Current - p2Enum.Current); 51 if (p1Enum.MoveNext() || p2Enum.MoveNext()) throw new ArgumentException("Manhattan distance not defined on vectors of different length"); 52 return sum; 53 } 53 54 } 54 55 55 56 public override double Get(IEnumerable<double> a, IEnumerable<double> b) { 56 return GetDistance(a .ToArray(), b.ToArray());57 return GetDistance(a, b); 57 58 } 58 59 } -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/TSNEAlgorithm.cs
r15428 r15614 38 38 namespace HeuristicLab.Algorithms.DataAnalysis { 39 39 /// <summary> 40 /// t- distributed stochastic neighbourhood embedding (tSNE) projects the data in a low dimensional40 /// t-Distributed Stochastic Neighbor Embedding (tSNE) projects the data in a low dimensional 41 41 /// space to allow visual cluster identification. 42 42 /// </summary> 43 [Item("t SNE", "t-distributed stochastic neighbourhood embedding projects the data in a low " +44 "dimensional space to allow visual cluster identification. Implemented similar to: https://lvdmaaten.github.io/tsne/#implementations (Barnes-Hut t-SNE). Described in : https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf")]43 [Item("t-Distributed Stochastic Neighbor Embedding (tSNE)", "t-Distributed Stochastic Neighbor Embedding projects the data in a low " + 44 "dimensional space to allow visual cluster identification. Implemented similar to: https://lvdmaaten.github.io/tsne/#implementations (Barnes-Hut t-SNE). Described in : https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf")] 45 45 [Creatable(CreatableAttribute.Categories.DataAnalysis, Priority = 100)] 46 46 [StorableClass] … … 57 57 } 58 58 59 #region parameter names59 #region Parameter names 60 60 private const string DistanceFunctionParameterName = "DistanceFunction"; 61 61 private const string PerplexityParameterName = "Perplexity"; … … 72 72 private const string ClassesNameParameterName = "ClassesName"; 73 73 private const string NormalizationParameterName = "Normalization"; 74 private const string RandomInitializationParameterName = "RandomInitialization"; 74 75 private const string UpdateIntervalParameterName = "UpdateInterval"; 75 76 #endregion 76 77 77 #region result names78 #region Result names 78 79 private const string IterationResultName = "Iteration"; 79 80 private const string ErrorResultName = "Error"; … … 83 84 #endregion 84 85 85 #region parameter properties86 #region Parameter properties 86 87 public 
IFixedValueParameter<DoubleValue> PerplexityParameter { 87 get { return Parameters[PerplexityParameterName] as IFixedValueParameter<DoubleValue>; }88 get { return (IFixedValueParameter<DoubleValue>)Parameters[PerplexityParameterName]; } 88 89 } 89 90 public IFixedValueParameter<PercentValue> ThetaParameter { 90 get { return Parameters[ThetaParameterName] as IFixedValueParameter<PercentValue>; }91 get { return (IFixedValueParameter<PercentValue>)Parameters[ThetaParameterName]; } 91 92 } 92 93 public IFixedValueParameter<IntValue> NewDimensionsParameter { 93 get { return Parameters[NewDimensionsParameterName] as IFixedValueParameter<IntValue>; }94 get { return (IFixedValueParameter<IntValue>)Parameters[NewDimensionsParameterName]; } 94 95 } 95 96 public IConstrainedValueParameter<IDistance<double[]>> DistanceFunctionParameter { 96 get { return Parameters[DistanceFunctionParameterName] as IConstrainedValueParameter<IDistance<double[]>>; }97 get { return (IConstrainedValueParameter<IDistance<double[]>>)Parameters[DistanceFunctionParameterName]; } 97 98 } 98 99 public IFixedValueParameter<IntValue> MaxIterationsParameter { 99 get { return Parameters[MaxIterationsParameterName] as IFixedValueParameter<IntValue>; }100 get { return (IFixedValueParameter<IntValue>)Parameters[MaxIterationsParameterName]; } 100 101 } 101 102 public IFixedValueParameter<IntValue> StopLyingIterationParameter { 102 get { return Parameters[StopLyingIterationParameterName] as IFixedValueParameter<IntValue>; }103 get { return (IFixedValueParameter<IntValue>)Parameters[StopLyingIterationParameterName]; } 103 104 } 104 105 public IFixedValueParameter<IntValue> MomentumSwitchIterationParameter { 105 get { return Parameters[MomentumSwitchIterationParameterName] as IFixedValueParameter<IntValue>; }106 get { return (IFixedValueParameter<IntValue>)Parameters[MomentumSwitchIterationParameterName]; } 106 107 } 107 108 public IFixedValueParameter<DoubleValue> InitialMomentumParameter { 108 get { return 
Parameters[InitialMomentumParameterName] as IFixedValueParameter<DoubleValue>; }109 get { return (IFixedValueParameter<DoubleValue>)Parameters[InitialMomentumParameterName]; } 109 110 } 110 111 public IFixedValueParameter<DoubleValue> FinalMomentumParameter { 111 get { return Parameters[FinalMomentumParameterName] as IFixedValueParameter<DoubleValue>; }112 get { return (IFixedValueParameter<DoubleValue>)Parameters[FinalMomentumParameterName]; } 112 113 } 113 114 public IFixedValueParameter<DoubleValue> EtaParameter { 114 get { return Parameters[EtaParameterName] as IFixedValueParameter<DoubleValue>; }115 get { return (IFixedValueParameter<DoubleValue>)Parameters[EtaParameterName]; } 115 116 } 116 117 public IFixedValueParameter<BoolValue> SetSeedRandomlyParameter { 117 get { return Parameters[SetSeedRandomlyParameterName] as IFixedValueParameter<BoolValue>; }118 get { return (IFixedValueParameter<BoolValue>)Parameters[SetSeedRandomlyParameterName]; } 118 119 } 119 120 public IFixedValueParameter<IntValue> SeedParameter { 120 get { return Parameters[SeedParameterName] as IFixedValueParameter<IntValue>; }121 get { return (IFixedValueParameter<IntValue>)Parameters[SeedParameterName]; } 121 122 } 122 123 public IConstrainedValueParameter<StringValue> ClassesNameParameter { 123 get { return Parameters[ClassesNameParameterName] as IConstrainedValueParameter<StringValue>; }124 get { return (IConstrainedValueParameter<StringValue>)Parameters[ClassesNameParameterName]; } 124 125 } 125 126 public IFixedValueParameter<BoolValue> NormalizationParameter { 126 get { return Parameters[NormalizationParameterName] as IFixedValueParameter<BoolValue>; } 127 get { return (IFixedValueParameter<BoolValue>)Parameters[NormalizationParameterName]; } 128 } 129 public IFixedValueParameter<BoolValue> RandomInitializationParameter { 130 get { return (IFixedValueParameter<BoolValue>)Parameters[RandomInitializationParameterName]; } 127 131 } 128 132 public IFixedValueParameter<IntValue> 
UpdateIntervalParameter { 129 get { return Parameters[UpdateIntervalParameterName] as IFixedValueParameter<IntValue>; }133 get { return (IFixedValueParameter<IntValue>)Parameters[UpdateIntervalParameterName]; } 130 134 } 131 135 #endregion … … 187 191 set { NormalizationParameter.Value.Value = value; } 188 192 } 189 193 public bool RandomInitialization { 194 get { return RandomInitializationParameter.Value.Value; } 195 set { RandomInitializationParameter.Value.Value = value; } 196 } 190 197 public int UpdateInterval { 191 198 get { return UpdateIntervalParameter.Value.Value; } … … 194 201 #endregion 195 202 203 #region Storable poperties 204 [Storable] 205 private Dictionary<string, IList<int>> dataRowIndices; 206 [Storable] 207 private TSNEStatic<double[]>.TSNEState state; 208 #endregion 209 196 210 #region Constructors & Cloning 197 211 [StorableConstructor] 198 212 private TSNEAlgorithm(bool deserializing) : base(deserializing) { } 199 213 214 [StorableHook(HookType.AfterDeserialization)] 215 private void AfterDeserialization() { 216 if (!Parameters.ContainsKey(RandomInitializationParameterName)) 217 Parameters.Add(new FixedValueParameter<BoolValue>(RandomInitializationParameterName, "Wether data points should be randomly initialized or according to the first 2 dimensions", new BoolValue(true))); 218 RegisterParameterEvents(); 219 } 200 220 private TSNEAlgorithm(TSNEAlgorithm original, Cloner cloner) : base(original, cloner) { 201 if (original.dataRowNames != null) 202 this.dataRowNames = new Dictionary<string, List<int>>(original.dataRowNames); 203 if (original.dataRows != null) 204 this.dataRows = original.dataRows.ToDictionary(kvp => kvp.Key, kvp => cloner.Clone(kvp.Value)); 221 if (original.dataRowIndices != null) 222 dataRowIndices = new Dictionary<string, IList<int>>(original.dataRowIndices); 205 223 if (original.state != null) 206 this.state = cloner.Clone(original.state); 207 this.iter = original.iter; 208 } 209 public override IDeepCloneable 
Clone(Cloner cloner) { return new TSNEAlgorithm(this, cloner); } 224 state = cloner.Clone(original.state); 225 RegisterParameterEvents(); 226 } 227 public override IDeepCloneable Clone(Cloner cloner) { 228 return new TSNEAlgorithm(this, cloner); 229 } 210 230 public TSNEAlgorithm() { 211 231 var distances = new ItemSet<IDistance<double[]>>(ApplicationManager.Manager.GetInstances<IDistance<double[]>>()); … … 213 233 Parameters.Add(new FixedValueParameter<DoubleValue>(PerplexityParameterName, "Perplexity-parameter of tSNE. Comparable to k in a k-nearest neighbour algorithm. Recommended value is floor(number of points /3) or lower", new DoubleValue(25))); 214 234 Parameters.Add(new FixedValueParameter<PercentValue>(ThetaParameterName, "Value describing how much appoximated " + 215 "gradients my differ from exact gradients. Set to 0 for exact calculation and in [0,1] otherwise. " +216 "Appropriate values for theta are between 0.1 and 0.7 (default = 0.5). CAUTION: exact calculation of " +217 "forces requires building a non-sparse N*N matrix where N is the number of data points. This may " +218 "exceed memory limitations. The function is designed to run on large (N > 5000) data sets. It may give" +219 " poor performance on very small data sets(it is better to use a standard t - SNE implementation on such data).", new PercentValue(0)));235 "gradients my differ from exact gradients. Set to 0 for exact calculation and in [0,1] otherwise. " + 236 "Appropriate values for theta are between 0.1 and 0.7 (default = 0.5). CAUTION: exact calculation of " + 237 "forces requires building a non-sparse N*N matrix where N is the number of data points. This may " + 238 "exceed memory limitations. The function is designed to run on large (N > 5000) data sets. 
It may give" + 239 " poor performance on very small data sets(it is better to use a standard t - SNE implementation on such data).", new PercentValue(0))); 220 240 Parameters.Add(new FixedValueParameter<IntValue>(NewDimensionsParameterName, "Dimensionality of projected space (usually 2 for easy visual analysis)", new IntValue(2))); 221 241 Parameters.Add(new FixedValueParameter<IntValue>(MaxIterationsParameterName, "Maximum number of iterations for gradient descent.", new IntValue(1000))); … … 230 250 Parameters.Add(new FixedValueParameter<BoolValue>(NormalizationParameterName, "Whether the data should be zero centered and have variance of 1 for each variable, so different scalings are ignored.", new BoolValue(true))); 231 251 Parameters.Add(new FixedValueParameter<IntValue>(UpdateIntervalParameterName, "The interval after which the results will be updated.", new IntValue(50))); 232 Parameters[UpdateIntervalParameterName].Hidden = true; 233 252 Parameters.Add(new FixedValueParameter<BoolValue>(RandomInitializationParameterName, "Wether data points should be randomly initialized or according to the first 2 dimensions", new BoolValue(true))); 253 254 UpdateIntervalParameter.Hidden = true; 234 255 MomentumSwitchIterationParameter.Hidden = true; 235 256 InitialMomentumParameter.Hidden = true; … … 238 259 EtaParameter.Hidden = false; 239 260 Problem = new RegressionProblem(); 240 } 241 #endregion 242 243 [Storable] 244 private Dictionary<string, List<int>> dataRowNames; 245 [Storable] 246 private Dictionary<string, ScatterPlotDataRow> dataRows; 247 [Storable] 248 private TSNEStatic<double[]>.TSNEState state; 249 [Storable] 250 private int iter; 261 RegisterParameterEvents(); 262 } 263 #endregion 251 264 252 265 public override void Prepare() { 253 266 base.Prepare(); 254 dataRowNames = null; 255 dataRows = null; 267 dataRowIndices = null; 256 268 state = null; 257 269 } … … 259 271 protected override void Run(CancellationToken cancellationToken) { 260 272 var 
problemData = Problem.ProblemData; 261 // set up and initialized everything if necessary 273 // set up and initialize everything if necessary 274 var wdist = DistanceFunction as WeightedEuclideanDistance; 275 if (wdist != null) wdist.Initialize(problemData); 262 276 if (state == null) { 263 277 if (SetSeedRandomly) Seed = new System.Random().Next(); … … 265 279 var dataset = problemData.Dataset; 266 280 var allowedInputVariables = problemData.AllowedInputVariables.ToArray(); 267 var data = new double[dataset.Rows][]; 268 for (var row = 0; row < dataset.Rows; row++) 269 data[row] = allowedInputVariables.Select(col => dataset.GetDoubleValue(col, row)).ToArray(); 270 271 if (Normalization) data = NormalizeData(data); 272 273 state = TSNEStatic<double[]>.CreateState(data, DistanceFunction, random, NewDimensions, Perplexity, Theta, 274 StopLyingIteration, MomentumSwitchIteration, InitialMomentum, FinalMomentum, Eta); 275 276 SetUpResults(data); 277 iter = 0; 278 } 279 for (; iter < MaxIterations && !cancellationToken.IsCancellationRequested; iter++) { 280 if (iter % UpdateInterval == 0) 281 Analyze(state); 281 var allindices = Problem.ProblemData.AllIndices.ToArray(); 282 283 // jagged array is required to meet the static method declarations of TSNEStatic<T> 284 var data = Enumerable.Range(0, dataset.Rows).Select(x => new double[allowedInputVariables.Length]).ToArray(); 285 var col = 0; 286 foreach (var s in allowedInputVariables) { 287 var row = 0; 288 foreach (var d in dataset.GetDoubleValues(s)) { 289 data[row][col] = d; 290 row++; 291 } 292 col++; 293 } 294 if (Normalization) data = NormalizeInputData(data); 295 state = TSNEStatic<double[]>.CreateState(data, DistanceFunction, random, NewDimensions, Perplexity, Theta, StopLyingIteration, MomentumSwitchIteration, InitialMomentum, FinalMomentum, Eta, RandomInitialization); 296 SetUpResults(allindices); 297 } 298 while (state.iter < MaxIterations && !cancellationToken.IsCancellationRequested) { 299 if (state.iter % 
UpdateInterval == 0) Analyze(state); 282 300 TSNEStatic<double[]>.Iterate(state); 283 301 } … … 294 312 protected override void RegisterProblemEvents() { 295 313 base.RegisterProblemEvents(); 314 if (Problem == null) return; 296 315 Problem.ProblemDataChanged += OnProblemDataChanged; 297 } 316 if (Problem.ProblemData == null) return; 317 Problem.ProblemData.Changed += OnPerplexityChanged; 318 Problem.ProblemData.Changed += OnColumnsChanged; 319 if (Problem.ProblemData.Dataset == null) return; 320 Problem.ProblemData.Dataset.RowsChanged += OnPerplexityChanged; 321 Problem.ProblemData.Dataset.ColumnsChanged += OnColumnsChanged; 322 } 323 298 324 protected override void DeregisterProblemEvents() { 299 325 base.DeregisterProblemEvents(); 326 if (Problem == null) return; 300 327 Problem.ProblemDataChanged -= OnProblemDataChanged; 328 if (Problem.ProblemData == null) return; 329 Problem.ProblemData.Changed -= OnPerplexityChanged; 330 Problem.ProblemData.Changed -= OnColumnsChanged; 331 if (Problem.ProblemData.Dataset == null) return; 332 Problem.ProblemData.Dataset.RowsChanged -= OnPerplexityChanged; 333 Problem.ProblemData.Dataset.ColumnsChanged -= OnColumnsChanged; 334 } 335 336 protected override void OnStopped() { 337 base.OnStopped(); 338 //bwerth: state objects can be very large; avoid state serialization 339 state = null; 340 dataRowIndices = null; 301 341 } 302 342 303 343 private void OnProblemDataChanged(object sender, EventArgs args) { 304 344 if (Problem == null || Problem.ProblemData == null) return; 345 OnPerplexityChanged(this, null); 346 OnColumnsChanged(this, null); 347 Problem.ProblemData.Changed += OnPerplexityChanged; 348 Problem.ProblemData.Changed += OnColumnsChanged; 349 if (Problem.ProblemData.Dataset == null) return; 350 Problem.ProblemData.Dataset.RowsChanged += OnPerplexityChanged; 351 Problem.ProblemData.Dataset.ColumnsChanged += OnColumnsChanged; 305 352 if (!Parameters.ContainsKey(ClassesNameParameterName)) return; 306 353 
ClassesNameParameter.ValidValues.Clear(); … … 308 355 } 309 356 357 private void OnColumnsChanged(object sender, EventArgs e) { 358 if (Problem == null || Problem.ProblemData == null || Problem.ProblemData.Dataset == null || !Parameters.ContainsKey(DistanceFunctionParameterName)) return; 359 DistanceFunctionParameter.ValidValues.OfType<WeightedEuclideanDistance>().Single().AdaptToProblemData(Problem.ProblemData); 360 } 361 362 private void RegisterParameterEvents() { 363 PerplexityParameter.Value.ValueChanged += OnPerplexityChanged; 364 } 365 366 private void OnPerplexityChanged(object sender, EventArgs e) { 367 if (Problem == null || Problem.ProblemData == null || Problem.ProblemData.Dataset == null || !Parameters.ContainsKey(PerplexityParameterName)) return; 368 PerplexityParameter.Value.Value = Math.Max(1, Math.Min((Problem.ProblemData.Dataset.Rows - 1) / 3.0, Perplexity)); 369 } 310 370 #endregion 311 371 312 372 #region Helpers 313 private void SetUpResults(IReadOnly Collection<double[]> data) {373 private void SetUpResults(IReadOnlyList<int> allIndices) { 314 374 if (Results == null) return; 315 375 var results = Results; 316 dataRowNames = new Dictionary<string, List<int>>(); 317 dataRows = new Dictionary<string, ScatterPlotDataRow>(); 376 dataRowIndices = new Dictionary<string, IList<int>>(); 318 377 var problemData = Problem.ProblemData; 319 378 320 //color datapoints acording to classes variable (be it double or string) 321 if (problemData.Dataset.VariableNames.Contains(ClassesName)) { 322 if ((problemData.Dataset as Dataset).VariableHasType<string>(ClassesName)) { 323 var classes = problemData.Dataset.GetStringValues(ClassesName).ToArray(); 324 for (var i = 0; i < classes.Length; i++) { 325 if (!dataRowNames.ContainsKey(classes[i])) dataRowNames.Add(classes[i], new List<int>()); 326 dataRowNames[classes[i]].Add(i); 379 if (!results.ContainsKey(IterationResultName)) results.Add(new Result(IterationResultName, new IntValue(0))); 380 if 
(!results.ContainsKey(ErrorResultName)) results.Add(new Result(ErrorResultName, new DoubleValue(0))); 381 if (!results.ContainsKey(ScatterPlotResultName)) results.Add(new Result(ScatterPlotResultName, "Plot of the projected data", new ScatterPlot(DataResultName, ""))); 382 if (!results.ContainsKey(DataResultName)) results.Add(new Result(DataResultName, "Projected Data", new DoubleMatrix())); 383 if (!results.ContainsKey(ErrorPlotResultName)) { 384 var errortable = new DataTable(ErrorPlotResultName, "Development of errors during gradient descent") { 385 VisualProperties = { 386 XAxisTitle = "UpdateIntervall", 387 YAxisTitle = "Error", 388 YAxisLogScale = true 327 389 } 328 } else if ((problemData.Dataset as Dataset).VariableHasType<double>(ClassesName)) { 329 var classValues = problemData.Dataset.GetDoubleValues(ClassesName).ToArray(); 330 var max = classValues.Max() + 0.1; 331 var min = classValues.Min() - 0.1; 332 const int contours = 8; 333 for (var i = 0; i < contours; i++) { 334 var contourname = GetContourName(i, min, max, contours); 335 dataRowNames.Add(contourname, new List<int>()); 336 dataRows.Add(contourname, new ScatterPlotDataRow(contourname, "", new List<Point2D<double>>())); 337 dataRows[contourname].VisualProperties.Color = GetHeatMapColor(i, contours); 338 dataRows[contourname].VisualProperties.PointSize = i + 3; 339 } 340 for (var i = 0; i < classValues.Length; i++) { 341 dataRowNames[GetContourName(classValues[i], min, max, contours)].Add(i); 342 } 343 } 390 }; 391 errortable.Rows.Add(new DataRow("Errors")); 392 errortable.Rows["Errors"].VisualProperties.StartIndexZero = true; 393 results.Add(new Result(ErrorPlotResultName, errortable)); 394 } 395 396 //color datapoints acording to classes variable (be it double, datetime or string) 397 if (!problemData.Dataset.VariableNames.Contains(ClassesName)) { 398 dataRowIndices.Add("Training", problemData.TrainingIndices.ToList()); 399 dataRowIndices.Add("Test", problemData.TestIndices.ToList()); 400 
return; 401 } 402 403 var classificationData = problemData as ClassificationProblemData; 404 if (classificationData != null && classificationData.TargetVariable.Equals(ClassesName)) { 405 var classNames = classificationData.ClassValues.Zip(classificationData.ClassNames, (v, n) => new {v, n}).ToDictionary(x => x.v, x => x.n); 406 var classes = classificationData.Dataset.GetDoubleValues(classificationData.TargetVariable, allIndices).Select(v => classNames[v]).ToArray(); 407 for (var i = 0; i < classes.Length; i++) { 408 if (!dataRowIndices.ContainsKey(classes[i])) dataRowIndices.Add(classes[i], new List<int>()); 409 dataRowIndices[classes[i]].Add(i); 410 } 411 } else if (((Dataset)problemData.Dataset).VariableHasType<string>(ClassesName)) { 412 var classes = problemData.Dataset.GetStringValues(ClassesName, allIndices).ToArray(); 413 for (var i = 0; i < classes.Length; i++) { 414 if (!dataRowIndices.ContainsKey(classes[i])) dataRowIndices.Add(classes[i], new List<int>()); 415 dataRowIndices[classes[i]].Add(i); 416 } 417 } else if (((Dataset)problemData.Dataset).VariableHasType<double>(ClassesName)) { 418 var clusterdata = new Dataset(problemData.Dataset.DoubleVariables, problemData.Dataset.DoubleVariables.Select(v => problemData.Dataset.GetDoubleValues(v, allIndices).ToList())); 419 const int contours = 8; 420 Dictionary<int, string> contourMap; 421 IClusteringModel clusterModel; 422 double[][] borders; 423 CreateClusters(clusterdata, ClassesName, contours, out clusterModel, out contourMap, out borders); 424 var contourorder = borders.Select((x, i) => new {x, i}).OrderBy(x => x.x[0]).Select(x => x.i).ToArray(); 425 for (var i = 0; i < contours; i++) { 426 var c = contourorder[i]; 427 var contourname = contourMap[c]; 428 dataRowIndices.Add(contourname, new List<int>()); 429 var row = new ScatterPlotDataRow(contourname, "", new List<Point2D<double>>()) {VisualProperties = {Color = GetHeatMapColor(i, contours), PointSize = 8}}; 430 
((ScatterPlot)results[ScatterPlotResultName].Value).Rows.Add(row); 431 } 432 var allClusters = clusterModel.GetClusterValues(clusterdata, Enumerable.Range(0, clusterdata.Rows)).ToArray(); 433 for (var i = 0; i < clusterdata.Rows; i++) dataRowIndices[contourMap[allClusters[i] - 1]].Add(i); 434 } else if (((Dataset)problemData.Dataset).VariableHasType<DateTime>(ClassesName)) { 435 var clusterdata = new Dataset(problemData.Dataset.DateTimeVariables, problemData.Dataset.DateTimeVariables.Select(v => problemData.Dataset.GetDoubleValues(v, allIndices).ToList())); 436 const int contours = 8; 437 Dictionary<int, string> contourMap; 438 IClusteringModel clusterModel; 439 double[][] borders; 440 CreateClusters(clusterdata, ClassesName, contours, out clusterModel, out contourMap, out borders); 441 var contourorder = borders.Select((x, i) => new {x, i}).OrderBy(x => x.x[0]).Select(x => x.i).ToArray(); 442 for (var i = 0; i < contours; i++) { 443 var c = contourorder[i]; 444 var contourname = contourMap[c]; 445 dataRowIndices.Add(contourname, new List<int>()); 446 var row = new ScatterPlotDataRow(contourname, "", new List<Point2D<double>>()) {VisualProperties = {Color = GetHeatMapColor(i, contours), PointSize = 8}}; 447 row.VisualProperties.PointSize = 8; 448 ((ScatterPlot)results[ScatterPlotResultName].Value).Rows.Add(row); 449 } 450 var allClusters = clusterModel.GetClusterValues(clusterdata, Enumerable.Range(0, clusterdata.Rows)).ToArray(); 451 for (var i = 0; i < clusterdata.Rows; i++) dataRowIndices[contourMap[allClusters[i] - 1]].Add(i); 344 452 } else { 345 dataRowNames.Add("Training", problemData.TrainingIndices.ToList()); 346 dataRowNames.Add("Test", problemData.TestIndices.ToList()); 347 } 348 349 if (!results.ContainsKey(IterationResultName)) results.Add(new Result(IterationResultName, new IntValue(0))); 350 else ((IntValue)results[IterationResultName].Value).Value = 0; 351 352 if (!results.ContainsKey(ErrorResultName)) results.Add(new Result(ErrorResultName, new 
DoubleValue(0))); 353 else ((DoubleValue)results[ErrorResultName].Value).Value = 0; 354 355 if (!results.ContainsKey(ErrorPlotResultName)) results.Add(new Result(ErrorPlotResultName, new DataTable(ErrorPlotResultName, "Development of errors during gradient descent"))); 356 else results[ErrorPlotResultName].Value = new DataTable(ErrorPlotResultName, "Development of errors during gradient descent"); 357 358 var plot = results[ErrorPlotResultName].Value as DataTable; 359 if (plot == null) throw new ArgumentException("could not create/access error data table in results collection"); 360 361 if (!plot.Rows.ContainsKey("errors")) plot.Rows.Add(new DataRow("errors")); 362 plot.Rows["errors"].Values.Clear(); 363 plot.Rows["errors"].VisualProperties.StartIndexZero = true; 364 365 results.Add(new Result(ScatterPlotResultName, "Plot of the projected data", new ScatterPlot(DataResultName, ""))); 366 results.Add(new Result(DataResultName, "Projected Data", new DoubleMatrix())); 453 dataRowIndices.Add("Training", problemData.TrainingIndices.ToList()); 454 dataRowIndices.Add("Test", problemData.TestIndices.ToList()); 455 } 367 456 } 368 457 … … 372 461 var plot = results[ErrorPlotResultName].Value as DataTable; 373 462 if (plot == null) throw new ArgumentException("Could not create/access error data table in results collection."); 374 var errors = plot.Rows[" errors"].Values;463 var errors = plot.Rows["Errors"].Values; 375 464 var c = tsneState.EvaluateError(); 376 465 errors.Add(c); … … 378 467 ((DoubleValue)results[ErrorResultName].Value).Value = errors.Last(); 379 468 380 var ndata = Normalize (tsneState.newData);469 var ndata = NormalizeProjectedData(tsneState.newData); 381 470 results[DataResultName].Value = new DoubleMatrix(ndata); 382 471 var splot = results[ScatterPlotResultName].Value as ScatterPlot; 383 FillScatterPlot(ndata, splot); 384 } 385 386 private void FillScatterPlot(double[,] lowDimData, ScatterPlot plot) { 387 foreach (var rowName in dataRowNames.Keys) { 388 
if (!plot.Rows.ContainsKey(rowName)) 389 plot.Rows.Add(dataRows.ContainsKey(rowName) ? dataRows[rowName] : new ScatterPlotDataRow(rowName, "", new List<Point2D<double>>())); 390 plot.Rows[rowName].Points.Replace(dataRowNames[rowName].Select(i => new Point2D<double>(lowDimData[i, 0], lowDimData[i, 1]))); 391 } 392 } 393 394 private static double[,] Normalize(double[,] data) { 472 FillScatterPlot(ndata, splot, dataRowIndices); 473 } 474 475 private static void FillScatterPlot(double[,] lowDimData, ScatterPlot plot, Dictionary<string, IList<int>> dataRowIndices) { 476 foreach (var rowName in dataRowIndices.Keys) { 477 if (!plot.Rows.ContainsKey(rowName)) { 478 plot.Rows.Add(new ScatterPlotDataRow(rowName, "", new List<Point2D<double>>())); 479 plot.Rows[rowName].VisualProperties.PointSize = 8; 480 } 481 plot.Rows[rowName].Points.Replace(dataRowIndices[rowName].Select(i => new Point2D<double>(lowDimData[i, 0], lowDimData[i, 1]))); 482 } 483 } 484 485 private static double[,] NormalizeProjectedData(double[,] data) { 395 486 var max = new double[data.GetLength(1)]; 396 487 var min = new double[data.GetLength(1)]; … … 398 489 for (var i = 0; i < max.Length; i++) max[i] = min[i] = data[0, i]; 399 490 for (var i = 0; i < data.GetLength(0); i++) 400 401 402 403 404 491 for (var j = 0; j < data.GetLength(1); j++) { 492 var v = data[i, j]; 493 max[j] = Math.Max(max[j], v); 494 min[j] = Math.Min(min[j], v); 495 } 405 496 for (var i = 0; i < data.GetLength(0); i++) { 406 497 for (var j = 0; j < data.GetLength(1); j++) { 407 498 var d = max[j] - min[j]; 408 var s = data[i, j] - (max[j] + min[j]) / 2; 409 if (d.IsAlmost(0)) res[i, j] = data[i, j]; 410 else res[i, j] = s / d; 499 var s = data[i, j] - (max[j] + min[j]) / 2; //shift data 500 if (d.IsAlmost(0)) res[i, j] = data[i, j]; //no scaling possible 501 else res[i, j] = s / d; //scale data 411 502 } 412 503 } … … 414 505 } 415 506 416 private static double[][] Normalize Data(IReadOnlyList<double[]> data) {507 private static 
double[][] NormalizeInputData(IReadOnlyList<IReadOnlyList<double>> data) { 417 508 // as in tSNE implementation by van der Maaten 418 var n = data[0]. Length;509 var n = data[0].Count; 419 510 var mean = new double[n]; 420 511 var max = new double[n]; … … 426 517 for (var i = 0; i < data.Count; i++) { 427 518 nData[i] = new double[n]; 428 for (var j = 0; j < n; j++) nData[i][j] = max[j].IsAlmost(0) ? data[i][j] - mean[j] : (data[i][j] - mean[j]) / max[j]; 519 for (var j = 0; j < n; j++) 520 nData[i][j] = max[j].IsAlmost(0) ? data[i][j] - mean[j] : (data[i][j] - mean[j]) / max[j]; 429 521 } 430 522 return nData; … … 432 524 433 525 private static Color GetHeatMapColor(int contourNr, int noContours) { 434 var q = (double)contourNr / noContours; // q in [0,1] 435 var c = q < 0.5 ? Color.FromArgb((int)(q * 2 * 255), 255, 0) : Color.FromArgb(255, (int)((1 - q) * 2 * 255), 0); 436 return c; 437 } 438 439 private static string GetContourName(double value, double min, double max, int noContours) { 440 var size = (max - min) / noContours; 441 var contourNr = (int)((value - min) / size); 442 return GetContourName(contourNr, min, max, noContours); 443 } 444 445 private static string GetContourName(int i, double min, double max, int noContours) { 446 var size = (max - min) / noContours; 447 return "[" + (min + i * size) + ";" + (min + (i + 1) * size) + ")"; 526 return ConvertTotalToRgb(0, noContours, contourNr); 527 } 528 529 private static void CreateClusters(IDataset data, string target, int contours, out IClusteringModel contourCluster, out Dictionary<int, string> contourNames, out double[][] borders) { 530 var cpd = new ClusteringProblemData((Dataset)data, new[] {target}); 531 contourCluster = KMeansClustering.CreateKMeansSolution(cpd, contours, 3).Model; 532 533 borders = Enumerable.Range(0, contours).Select(x => new[] {double.MaxValue, double.MinValue}).ToArray(); 534 var clusters = contourCluster.GetClusterValues(cpd.Dataset, cpd.AllIndices).ToArray(); 535 var 
targetvalues = cpd.Dataset.GetDoubleValues(target).ToArray(); 536 foreach (var i in cpd.AllIndices) { 537 var cl = clusters[i] - 1; 538 var clv = targetvalues[i]; 539 if (borders[cl][0] > clv) borders[cl][0] = clv; 540 if (borders[cl][1] < clv) borders[cl][1] = clv; 541 } 542 543 contourNames = new Dictionary<int, string>(); 544 for (var i = 0; i < contours; i++) 545 contourNames.Add(i, "[" + borders[i][0] + ";" + borders[i][1] + "]"); 546 } 547 548 private static Color ConvertTotalToRgb(double low, double high, double cell) { 549 var colorGradient = ColorGradient.Colors; 550 var range = high - low; 551 var h = Math.Min(cell / range * colorGradient.Count, colorGradient.Count - 1); 552 return colorGradient[(int)h]; 448 553 } 449 554 #endregion -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/TSNEStatic.cs
r15207 r15614 65 65 [StorableClass] 66 66 public class TSNEStatic<T> { 67 68 67 [StorableClass] 69 68 public sealed class TSNEState : DeepCloneable { … … 170 169 [StorableConstructor] 171 170 public TSNEState(bool deserializing) { } 172 public TSNEState(T[] data, IDistance<T> distance, IRandom random, int newDimensions, double perplexity, double theta, int stopLyingIter, int momSwitchIter, double momentum, double finalMomentum, double eta) { 171 172 public TSNEState(IReadOnlyList<T> data, IDistance<T> distance, IRandom random, int newDimensions, double perplexity, 173 double theta, int stopLyingIter, int momSwitchIter, double momentum, double finalMomentum, double eta, bool randomInit) { 173 174 this.distance = distance; 174 175 this.random = random; … … 183 184 184 185 // initialize 185 noDatapoints = data. Length;186 noDatapoints = data.Count; 186 187 if (noDatapoints - 1 < 3 * perplexity) 187 188 throw new ArgumentException("Perplexity too large for the number of data points!"); … … 193 194 gains = new double[noDatapoints, newDimensions]; 194 195 for (var i = 0; i < noDatapoints; i++) 195 196 196 for (var j = 0; j < newDimensions; j++) 197 gains[i, j] = 1.0; 197 198 198 199 p = null; … … 212 213 var rand = new NormalDistributedRandom(random, 0, 1); 213 214 for (var i = 0; i < noDatapoints; i++) 214 for (var j = 0; j < newDimensions; j++) 215 newData[i, j] = rand.NextDouble() * .0001; 215 for (var j = 0; j < newDimensions; j++) 216 newData[i, j] = rand.NextDouble() * .0001; 217 218 if (!(data[0] is IReadOnlyList<double>) || randomInit) return; 219 for (var i = 0; i < noDatapoints; i++) 220 for (var j = 0; j < newDimensions; j++) { 221 var row = (IReadOnlyList<double>) data[i]; 222 newData[i, j] = row[j % row.Count]; 223 } 216 224 } 217 225 #endregion 218 226 219 227 public double EvaluateError() { 220 return exact ? 221 EvaluateErrorExact(p, newData, noDatapoints, newDimensions) : 222 EvaluateErrorApproximate(rowP, colP, valP, newData, theta); 228 return exact ? 
EvaluateErrorExact(p, newData, noDatapoints, newDimensions) : EvaluateErrorApproximate(rowP, colP, valP, newData, theta); 223 229 } 224 230 225 231 #region Helpers 226 private static void CalculateApproximateSimilarities( T[]data, IDistance<T> distance, double perplexity, out int[] rowP, out int[] colP, out double[] valP) {232 private static void CalculateApproximateSimilarities(IReadOnlyList<T> data, IDistance<T> distance, double perplexity, out int[] rowP, out int[] colP, out double[] valP) { 227 233 // Compute asymmetric pairwise input similarities 228 ComputeGaussianPerplexity(data, distance, out rowP, out colP, out valP, perplexity, (int) (3 * perplexity));234 ComputeGaussianPerplexity(data, distance, out rowP, out colP, out valP, perplexity, (int) (3 * perplexity)); 229 235 // Symmetrize input similarities 230 236 int[] sRowP, symColP; … … 235 241 valP = sValP; 236 242 var sumP = .0; 237 for (var i = 0; i < rowP[data.Length]; i++) sumP += valP[i]; 238 for (var i = 0; i < rowP[data.Length]; i++) valP[i] /= sumP; 239 } 240 241 private static double[,] CalculateExactSimilarites(T[] data, IDistance<T> distance, double perplexity) { 243 for (var i = 0; i < rowP[data.Count]; i++) sumP += valP[i]; 244 for (var i = 0; i < rowP[data.Count]; i++) valP[i] /= sumP; 245 } 246 private static double[,] CalculateExactSimilarites(IReadOnlyList<T> data, IDistance<T> distance, double perplexity) { 242 247 // Compute similarities 243 var p = new double[data. Length, data.Length];248 var p = new double[data.Count, data.Count]; 244 249 ComputeGaussianPerplexity(data, distance, p, perplexity); 245 250 // Symmetrize input similarities 246 for (var n = 0; n < data. Length; n++) {247 for (var m = n + 1; m < data. 
Length; m++) {251 for (var n = 0; n < data.Count; n++) { 252 for (var m = n + 1; m < data.Count; m++) { 248 253 p[n, m] += p[m, n]; 249 254 p[m, n] = p[n, m]; … … 251 256 } 252 257 var sumP = .0; 253 for (var i = 0; i < data.Length; i++) for (var j = 0; j < data.Length; j++) sumP += p[i, j]; 254 for (var i = 0; i < data.Length; i++) for (var j = 0; j < data.Length; j++) p[i, j] /= sumP; 258 for (var i = 0; i < data.Count; i++) { 259 for (var j = 0; j < data.Count; j++) { 260 sumP += p[i, j]; 261 } 262 } 263 for (var i = 0; i < data.Count; i++) { 264 for (var j = 0; j < data.Count; j++) { 265 p[i, j] /= sumP; 266 } 267 } 255 268 return p; 256 269 } 257 258 270 private static void ComputeGaussianPerplexity(IReadOnlyList<T> x, IDistance<T> distance, out int[] rowP, out int[] colP, out double[] valP, double perplexity, int k) { 259 271 if (perplexity > k) throw new ArgumentException("Perplexity should be lower than k!"); … … 290 302 291 303 // Iterate until we found a good perplexity 292 var iter = 0; double sumP = 0; 304 var iter = 0; 305 double sumP = 0; 293 306 while (!found && iter < 200) { 294 295 307 // Compute Gaussian kernel row 296 308 for (var m = 0; m < k; m++) curP[m] = Math.Exp(-beta * distances[m + 1]); … … 307 319 if (hdiff < tol && -hdiff < tol) { 308 320 found = true; 309 } else { 321 } 322 else { 310 323 if (hdiff > 0) { 311 324 minBeta = beta; … … 314 327 else 315 328 beta = (beta + maxBeta) / 2.0; 316 } else { 329 } 330 else { 317 331 maxBeta = beta; 318 332 if (minBeta.IsAlmost(double.MinValue) || minBeta.IsAlmost(double.MaxValue)) … … 335 349 } 336 350 } 337 private static void ComputeGaussianPerplexity( T[]x, IDistance<T> distance, double[,] p, double perplexity) {351 private static void ComputeGaussianPerplexity(IReadOnlyList<T> x, IDistance<T> distance, double[,] p, double perplexity) { 338 352 // Compute the distance matrix 339 353 var dd = ComputeDistances(x, distance); 340 354 341 var n = x. 
Length;355 var n = x.Count; 342 356 // Compute the Gaussian kernel row by row 343 357 for (var i = 0; i < n; i++) { … … 352 366 // Iterate until we found a good perplexity 353 367 var iter = 0; 354 while (!found && iter < 200) { 368 while (!found && iter < 200) { // 200 iterations as in tSNE implementation by van der Maarten 355 369 356 370 // Compute Gaussian kernel row … … 369 383 if (hdiff < tol && -hdiff < tol) { 370 384 found = true; 371 } else { 385 } 386 else { 372 387 if (hdiff > 0) { 373 388 minBeta = beta; … … 376 391 else 377 392 beta = (beta + maxBeta) / 2.0; 378 } else { 393 } 394 else { 379 395 maxBeta = beta; 380 396 if (minBeta.IsAlmost(double.MinValue) || minBeta.IsAlmost(double.MaxValue)) … … 393 409 } 394 410 } 395 396 private static double[][] ComputeDistances(T[] x, IDistance<T> distance) { 397 var res = new double[x.Length][]; 398 for (var r = 0; r < x.Length; r++) { 399 var rowV = new double[x.Length]; 411 private static double[][] ComputeDistances(IReadOnlyList<T> x, IDistance<T> distance) { 412 var res = new double[x.Count][]; 413 for (var r = 0; r < x.Count; r++) { 414 var rowV = new double[x.Count]; 400 415 // all distances must be symmetric 401 416 for (var c = 0; c < r; c++) { … … 403 418 } 404 419 rowV[r] = 0.0; // distance to self is zero for all distances 405 for (var c = r + 1; c < x. 
Length; c++) {420 for (var c = r + 1; c < x.Count; c++) { 406 421 rowV[c] = distance.Get(x[r], x[c]); 407 422 } … … 411 426 // return x.Select(m => x.Select(n => distance.Get(m, n)).ToArray()).ToArray(); 412 427 } 413 414 428 private static double EvaluateErrorExact(double[,] p, double[,] y, int n, int d) { 415 429 // Compute the squared Euclidean distance matrix … … 425 439 q[n1, m] = 1 / (1 + dd[n1, m]); 426 440 sumQ += q[n1, m]; 427 } else q[n1, m] = double.Epsilon; 441 } 442 else q[n1, m] = double.Epsilon; 428 443 } 429 444 } … … 433 448 var c = .0; 434 449 for (var i = 0; i < n; i++) 435 436 437 450 for (var j = 0; j < n; j++) { 451 c += p[i, j] * Math.Log((p[i, j] + float.Epsilon) / (q[i, j] + float.Epsilon)); 452 } 438 453 return c; 439 454 } 440 441 455 private static double EvaluateErrorApproximate(IReadOnlyList<int> rowP, IReadOnlyList<int> colP, IReadOnlyList<double> valP, double[,] y, double theta) { 442 456 // Get estimate of normalization term … … 463 477 } 464 478 private static void SymmetrizeMatrix(IReadOnlyList<int> rowP, IReadOnlyList<int> colP, IReadOnlyList<double> valP, out int[] symRowP, out int[] symColP, out double[] symValP) { 465 466 479 // Count number of elements and row counts of symmetric matrix 467 480 var n = rowP.Count - 1; … … 469 482 for (var j = 0; j < n; j++) { 470 483 for (var i = rowP[j]; i < rowP[j + 1]; i++) { 471 472 484 // Check whether element (col_P[i], n) is present 473 485 var present = false; … … 497 509 var offset = new int[n]; 498 510 for (var j = 0; j < n; j++) { 499 for (var i = rowP[j]; i < rowP[j + 1]; i++) { 511 for (var i = rowP[j]; i < rowP[j + 1]; i++) { // considering element(n, colP[i]) 500 512 501 513 // Check whether element (col_P[i], n) is present … … 549 561 public static double[,] Run(T[] data, IDistance<T> distance, IRandom random, 550 562 int newDimensions = 2, double perplexity = 25, int iterations = 1000, 551 double theta = 0, 552 int stopLyingIter = 0, int momSwitchIter = 0, double momentum = 
.5, 563 double theta = 0, int stopLyingIter = 0, int momSwitchIter = 0, double momentum = .5, 553 564 double finalMomentum = .8, double eta = 10.0 554 565 ) { 555 566 var state = CreateState(data, distance, random, newDimensions, perplexity, 556 567 theta, stopLyingIter, momSwitchIter, momentum, finalMomentum, eta); … … 565 576 int newDimensions = 2, double perplexity = 25, double theta = 0, 566 577 int stopLyingIter = 0, int momSwitchIter = 0, double momentum = .5, 567 double finalMomentum = .8, double eta = 10.0 568 569 return new TSNEState(data, distance, random, newDimensions, perplexity, theta, stopLyingIter, momSwitchIter, momentum, finalMomentum, eta );578 double finalMomentum = .8, double eta = 10.0, bool randomInit = true 579 ) { 580 return new TSNEState(data, distance, random, newDimensions, perplexity, theta, stopLyingIter, momSwitchIter, momentum, finalMomentum, eta, randomInit); 570 581 } 571 582 … … 580 591 for (var j = 0; j < state.newDimensions; j++) { 581 592 state.gains[i, j] = Math.Sign(state.dY[i, j]) != Math.Sign(state.uY[i, j]) 582 ? state.gains[i, j] + .2 593 ? 
state.gains[i, j] + .2 // +0.2 nd *0.8 are used in two separate implementations of tSNE -> seems to be correct 583 594 : state.gains[i, j] * .8; 584 585 595 if (state.gains[i, j] < .01) state.gains[i, j] = .01; 586 596 } 587 597 } 588 589 598 590 599 // Perform gradient update (with momentum and gains) 591 600 for (var i = 0; i < state.noDatapoints; i++) 592 593 601 for (var j = 0; j < state.newDimensions; j++) 602 state.uY[i, j] = state.currentMomentum * state.uY[i, j] - state.eta * state.gains[i, j] * state.dY[i, j]; 594 603 595 604 for (var i = 0; i < state.noDatapoints; i++) 596 597 605 for (var j = 0; j < state.newDimensions; j++) 606 state.newData[i, j] = state.newData[i, j] + state.uY[i, j]; 598 607 599 608 // Make solution zero-mean … … 604 613 if (state.exact) 605 614 for (var i = 0; i < state.noDatapoints; i++) 606 607 615 for (var j = 0; j < state.noDatapoints; j++) 616 state.p[i, j] /= 12.0; 608 617 else 609 618 for (var i = 0; i < state.rowP[state.noDatapoints]; i++) … … 634 643 // Compute final t-SNE gradient 635 644 for (var i = 0; i < n; i++) 636 637 638 645 for (var j = 0; j < d; j++) { 646 dC[i, j] = posF[i, j] - negF[i, j] / sumQ; 647 } 639 648 } 640 649 -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/TSNEUtils.cs
r14414 r15614 35 35 } 36 36 37 internal static IList<T>Swap<T>(this IList<T> list, int indexA, int indexB) {37 internal static void Swap<T>(this IList<T> list, int indexA, int indexB) { 38 38 var tmp = list[indexA]; 39 39 list[indexA] = list[indexB]; 40 40 list[indexB] = tmp; 41 return list;42 41 } 43 42 44 internalstatic int Partition<T>(this IList<T> list, int left, int right, int pivotindex, IComparer<T> comparer) {43 private static int Partition<T>(this IList<T> list, int left, int right, int pivotindex, IComparer<T> comparer) { 45 44 var pivotValue = list[pivotindex]; 46 45 list.Swap(pivotindex, right); … … 67 66 /// <param name="comparer">comparer for list elemnts </param> 68 67 /// <returns></returns> 69 internal static T NthElement<T>(this IList<T> list, int left, int right, int n, IComparer<T> comparer) {68 internal static void PartialSort<T>(this IList<T> list, int left, int right, int n, IComparer<T> comparer) { 70 69 while (true) { 71 if (left == right) return list[left];72 var pivotindex = left + (int) Math.Floor(new System.Random().Next() % (right - (double)left + 1));70 if (left == right) return; 71 var pivotindex = left + (int) Math.Floor(new System.Random().Next() % (right - (double) left + 1)); 73 72 pivotindex = list.Partition(left, right, pivotindex, comparer); 74 if (n == pivotindex) return list[n];73 if (n == pivotindex) return; 75 74 if (n < pivotindex) right = pivotindex - 1; 76 75 else left = pivotindex + 1; -
branches/M5Regression/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/VantagePointTree.cs
r15207 r15614 139 139 // Partition around the median distance 140 140 var median = (upper + lower) / 2; 141 items. NthElement(lower + 1, upper - 1, median, distance.GetDistanceComparer(items[lower]));141 items.PartialSort(lower + 1, upper - 1, median, distance.GetDistanceComparer(items[lower])); 142 142 143 143 // Threshold of the new node will be the distance to the median
Note: See TracChangeset for help on using the changeset viewer.