Changeset 15030 for branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis/3.4/Implementation
- Timestamp:
- 06/08/17 17:02:13 (8 years ago)
- Location:
- branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis
- Files:
-
- 7 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis
- Property svn:mergeinfo changed
-
branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/DataAnalysisProblemData.cs
r14400 r15030 41 41 42 42 #region parameter properites 43 //mkommend: inserted parameter caching due to performance reasons 44 private IFixedValueParameter<Dataset> datasetParameter; 43 45 public IFixedValueParameter<Dataset> DatasetParameter { 44 get { return (IFixedValueParameter<Dataset>)Parameters[DatasetParameterName]; } 45 } 46 get { 47 if (datasetParameter == null) datasetParameter = (IFixedValueParameter<Dataset>)Parameters[DatasetParameterName]; 48 return datasetParameter; 49 } 50 } 51 52 private IFixedValueParameter<ReadOnlyCheckedItemList<StringValue>> inputVariablesParameter; 46 53 public IFixedValueParameter<ReadOnlyCheckedItemList<StringValue>> InputVariablesParameter { 47 get { return (IFixedValueParameter<ReadOnlyCheckedItemList<StringValue>>)Parameters[InputVariablesParameterName]; } 48 } 54 get { 55 if (inputVariablesParameter == null) inputVariablesParameter = (IFixedValueParameter<ReadOnlyCheckedItemList<StringValue>>)Parameters[InputVariablesParameterName]; 56 return inputVariablesParameter; 57 } 58 } 59 60 private IFixedValueParameter<IntRange> trainingPartitionParameter; 49 61 public IFixedValueParameter<IntRange> TrainingPartitionParameter { 50 get { return (IFixedValueParameter<IntRange>)Parameters[TrainingPartitionParameterName]; } 51 } 62 get { 63 if (trainingPartitionParameter == null) trainingPartitionParameter = (IFixedValueParameter<IntRange>)Parameters[TrainingPartitionParameterName]; 64 return trainingPartitionParameter; 65 } 66 } 67 68 private IFixedValueParameter<IntRange> testPartitionParameter; 52 69 public IFixedValueParameter<IntRange> TestPartitionParameter { 53 get { return (IFixedValueParameter<IntRange>)Parameters[TestPartitionParameterName]; } 54 } 70 get { 71 if (testPartitionParameter == null) testPartitionParameter = (IFixedValueParameter<IntRange>)Parameters[TestPartitionParameterName]; 72 return testPartitionParameter; 73 } 74 } 75 55 76 public IFixedValueParameter<ReadOnlyItemList<ITransformation>> 
TransformationsParameter { 56 77 get { return (IFixedValueParameter<ReadOnlyItemList<ITransformation>>)Parameters[TransformationsParameterName]; } … … 73 94 } 74 95 96 public double[,] AllowedInputsTrainingValues { 97 get { return Dataset.ToArray(AllowedInputVariables, TrainingIndices); } 98 } 99 100 public double[,] AllowedInputsTestValues { get { return Dataset.ToArray(AllowedInputVariables, TestIndices); } } 75 101 public IntRange TrainingPartition { 76 102 get { return TrainingPartitionParameter.Value; } … … 102 128 public virtual bool IsTrainingSample(int index) { 103 129 return index >= 0 && index < Dataset.Rows && 104 TrainingPartition.Start <= index && index < TrainingPartition.End &&105 (index < TestPartition.Start || TestPartition.End <= index);130 TrainingPartition.Start <= index && index < TrainingPartition.End && 131 (index < TestPartition.Start || TestPartition.End <= index); 106 132 } 107 133 … … 131 157 protected DataAnalysisProblemData(IDataset dataset, IEnumerable<string> allowedInputVariables, IEnumerable<ITransformation> transformations = null) { 132 158 if (dataset == null) throw new ArgumentNullException("The dataset must not be null."); 133 if (allowedInputVariables == null) throw new ArgumentNullException("The allowedInputVariables must not be null."); 134 135 if (allowedInputVariables.Except(dataset.DoubleVariables).Any()) 136 throw new ArgumentException("All allowed input variables must be present in the dataset and of type double."); 137 138 var inputVariables = new CheckedItemList<StringValue>(dataset.DoubleVariables.Select(x => new StringValue(x))); 159 if (allowedInputVariables == null) throw new ArgumentNullException("The allowed input variables must not be null."); 160 161 if (allowedInputVariables.Except(dataset.DoubleVariables).Except(dataset.StringVariables).Any()) 162 throw new ArgumentException("All allowed input variables must be present in the dataset and of type double or string."); 163 164 var variables = 
dataset.VariableNames.Where(variable => dataset.VariableHasType<double>(variable) || dataset.VariableHasType<string>(variable)); 165 var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x))); 139 166 foreach (StringValue x in inputVariables) 140 167 inputVariables.SetItemCheckedState(x, allowedInputVariables.Contains(x.Value)); … … 214 241 InputVariables.SetItemCheckedState(inputVariable, variable != null && data.InputVariables.ItemChecked(variable)); 215 242 } 216 217 TrainingPartition.Start = TrainingPartition.End = 0;218 TestPartition.Start = 0;219 TestPartition.End = Dataset.Rows;220 243 } 221 244 } -
branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs
r14463 r15030 42 42 Noise 43 43 } 44 44 public enum FactorReplacementMethodEnum { 45 Best, 46 Mode, 47 Shuffle 48 } 45 49 public enum DataPartitionEnum { 46 50 Training, … … 88 92 } 89 93 90 public static IEnumerable<Tuple<string, double>> CalculateImpacts(IRegressionSolution solution, 94 public static IEnumerable<Tuple<string, double>> CalculateImpacts( 95 IRegressionSolution solution, 91 96 DataPartitionEnum data = DataPartitionEnum.Training, 92 ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) { 97 ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median, 98 FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) { 93 99 94 100 var problemData = solution.ProblemData; … … 128 134 var allowedInputVariables = dataset.VariableNames.Where(v => inputvariables.Contains(v)).ToList(); 129 135 130 foreach (var inputVariable in allowedInputVariables) { 131 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacement); 136 // calculate impacts for double variables 137 foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) { 138 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod); 132 139 var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error); 133 140 if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs."); … … 137 144 impacts[inputVariable] = impact; 138 145 } 146 147 // calculate impacts for string variables 148 foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<string>)) { 149 if (factorReplacementMethod == FactorReplacementMethodEnum.Best) { 150 // try replacing with all possible values and find the best replacement value 151 var smallestImpact = double.PositiveInfinity; 152 foreach 
(var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) { 153 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, 154 Enumerable.Repeat(repl, dataset.Rows)); 155 var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error); 156 if (error != OnlineCalculatorError.None) 157 throw new InvalidOperationException("Error during R² calculation with replaced inputs."); 158 159 newR2 = newR2 * newR2; 160 var impact = originalR2 - newR2; 161 if (impact < smallestImpact) smallestImpact = impact; 162 } 163 impacts[inputVariable] = smallestImpact; 164 } else { 165 // for replacement methods shuffle and mode 166 // calculate impacts for factor variables 167 168 var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, 169 factorReplacementMethod); 170 var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error); 171 if (error != OnlineCalculatorError.None) 172 throw new InvalidOperationException("Error during R² calculation with replaced inputs."); 173 174 newR2 = newR2 * newR2; 175 var impact = originalR2 - newR2; 176 impacts[inputVariable] = impact; 177 } 178 } // foreach 139 179 return impacts.OrderByDescending(i => i.Value).Select(i => Tuple.Create(i.Key, i.Value)); 140 180 } … … 184 224 } 185 225 186 dataset.ReplaceVariable(variable, replacementValues); 226 return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues); 227 } 228 229 private static IEnumerable<double> EvaluateModelWithReplacedVariable( 230 IRegressionModel model, string variable, ModifiableDataset dataset, 231 IEnumerable<int> rows, 232 FactorReplacementMethodEnum replacement = FactorReplacementMethodEnum.Shuffle) { 233 var originalValues = dataset.GetReadOnlyStringValues(variable).ToList(); 234 List<string> replacementValues; 235 IRandom rand; 236 237 switch (replacement) { 238 case 
FactorReplacementMethodEnum.Mode: 239 var mostCommonValue = rows.Select(r => originalValues[r]) 240 .GroupBy(v => v) 241 .OrderByDescending(g => g.Count()) 242 .First().Key; 243 replacementValues = Enumerable.Repeat(mostCommonValue, dataset.Rows).ToList(); 244 break; 245 case FactorReplacementMethodEnum.Shuffle: 246 // new var has same empirical distribution but the relation to y is broken 247 rand = new FastRandom(31415); 248 // prepare a complete column for the dataset 249 replacementValues = Enumerable.Repeat(string.Empty, dataset.Rows).ToList(); 250 // shuffle only the selected rows 251 var shuffledValues = rows.Select(r => originalValues[r]).Shuffle(rand).ToList(); 252 int i = 0; 253 // update column values 254 foreach (var r in rows) { 255 replacementValues[r] = shuffledValues[i++]; 256 } 257 break; 258 default: 259 throw new ArgumentException(string.Format("FactorReplacementMethod {0} cannot be handled.", replacement)); 260 } 261 262 return EvaluateModelWithReplacedVariable(model, variable, dataset, rows, replacementValues); 263 } 264 265 private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, 266 ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<double> replacementValues) { 267 var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList(); 268 dataset.ReplaceVariable(variable, replacementValues.ToList()); 187 269 //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements 188 270 var estimates = model.GetEstimatedValues(dataset, rows).ToList(); … … 191 273 return estimates; 192 274 } 275 private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, 276 ModifiableDataset dataset, IEnumerable<int> rows, IEnumerable<string> replacementValues) { 277 var originalValues = dataset.GetReadOnlyStringValues(variable).ToList(); 278 dataset.ReplaceVariable(variable, 
replacementValues.ToList()); 279 //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements 280 var estimates = model.GetEstimatedValues(dataset, rows).ToList(); 281 dataset.ReplaceVariable(variable, originalValues); 282 283 return estimates; 284 } 193 285 } 194 286 } -
branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Transformations/LinearTransformation.cs
r14400 r15030 52 52 public double Multiplier { 53 53 get { return MultiplierParameter.Value.Value; } 54 protected set { 54 set { 55 55 MultiplierParameter.Value.Value = value; 56 56 } … … 59 59 public double Addend { 60 60 get { return AddendParameter.Value.Value; } 61 protected set { 61 set { 62 62 AddendParameter.Value.Value = value; 63 63 } -
branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Transformations/ShiftStandardDistributionTransformation.cs
r14400 r15030 71 71 72 72 public override IEnumerable<double> Apply(IEnumerable<double> data) { 73 ConfigureParameters(data); 74 73 if (OriginalStandardDeviation.IsAlmost(0.0)) { 75 74 return data; … … 94 93 } 95 94 96 protected void ConfigureParameters(IEnumerable<double> data) { 95 public override void ConfigureParameters(IEnumerable<double> data) { 97 96 OriginalStandardDeviation = data.StandardDeviation(); 98 97 OriginalMean = data.Average(); -
branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Transformations/ShiftToRangeTransformation.cs
r14400 r15030 44 44 } 45 45 46 public override IEnumerable<double> Apply(IEnumerable<double> data) { 47 ConfigureParameters(data); 48 return base.Apply(data); 49 } 50 51 46 public override bool Check(IEnumerable<double> data, out string errorMsg) { 52 47 ConfigureParameters(data); … … 54 49 } 55 50 56 protected void ConfigureParameters(IEnumerable<double> data) { 51 public override void ConfigureParameters(IEnumerable<double> data) { 57 52 double originalRangeStart = data.Min(); 58 53 double originalRangeEnd = data.Max(); -
branches/HeuristicLab.DatastreamAnalysis/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Transformations/Transformation.cs
r14400 r15030 66 66 protected Transformation(IEnumerable<string> allowedColumns) : base(allowedColumns) { } 67 67 68 public virtual void ConfigureParameters(IEnumerable<T> data) { 69 // override in transformations with parameters 70 } 71 68 72 public abstract IEnumerable<T> Apply(IEnumerable<T> data); 73 public IEnumerable<T> ConfigureAndApply(IEnumerable<T> data) { 74 ConfigureParameters(data); 75 return Apply(data); 76 } 69 77 70 78 public abstract bool Check(IEnumerable<T> data, out string errorMsg);
Note: See TracChangeset
for help on using the changeset viewer.