source: trunk/sources/HeuristicLab.DataPreprocessing/3.4/ProblemDataCreator.cs @ 11156

Last change on this file since 11156 was 11156, checked in by gkronber, 8 years ago

#2206: made several changes / improvements to the data-preprocessing code while reviewing the code

File size: 4.4 KB
Line 
#region License Information
/* HeuristicLab
 * Copyright (C) 2002-2013 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion
21
22using System;
23using System.Collections.Generic;
24using HeuristicLab.Common;
25using HeuristicLab.Problems.DataAnalysis;
26
namespace HeuristicLab.DataPreprocessing {
  /// <summary>
  /// Builds a new data-analysis problem data object from the data held by an
  /// <see cref="IPreprocessingContext"/>, using the same problem kind
  /// (regression, classification or clustering) as the context's current problem data.
  /// </summary>
  public class ProblemDataCreator {

    private readonly IPreprocessingContext context;

    // Cached result of the first export; later reads of ExportedDataset reuse it.
    private Dataset exportedDataset;

    /// <summary>
    /// Dataset exported from the preprocessing data. The export is performed on first
    /// access and the same instance is returned on every subsequent access.
    /// </summary>
    private Dataset ExportedDataset {
      get {
        if (exportedDataset == null) {
          exportedDataset = context.Data.ExportToDataset();
        }
        return exportedDataset;
      }
    }

    /// <summary>
    /// Transformations recorded on the preprocessing data; passed on to the created problem data.
    /// </summary>
    private IList<ITransformation> Transformations {
      get { return context.Data.Transformations; }
    }

    /// <summary>
    /// Creates a creator that reads from the given preprocessing context.
    /// </summary>
    /// <param name="context">Context supplying the preprocessed data and the current problem data.</param>
    public ProblemDataCreator(IPreprocessingContext context) {
      this.context = context;
    }

    /// <summary>
    /// Creates problem data of the same kind as <c>context.ProblemData</c> from the
    /// preprocessed data and copies the training and test partition bounds onto it.
    /// </summary>
    /// <returns>
    /// The new problem data, or <c>null</c> when the preprocessing data has no rows or no columns.
    /// </returns>
    /// <exception cref="NotImplementedException">
    /// The current problem data is not regression, classification or clustering problem data.
    /// </exception>
    public IDataAnalysisProblemData CreateProblemData() {
      var data = context.Data;
      if (data.Rows == 0 || data.Columns == 0) return null;

      var previous = context.ProblemData;
      IDataAnalysisProblemData created;

      // The checks run in the order regression, classification, clustering.
      var regression = previous as RegressionProblemData;
      if (regression != null) {
        created = CreateRegressionData(regression);
      } else {
        var classification = previous as ClassificationProblemData;
        if (classification != null) {
          created = CreateClassificationData(classification);
        } else {
          var clustering = previous as ClusteringProblemData;
          if (clustering == null) {
            throw new NotImplementedException("The type of the DataAnalysisProblemData is not supported.");
          }
          created = CreateClusteringData(clustering);
        }
      }

      SetTrainingAndTestPartition(created);
      return created;
    }

    /// <summary>
    /// Creates regression problem data on the exported dataset, keeping the target
    /// variable of <paramref name="oldProblemData"/>.
    /// </summary>
    private IDataAnalysisProblemData CreateRegressionData(RegressionProblemData oldProblemData) {
      // target variable must be double and must exist in the new dataset
      // (this method does not check that itself)
      var target = oldProblemData.TargetVariable;
      var dataset = ExportedDataset;
      var inputs = GetDoubleInputVariables(target);
      return new RegressionProblemData(dataset, inputs, target, Transformations);
    }

    /// <summary>
    /// Creates classification problem data on the exported dataset, keeping the target
    /// variable of <paramref name="oldProblemData"/>.
    /// </summary>
    private IDataAnalysisProblemData CreateClassificationData(ClassificationProblemData oldProblemData) {
      // target variable must be double and must exist in the new dataset
      // (this method does not check that itself)
      var target = oldProblemData.TargetVariable;
      var dataset = ExportedDataset;
      var inputs = GetDoubleInputVariables(target);
      return new ClassificationProblemData(dataset, inputs, target, Transformations);
    }

    /// <summary>
    /// Creates clustering problem data on the exported dataset. Nothing is read from
    /// <paramref name="oldProblemData"/>; an empty string is used as the excluded
    /// target name when selecting input variables.
    /// </summary>
    private IDataAnalysisProblemData CreateClusteringData(ClusteringProblemData oldProblemData) {
      var dataset = ExportedDataset;
      var inputs = GetDoubleInputVariables(String.Empty);
      return new ClusteringProblemData(dataset, inputs, Transformations);
    }

    /// <summary>
    /// Copies the start and end of the training and test partitions from the
    /// preprocessing data onto <paramref name="problemData"/>.
    /// </summary>
    private void SetTrainingAndTestPartition(IDataAnalysisProblemData problemData) {
      var source = context.Data;
      var sourceTraining = source.TrainingPartition;
      var sourceTest = source.TestPartition;

      // Assignment order (training start, training end, test start, test end) is kept as-is.
      problemData.TrainingPartition.Start = sourceTraining.Start;
      problemData.TrainingPartition.End = sourceTraining.End;
      problemData.TestPartition.Start = sourceTest.Start;
      problemData.TestPartition.End = sourceTest.End;
    }

    /// <summary>
    /// Collects the names of all double-typed columns that are not the target variable
    /// and that pass <see cref="IsNotConstantInputVariable"/>.
    /// </summary>
    /// <param name="targetVariable">Name of the column to leave out of the result.</param>
    /// <returns>The selected variable names in column order.</returns>
    private IEnumerable<string> GetDoubleInputVariables(string targetVariable) {
      var data = context.Data;
      var inputNames = new List<string>();

      for (int column = 0; column < data.Columns; column++) {
        var name = data.GetVariableName(column);

        if (!data.VariableHasType<double>(column)) continue;
        if (name == targetVariable) continue;
        if (!IsNotConstantInputVariable(data.GetValues<double>(column))) continue;

        inputNames.Add(name);
      }

      return inputNames;
    }

    /// <summary>
    /// Returns <c>true</c> when the training partition spans more than one row, or
    /// otherwise when the range of <paramref name="list"/> is greater than zero.
    /// </summary>
    /// <remarks>
    /// NOTE(review): as written, the values are only inspected when the training
    /// partition spans at most one row; for larger partitions every column is accepted,
    /// including constant ones. Confirm that this is the intended rule.
    /// </remarks>
    private bool IsNotConstantInputVariable(IList<double> list) {
      var training = context.Data.TrainingPartition;
      if (training.End - training.Start > 1) {
        return true;
      }
      return list.Range() > 0;
    }
  }
}
Note: See TracBrowser for help on using the repository browser.