Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
09/14/18 11:47:37 (6 years ago)
Author:
abeham
Message:

#2817: updated to trunk r16140

Location:
branches/2817-BinPackingSpeedup
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • branches/2817-BinPackingSpeedup

  • branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis

  • branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/Dataset.cs

    r16140 r16141  
    3838    protected Dataset(Dataset original, Cloner cloner)
    3939      : base(original, cloner) {
     40      // no need to clone the variable values because these can't be modified
    4041      variableValues = new Dictionary<string, IList>(original.variableValues);
    4142      variableNames = new List<string>(original.variableNames);
    4243      rows = original.rows;
    4344    }
     45
    4446    public override IDeepCloneable Clone(Cloner cloner) { return new Dataset(this, cloner); }
    4547
     
    5860    /// <param name="variableValues">The values for the variables (column-oriented storage). Values are not cloned!</param>
    5961    public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues)
    60       : base() {
     62      : this(variableNames, variableValues, cloneValues: true) {
     63    }
     64
     65    protected Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) {
    6166      Name = "-";
    62       if (!variableNames.Any()) {
     67
     68      if (variableNames.Any()) {
     69        this.variableNames = new List<string>(variableNames);
     70      } else {
    6371        this.variableNames = Enumerable.Range(0, variableValues.Count()).Select(x => "Column " + x).ToList();
    64       } else if (variableNames.Count() != variableValues.Count()) {
    65         throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues");
    66       } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) {
    67         throw new ArgumentException("The number of values must be equal for every variable");
    68       } else if (variableNames.Distinct().Count() != variableNames.Count()) {
    69         var duplicateVariableNames =
    70           variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList();
    71         string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine;
    72         foreach (var duplicateVariableName in duplicateVariableNames)
    73           message += duplicateVariableName + Environment.NewLine;
    74         throw new ArgumentException(message);
    75       }
     72      }
     73      // check if the arguments are consistent (no duplicate variables, same number of rows, correct data types, ...)
     74      CheckArguments(this.variableNames, variableValues);
     75
    7676      rows = variableValues.First().Count;
    77       this.variableNames = new List<string>(variableNames);
    78       this.variableValues = new Dictionary<string, IList>(this.variableNames.Count);
    79       for (int i = 0; i < this.variableNames.Count; i++) {
    80         var values = variableValues.ElementAt(i);
    81         this.variableValues.Add(this.variableNames[i], values);
     77
     78      if (cloneValues) {
     79        this.variableValues = CloneValues(this.variableNames, variableValues);
     80      } else {
     81        this.variableValues = new Dictionary<string, IList>(this.variableNames.Count);
     82        for (int i = 0; i < this.variableNames.Count; i++) {
     83          var variableName = this.variableNames[i];
     84          var values = variableValues.ElementAt(i);
     85          this.variableValues.Add(variableName, values);
     86        }
    8287      }
    8388    }
     
    111116
    112117    public ModifiableDataset ToModifiable() {
    113       var values = new List<IList>();
    114       foreach (var v in variableNames) {
    115         if (VariableHasType<double>(v)) {
    116           values.Add(new List<double>((IList<double>)variableValues[v]));
    117         } else if (VariableHasType<string>(v)) {
    118           values.Add(new List<string>((IList<string>)variableValues[v]));
    119         } else if (VariableHasType<DateTime>(v)) {
    120           values.Add(new List<DateTime>((IList<DateTime>)variableValues[v]));
    121         } else {
    122           throw new ArgumentException("Unknown variable type.");
    123         }
    124       }
    125       return new ModifiableDataset(variableNames, values);
    126     }
     118      return new ModifiableDataset(variableNames, variableNames.Select(v => variableValues[v]), true);
     119    }
     120
    127121    /// <summary>
    128122    /// Shuffle a dataset's rows
     
    135129    }
    136130
    137     protected Dataset(Dataset dataset) : this(dataset.variableNames, dataset.variableValues.Values) { }
     131
    138132
    139133    #region Backwards compatible code, remove with 3.5
     
    231225      return new ReadOnlyCollection<DateTime>(values);
    232226    }
    233 
    234 
    235227    private IEnumerable<T> GetValues<T>(string variableName, IEnumerable<int> rows) {
    236228      var values = GetValues<T>(variableName);
     
    248240      return variableValues[variableName] is IList<T>;
    249241    }
     242    protected Type GetVariableType(string variableName) {
     243      IList list;
     244      variableValues.TryGetValue(variableName, out list);
     245      if (list == null)
     246        throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");
     247      return GetElementType(list);
     248    }
     249    protected static Type GetElementType(IList list) {
     250      var type = list.GetType();
     251      return type.IsGenericType ? type.GetGenericArguments()[0] : type.GetElementType();
     252    }
     253    protected static bool IsAllowedType(IList list) {
     254      var type = GetElementType(list);
     255      return IsAllowedType(type);
     256    }
     257    protected static bool IsAllowedType(Type type) {
     258      return type == typeof(double) || type == typeof(string) || type == typeof(DateTime);
     259    }
     260
     261    protected static void CheckArguments(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) {
     262      if (variableNames.Count() != variableValues.Count()) {
     263        throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues");
     264      } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) {
     265        throw new ArgumentException("The number of values must be equal for every variable");
     266      } else if (variableNames.Distinct().Count() != variableNames.Count()) {
     267        var duplicateVariableNames =
     268          variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList();
     269        string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine;
     270        foreach (var duplicateVariableName in duplicateVariableNames)
     271          message += duplicateVariableName + Environment.NewLine;
     272        throw new ArgumentException(message);
     273      }
     274      // check if all the variables are supported
     275      foreach (var t in variableNames.Zip(variableValues, Tuple.Create)) {
     276        var variableName = t.Item1;
     277        var values = t.Item2;
     278
     279        if (!IsAllowedType(values)) {
     280          throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));
     281        }
     282      }
     283    }
     284
     285    protected static Dictionary<string, IList> CloneValues(Dictionary<string, IList> variableValues) {
     286      return variableValues.ToDictionary(x => x.Key, x => CloneValues(x.Value));
     287    }
     288
     289    protected static Dictionary<string, IList> CloneValues(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) {
     290      return variableNames.Zip(variableValues, Tuple.Create).ToDictionary(x => x.Item1, x => CloneValues(x.Item2));
     291    }
     292
     293    protected static IList CloneValues(IList values) {
     294      var doubleValues = values as IList<double>;
     295      if (doubleValues != null) return new List<double>(doubleValues);
     296
     297      var stringValues = values as IList<string>;
     298      if (stringValues != null) return new List<string>(stringValues);
     299
     300      var dateTimeValues = values as IList<DateTime>;
     301      if (dateTimeValues != null) return new List<DateTime>(dateTimeValues);
     302
     303      throw new ArgumentException(string.Format("Unsupported variable type {0}.", GetElementType(values)));
     304    }
    250305
    251306    #region IStringConvertibleMatrix Members
    252307    [Storable]
    253     protected int rows;
     308    private int rows;
    254309    public int Rows {
    255310      get { return rows; }
     311      protected set { rows = value; }
    256312    }
    257313    int IStringConvertibleMatrix.Rows {
  • branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Classification/ClassificationSolutionVariableImpactsCalculator.cs

    r16140 r16141  
    100100      var problemData = solution.ProblemData;
    101101      var dataset = problemData.Dataset;
     102      var model = (IClassificationModel)solution.Model.Clone(); //mkommend: clone of model is necessary, because the thresholds for IDiscriminantClassificationModels are updated
    102103
    103104      IEnumerable<int> rows;
     
    137138      // calculate impacts for double variables
    138139      foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
    139         var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod);
     140        var newEstimates = EvaluateModelWithReplacedVariable(model, inputVariable, modifiableDataset, rows, replacementMethod);
    140141        var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error);
    141142        if (error != OnlineCalculatorError.None) throw new InvalidOperationException("Error during R² calculation with replaced inputs.");
     
    150151          var smallestImpact = double.PositiveInfinity;
    151152          foreach (var repl in problemData.Dataset.GetStringValues(inputVariable, rows).Distinct()) {
    152             var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     153            var newEstimates = EvaluateModelWithReplacedVariable(model, inputVariable, modifiableDataset, rows,
    153154              Enumerable.Repeat(repl, dataset.Rows));
    154155            var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error);
     
    164165          // calculate impacts for factor variables
    165166
    166           var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows,
     167          var newEstimates = EvaluateModelWithReplacedVariable(model, inputVariable, modifiableDataset, rows,
    167168            factorReplacementMethod);
    168169          var newAccuracy = OnlineAccuracyCalculator.Calculate(targetValues, newEstimates, out error);
     
    263264      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
    264265      dataset.ReplaceVariable(variable, replacementValues.ToList());
     266
     267      var discModel = model as IDiscriminantFunctionClassificationModel;
     268      if (discModel != null) {
     269        var problemData = new ClassificationProblemData(dataset, dataset.VariableNames, model.TargetVariable);
     270        discModel.RecalculateModelParameters(problemData, rows);
     271      }
     272
    265273      //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    266274      var estimates = model.GetEstimatedClassValues(dataset, rows).ToList();
     
    273281      var originalValues = dataset.GetReadOnlyStringValues(variable).ToList();
    274282      dataset.ReplaceVariable(variable, replacementValues.ToList());
     283
     284
     285      var discModel = model as IDiscriminantFunctionClassificationModel;
     286      if (discModel != null) {
     287        var problemData = new ClassificationProblemData(dataset, dataset.VariableNames, model.TargetVariable);
     288        discModel.RecalculateModelParameters(problemData, rows);
     289      }
     290
    275291      //mkommend: ToList is used on purpose to avoid lazy evaluation that could result in wrong estimates due to variable replacements
    276292      var estimates = model.GetEstimatedClassValues(dataset, rows).ToList();
  • branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/DataAnalysisProblemData.cs

    r16140 r16141  
    163163
    164164      var variables = dataset.VariableNames.Where(variable => dataset.VariableHasType<double>(variable) || dataset.VariableHasType<string>(variable));
    165       var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x)));
     165      var inputVariables = new CheckedItemList<StringValue>(variables.Select(x => new StringValue(x).AsReadOnly()));
    166166      foreach (StringValue x in inputVariables)
    167167        inputVariables.SetItemCheckedState(x, allowedInputVariables.Contains(x.Value));
  • branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/Implementation/Regression/RegressionSolutionVariableImpactsCalculator.cs

    r16140 r16141  
    5252      All
    5353    }
    54    
     54
    5555    private const string ReplacementParameterName = "Replacement Method";
    5656    private const string DataPartitionParameterName = "DataPartition";
     
    9696      DataPartitionEnum data = DataPartitionEnum.Training,
    9797      ReplacementMethodEnum replacementMethod = ReplacementMethodEnum.Median,
    98       FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best) {
     98      FactorReplacementMethodEnum factorReplacementMethod = FactorReplacementMethodEnum.Best,
     99      Func<double, string, bool> progressCallback = null) {
    99100
    100101      var problemData = solution.ProblemData;
     
    134135      var allowedInputVariables = dataset.VariableNames.Where(v => inputvariables.Contains(v)).ToList();
    135136
     137      int curIdx = 0;
     138      int count = allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>).Count();
    136139      // calculate impacts for double variables
    137140      foreach (var inputVariable in allowedInputVariables.Where(problemData.Dataset.VariableHasType<double>)) {
     141        //Report the current progress in percent. If the callback returns true, it means the execution shall be stopped
     142        if (progressCallback != null) {
     143          curIdx++;
     144          if (progressCallback((double)curIdx / count, string.Format("Calculating impact for variable {0} ({1} of {2})", inputVariable, curIdx, count))) { return null; }
     145        }
    138146        var newEstimates = EvaluateModelWithReplacedVariable(solution.Model, inputVariable, modifiableDataset, rows, replacementMethod);
    139147        var newR2 = OnlinePearsonsRCalculator.Calculate(targetValues, newEstimates, out error);
     
    180188    }
    181189
     190
    182191    private static IEnumerable<double> EvaluateModelWithReplacedVariable(IRegressionModel model, string variable, ModifiableDataset dataset, IEnumerable<int> rows, ReplacementMethodEnum replacement = ReplacementMethodEnum.Median) {
    183192      var originalValues = dataset.GetReadOnlyDoubleValues(variable).ToList();
  • branches/2817-BinPackingSpeedup/HeuristicLab.Problems.DataAnalysis/3.4/ModifiableDataset.cs

    r16140 r16141  
    3939
    4040    private ModifiableDataset(ModifiableDataset original, Cloner cloner) : base(original, cloner) {
    41       var variables = variableValues.Keys.ToList();
    42       foreach (var v in variables) {
    43         var type = GetVariableType(v);
    44         if (type == typeof(DateTime)) {
    45           variableValues[v] = GetDateTimeValues(v).ToList();
    46         } else if (type == typeof(double)) {
    47           variableValues[v] = GetDoubleValues(v).ToList();
    48         } else if (type == typeof(string)) {
    49           variableValues[v] = GetStringValues(v).ToList();
    50         } else {
    51           throw new ArgumentException("Unsupported type " + type + " for variable " + v);
     41      variableNames = new List<string>(original.variableNames);
     42      variableValues = CloneValues(original.variableValues);
     43    }
     44
     45    public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); }
     46
     47    public ModifiableDataset() { }
     48
     49    public ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) :
     50      base(variableNames, variableValues, cloneValues) { }
     51
     52    public Dataset ToDataset() {
     53      return new Dataset(variableNames, variableNames.Select(v => variableValues[v]));
     54    }
     55
     56
     57    public IEnumerable<object> GetRow(int row) {
     58      return variableValues.Select(x => x.Value[row]);
     59    }
     60
     61    public void AddRow(IEnumerable<object> values) {
     62      var list = values.ToList();
     63      if (list.Count != variableNames.Count)
     64        throw new ArgumentException("The number of values must be equal to the number of variable names.");
     65      // check if all the values are of the correct type
     66      for (int i = 0; i < list.Count; ++i) {
     67        if (list[i].GetType() != GetVariableType(variableNames[i])) {
     68          throw new ArgumentException("The type of the provided value does not match the variable type.");
    5269        }
    5370      }
    54     }
    55     public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); }
    56     public ModifiableDataset() : base() { }
    57 
    58     public ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) : base(variableNames, variableValues) { }
     71      // add values
     72      for (int i = 0; i < list.Count; ++i) {
     73        variableValues[variableNames[i]].Add(list[i]);
     74      }
     75      Rows++;
     76      OnRowsChanged();
     77      OnReset();
     78    }
    5979
    6080    public void ReplaceRow(int row, IEnumerable<object> values) {
     
    7292        variableValues[variableNames[i]][row] = list[i];
    7393      }
     94      OnReset();
     95    }
     96
     97    // slow, avoid using this
     98    public void RemoveRow(int row) {
     99      foreach (var list in variableValues.Values)
     100        list.RemoveAt(row);
     101      Rows--;
     102      OnRowsChanged();
     103      OnReset();
     104    }
     105
     106    // adds a new variable to the dataset
     107    public void AddVariable(string variableName, IList values) {
     108      InsertVariable(variableName, Columns, values);
     109    }
     110
     111    public void InsertVariable(string variableName, int position, IList values) {
     112      if (variableValues.ContainsKey(variableName))
     113        throw new ArgumentException(string.Format("Variable {0} is already present in the dataset.", variableName));
     114
     115      if (position < 0 || position > Columns)
     116        throw new ArgumentException(string.Format("Incorrect position {0} specified. The position must be between 0 and {1}.", position, Columns));
     117
     118      if (values == null)
     119        throw new ArgumentNullException("values", "Values must not be null. At least an empty list of values has to be provided.");
     120
     121      if (values.Count != Rows)
     122        throw new ArgumentException(string.Format("{0} values are provided, but {1} rows are present in the dataset.", values.Count, Rows));
     123
     124      if (!IsAllowedType(values))
     125        throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));
     126
     127      variableNames.Insert(position, variableName);
     128      variableValues[variableName] = values;
     129
     130      OnColumnsChanged();
     131      OnColumnNamesChanged();
    74132      OnReset();
    75133    }
     
    85143    }
    86144
    87     public void AddRow(IEnumerable<object> values) {
    88       var list = values.ToList();
    89       if (list.Count != variableNames.Count)
    90         throw new ArgumentException("The number of values must be equal to the number of variable names.");
    91       // check if all the values are of the correct type
    92       for (int i = 0; i < list.Count; ++i) {
    93         if (list[i].GetType() != GetVariableType(variableNames[i])) {
    94           throw new ArgumentException("The type of the provided value does not match the variable type.");
    95         }
    96       }
    97       // add values
    98       for (int i = 0; i < list.Count; ++i) {
    99         variableValues[variableNames[i]].Add(list[i]);
    100       }
    101       rows++;
    102       OnRowsChanged();
    103       OnReset();
    104     }
    105 
    106     // adds a new variable to the dataset
    107     public void AddVariable<T>(string variableName, IEnumerable<T> values) {
    108       if (variableValues.ContainsKey(variableName))
    109         throw new ArgumentException("Variable " + variableName + " is already present in the dataset.");
    110       int count = values.Count();
    111       if (count != rows)
    112         throw new ArgumentException("The number of values must exactly match the number of rows in the dataset.");
    113       variableValues[variableName] = new List<T>(values);
    114       variableNames.Add(variableName);
    115       OnColumnsChanged();
    116       OnColumnNamesChanged();
    117       OnReset();
    118     }
    119145
    120146    public void RemoveVariable(string variableName) {
    121147      if (!variableValues.ContainsKey(variableName))
    122         throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");
     148        throw new ArgumentException(string.Format("The variable {0} does not exist in the dataset.", variableName));
    123149      variableValues.Remove(variableName);
    124150      variableNames.Remove(variableName);
     
    128154    }
    129155
    130     // slow, avoid to use this
    131     public void RemoveRow(int row) {
    132       foreach (var list in variableValues.Values)
    133         list.RemoveAt(row);
    134       rows--;
     156    public void ClearValues() {
     157      foreach (var list in variableValues.Values) {
     158        list.Clear();
     159      }
     160      Rows = 0;
    135161      OnRowsChanged();
    136162      OnReset();
    137163    }
     164
    138165
    139166    public void SetVariableValue(object value, string variableName, int row) {
     
    151178    }
    152179
    153     private Type GetVariableType(string variableName) {
    154       IList list;
    155       variableValues.TryGetValue(variableName, out list);
    156       if (list == null)
    157         throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");
    158       return list.GetType().GetGenericArguments()[0];
    159     }
    160 
    161180    bool IStringConvertibleMatrix.SetValue(string value, int rowIndex, int columnIndex) {
    162181      var variableName = variableNames[columnIndex];
Note: See TracChangeset for help on using the changeset viewer.