Changeset 16124 for stable


Ignore:
Timestamp:
09/06/18 12:19:23 (13 months ago)
Author:
mkommend
Message:

#2897: Merged r15769 and r15829 into stable.

Location:
stable
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • stable

  • stable/HeuristicLab.Algorithms.DataAnalysis

  • stable/HeuristicLab.Algorithms.DataAnalysis/3.4

  • stable/HeuristicLab.Algorithms.DataAnalysis/3.4/GBM/GradientBoostingRegressionAlgorithm.cs

    r15584 r16124  
    258258
    259259          modifiableDataset.RemoveVariable(targetVarName);
    260           modifiableDataset.AddVariable(targetVarName, curY.Concat(curYTest));
     260          modifiableDataset.AddVariable(targetVarName, curY.Concat(curYTest).ToList());
    261261
    262262          SampleTrainingData(rand, modifiableDataset, rRows, problemData.Dataset, curY, problemData.TargetVariable, problemData.TrainingIndices); // all training indices from the original problem data are allowed
  • stable/HeuristicLab.Problems.DataAnalysis

  • stable/HeuristicLab.Problems.DataAnalysis/3.4/Dataset.cs

    r15584 r16124  
    3838    protected Dataset(Dataset original, Cloner cloner)
    3939      : base(original, cloner) {
     40      // no need to clone the variable values because these can't be modified
    4041      variableValues = new Dictionary<string, IList>(original.variableValues);
    4142      variableNames = new List<string>(original.variableNames);
    4243      rows = original.rows;
    4344    }
     45
    4446    public override IDeepCloneable Clone(Cloner cloner) { return new Dataset(this, cloner); }
    4547
     
    5860    /// <param name="variableValues">The values for the variables (column-oriented storage). Values are not cloned!</param>
    5961    public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues)
    60       : base() {
     62      : this(variableNames, variableValues, cloneValues: true) {
     63    }
     64
     65    protected Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) {
    6166      Name = "-";
    62       if (!variableNames.Any()) {
     67
     68      if (variableNames.Any()) {
     69        this.variableNames = new List<string>(variableNames);
     70      } else {
    6371        this.variableNames = Enumerable.Range(0, variableValues.Count()).Select(x => "Column " + x).ToList();
    64       } else if (variableNames.Count() != variableValues.Count()) {
    65         throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues");
    66       } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) {
    67         throw new ArgumentException("The number of values must be equal for every variable");
    68       } else if (variableNames.Distinct().Count() != variableNames.Count()) {
    69         var duplicateVariableNames =
    70           variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList();
    71         string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine;
    72         foreach (var duplicateVariableName in duplicateVariableNames)
    73           message += duplicateVariableName + Environment.NewLine;
    74         throw new ArgumentException(message);
    75       }
     72      }
     73      // check if the arguments are consistent (no duplicate variables, same number of rows, correct data types, ...)
     74      CheckArguments(this.variableNames, variableValues);
     75
    7676      rows = variableValues.First().Count;
    77       this.variableNames = new List<string>(variableNames);
    78       this.variableValues = new Dictionary<string, IList>(this.variableNames.Count);
    79       for (int i = 0; i < this.variableNames.Count; i++) {
    80         var values = variableValues.ElementAt(i);
    81         this.variableValues.Add(this.variableNames[i], values);
     77
     78      if (cloneValues) {
     79        this.variableValues = CloneValues(this.variableNames, variableValues);
     80      } else {
     81        this.variableValues = new Dictionary<string, IList>(this.variableNames.Count);
     82        for (int i = 0; i < this.variableNames.Count; i++) {
     83          var variableName = this.variableNames[i];
     84          var values = variableValues.ElementAt(i);
     85          this.variableValues.Add(variableName, values);
     86        }
    8287      }
    8388    }
     
    125130      return new ModifiableDataset(variableNames, values);
    126131    }
     132
    127133    /// <summary>
    128134    /// Shuffle a dataset's rows
     
    231237      return new ReadOnlyCollection<DateTime>(values);
    232238    }
    233 
    234 
    235239    private IEnumerable<T> GetValues<T>(string variableName, IEnumerable<int> rows) {
    236240      var values = GetValues<T>(variableName);
     
    247251    public bool VariableHasType<T>(string variableName) {
    248252      return variableValues[variableName] is IList<T>;
     253    }
     254    protected Type GetVariableType(string variableName) {
     255      IList list;
     256      variableValues.TryGetValue(variableName, out list);
     257      if (list == null)
     258        throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");
     259      return GetElementType(list);
     260    }
     261    protected static Type GetElementType(IList list) {
     262      var type = list.GetType();
     263      return type.IsGenericType ? type.GetGenericArguments()[0] : type.GetElementType();
     264    }
     265    protected static bool IsAllowedType(IList list) {
     266      var type = GetElementType(list);
     267      return IsAllowedType(type);
     268    }
     269    protected static bool IsAllowedType(Type type) {
     270      return type == typeof(double) || type == typeof(string) || type == typeof(DateTime);
     271    }
     272
     273    protected static void CheckArguments(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) {
     274      if (variableNames.Count() != variableValues.Count()) {
     275        throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues");
     276      } else if (!variableValues.All(list => list.Count == variableValues.First().Count)) {
     277        throw new ArgumentException("The number of values must be equal for every variable");
     278      } else if (variableNames.Distinct().Count() != variableNames.Count()) {
     279        var duplicateVariableNames =
     280          variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList();
     281        string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine;
     282        foreach (var duplicateVariableName in duplicateVariableNames)
     283          message += duplicateVariableName + Environment.NewLine;
     284        throw new ArgumentException(message);
     285      }
     286      // check if all the variables are supported
     287      foreach (var t in variableNames.Zip(variableValues, Tuple.Create)) {
     288        var variableName = t.Item1;
     289        var values = t.Item2;
     290
     291        if (!IsAllowedType(values)) {
     292          throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));
     293        }
     294      }
     295    }
     296
     297    protected static Dictionary<string, IList> CloneValues(Dictionary<string, IList> variableValues) {
     298      return variableValues.ToDictionary(x => x.Key, x => CloneValues(x.Value));
     299    }
     300
     301    protected static Dictionary<string, IList> CloneValues(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) {
     302      return variableNames.Zip(variableValues, Tuple.Create).ToDictionary(x => x.Item1, x => CloneValues(x.Item2));
     303    }
     304
     305    protected static IList CloneValues(IList values) {
     306      var doubleValues = values as IList<double>;
     307      if (doubleValues != null) return new List<double>(doubleValues);
     308
     309      var stringValues = values as IList<string>;
     310      if (stringValues != null) return new List<string>(stringValues);
     311
     312      var dateTimeValues = values as IList<DateTime>;
     313      if (dateTimeValues != null) return new List<DateTime>(dateTimeValues);
     314
     315      throw new ArgumentException(string.Format("Unsupported variable type {0}.", GetElementType(values)));
    249316    }
    250317
  • stable/HeuristicLab.Problems.DataAnalysis/3.4/ModifiableDataset.cs

    r15584 r16124  
    3939
    4040    private ModifiableDataset(ModifiableDataset original, Cloner cloner) : base(original, cloner) {
    41       var variables = variableValues.Keys.ToList();
    42       foreach (var v in variables) {
    43         var type = GetVariableType(v);
    44         if (type == typeof(DateTime)) {
    45           variableValues[v] = GetDateTimeValues(v).ToList();
    46         } else if (type == typeof(double)) {
    47           variableValues[v] = GetDoubleValues(v).ToList();
    48         } else if (type == typeof(string)) {
    49           variableValues[v] = GetStringValues(v).ToList();
    50         } else {
    51           throw new ArgumentException("Unsupported type " + type + " for variable " + v);
    52         }
    53       }
    54     }
     41      variableNames = new List<string>(original.variableNames);
     42      variableValues = CloneValues(original.variableValues);
     43    }
     44
    5545    public override IDeepCloneable Clone(Cloner cloner) { return new ModifiableDataset(this, cloner); }
    56     public ModifiableDataset() : base() { }
    57 
    58     public ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) : base(variableNames, variableValues) { }
     46
     47    public ModifiableDataset() { }
     48
     49    public ModifiableDataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) :
     50      base(variableNames, variableValues, cloneValues: false) { }
    5951
    6052    public void ReplaceRow(int row, IEnumerable<object> values) {
     
    10597
    10698    // adds a new variable to the dataset
    107     public void AddVariable<T>(string variableName, IEnumerable<T> values) {
     99    public void AddVariable(string variableName, IList values) {
    108100      if (variableValues.ContainsKey(variableName))
    109         throw new ArgumentException("Variable " + variableName + " is already present in the dataset.");
    110       int count = values.Count();
    111       if (count != rows)
    112         throw new ArgumentException("The number of values must exactly match the number of rows in the dataset.");
    113       variableValues[variableName] = new List<T>(values);
     101        throw new ArgumentException(string.Format("Variable {0} is already present in the dataset.", variableName));
     102
     103      if (values == null || values.Count == 0)
     104        throw new ArgumentException("Cannot add variable with no values.");
     105
     106      if (!IsAllowedType(values))
     107        throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(values), variableName));
     108
     109      variableValues[variableName] = values;
    114110      variableNames.Add(variableName);
     111
    115112      OnColumnsChanged();
    116113      OnColumnNamesChanged();
     
    120117    public void RemoveVariable(string variableName) {
    121118      if (!variableValues.ContainsKey(variableName))
    122         throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");
     119        throw new ArgumentException(string.Format("The variable {0} does not exist in the dataset.", variableName));
    123120      variableValues.Remove(variableName);
    124121      variableNames.Remove(variableName);
     
    128125    }
    129126
    130     // slow, avoid to use this
     127    // slow, avoid using this
    131128    public void RemoveRow(int row) {
    132129      foreach (var list in variableValues.Values)
     
    151148    }
    152149
    153     private Type GetVariableType(string variableName) {
    154       IList list;
    155       variableValues.TryGetValue(variableName, out list);
    156       if (list == null)
    157         throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");
    158       return list.GetType().GetGenericArguments()[0];
    159     }
    160 
    161150    bool IStringConvertibleMatrix.SetValue(string value, int rowIndex, int columnIndex) {
    162151      var variableName = variableNames[columnIndex];
Note: See TracChangeset for help on using the changeset viewer.