#region License Information
/* HeuristicLab
 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL)
 *
 * This file is part of HeuristicLab.
 *
 * HeuristicLab is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * HeuristicLab is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
 */
#endregion

using System;
using System.Collections;
using System.Collections.Generic;
using System.Collections.ObjectModel;
using System.Linq;
using HEAL.Attic;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;

using DoubleVector = MathNet.Numerics.LinearAlgebra.Vector<double>;

namespace HeuristicLab.Problems.DataAnalysis {
  /// <summary>
  /// A dataset that stores its variables column-wise: every variable name maps to one
  /// <see cref="IList"/> of values. Supported element types are <see cref="double"/>,
  /// <see cref="string"/>, <see cref="DateTime"/> and <c>DoubleVector</c>.
  /// </summary>
  [Item("Dataset", "Represents a dataset containing data that should be analyzed.")]
  [StorableType("49F4D145-50D7-4497-8D8A-D190CD556CC8")]
  public class Dataset : NamedItem, IDataset {
    [StorableConstructor]
    protected Dataset(StorableConstructorFlag _) : base(_) { }

    /// <summary>
    /// Cloning constructor. Creates a new name list and a new dictionary, but the column
    /// lists themselves are shared with <paramref name="original"/>.
    /// </summary>
    protected Dataset(Dataset original, Cloner cloner)
      : base(original, cloner) {
      // no need to clone the variable values because these can't be modified
      variableValues = new Dictionary<string, IList>(original.variableValues);
      variableNames = new List<string>(original.variableNames);
      rows = original.rows;
    }

    public override IDeepCloneable Clone(Cloner cloner) { return new Dataset(this, cloner); }

    /// <summary>
    /// Creates an empty dataset (no variables, zero rows) named "-".
    /// </summary>
    public Dataset()
      : base() {
      Name = "-";
      VariableNames = Enumerable.Empty<string>();
      variableValues = new Dictionary<string, IList>();
      rows = 0;
    }

    /// <summary>
    /// Creates a new dataset. The variableValues are cloned, the dataset stores its own copy of every column.
    /// </summary>
    /// <param name="variableNames">The names of the variables in the dataset. If empty, the names "Column 0", "Column 1", ... are generated.</param>
    /// <param name="variableValues">The values for the variables (column-oriented storage). Values are cloned.</param>
    public Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues)
      : this(variableNames, variableValues, cloneValues: true) {
    }

    /// <summary>
    /// Creates a new dataset from column-oriented values.
    /// </summary>
    /// <param name="variableNames">The names of the variables in the dataset. If empty, the names "Column 0", "Column 1", ... are generated.</param>
    /// <param name="variableValues">The values for the variables (column-oriented storage).</param>
    /// <param name="cloneValues">If true every column is copied; otherwise the given lists are stored by reference.</param>
    /// <exception cref="ArgumentException">Thrown by <see cref="CheckArguments"/> if names and columns are inconsistent.</exception>
    protected Dataset(IEnumerable<string> variableNames, IEnumerable<IList> variableValues, bool cloneValues = false) {
      Name = "-";

      if (variableNames.Any()) {
        this.variableNames = new List<string>(variableNames);
      } else {
        this.variableNames = null;
      }

      // materialize once: the columns are counted, validated and accessed by index below
      var columns = variableValues as IList<IList> ?? variableValues.ToList();
      if (this.variableNames == null) {
        this.variableNames = Enumerable.Range(0, columns.Count).Select(x => "Column " + x).ToList();
      }

      // check if the arguments are consistent (no duplicate variables, same number of rows, correct data types, ...)
      CheckArguments(this.variableNames, columns);

      // a dataset without any column has zero rows
      rows = columns.Count > 0 ? columns[0].Count : 0;

      if (cloneValues) {
        this.variableValues = CloneValues(this.variableNames, columns);
      } else {
        this.variableValues = new Dictionary<string, IList>(this.variableNames.Count);
        for (int i = 0; i < this.variableNames.Count; i++) {
          this.variableValues.Add(this.variableNames[i], columns[i]);
        }
      }
    }

    /// <summary>
    /// Creates a new dataset of double variables from a matrix whose first dimension
    /// is the row and whose second dimension is the column (variable).
    /// </summary>
    /// <param name="variableNames">One name per matrix column.</param>
    /// <param name="variableValues">The values, indexed as [row, column]. The values are copied.</param>
    /// <exception cref="ArgumentException">The number of names differs from the number of columns, or names are duplicated.</exception>
    public Dataset(IEnumerable<string> variableNames, double[,] variableValues) {
      Name = "-";

      // copy the names once instead of enumerating the argument repeatedly
      var names = new List<string>(variableNames);
      int rowCount = variableValues.GetLength(0);
      int columnCount = variableValues.GetLength(1);

      if (names.Count != columnCount) {
        throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues");
      }
      CheckForDuplicateVariableNames(names);

      rows = rowCount;
      this.variableNames = names;
      this.variableValues = new Dictionary<string, IList>(columnCount);
      for (int col = 0; col < columnCount; col++) {
        var values = new List<double>(rowCount);
        for (int row = 0; row < rowCount; row++) {
          values.Add(variableValues[row, col]);
        }
        this.variableValues.Add(names[col], values);
      }
    }

    /// <summary>
    /// Creates a dataset of double variables from a matrix indexed as [row, column].
    /// </summary>
    /// <param name="variableNames">The names of the variables (one per matrix column).</param>
    /// <param name="data">The values, indexed as [row, column].</param>
    public static Dataset FromRowData(IEnumerable<string> variableNames, double[,] data) {
      int rowCount = data.GetLength(0);
      int columnCount = data.GetLength(1);

      var colWise = new List<IList>(columnCount);
      for (var col = 0; col < columnCount; col++) {
        var column = new List<double>(rowCount);
        for (var row = 0; row < rowCount; row++) {
          column.Add(data[row, col]);
        }
        colWise.Add(column);
      }
      return new Dataset(variableNames, colWise);
    }

    /// <summary>
    /// Creates a dataset from row-wise data. The first row determines the number of
    /// columns and the element type of every column.
    /// </summary>
    /// <param name="variableNames">The names of the variables; must match the number of values per row.</param>
    /// <param name="data">The rows. Every row must contain the same number of values.</param>
    /// <exception cref="ArgumentException">
    /// The data is empty, the number of values does not match the number of names,
    /// rows differ in length, the first row contains null or a value of an unsupported type,
    /// or a later value cannot be added to its column.
    /// </exception>
    public static Dataset FromRowData(IEnumerable<string> variableNames, IEnumerable<IList> data) {
      var vnames = variableNames.ToList();
      var transposed = new List<IList>();

      // 'using' releases the enumerator even if one of the validations below throws
      using (var iter = data.GetEnumerator()) {
        if (!iter.MoveNext()) throw new ArgumentException("Data does not contain any rows", nameof(data));

        var firstRow = iter.Current;
        for (var i = 0; i < firstRow.Count; i++) {
          if (i >= vnames.Count) throw new ArgumentException("There are more variables in data, than variable names.", nameof(variableNames));

          var value = firstRow[i];
          if (value == null) throw new ArgumentException("Null values are not supported.", nameof(data));

          // Dispatch on the value instead of comparing value.GetType() for equality:
          // the runtime type of a vector is a subclass of DoubleVector and would never be equal to it.
          switch (value) {
            case double d:
              transposed.Add(new List<double>() { d });
              break;
            case DateTime dt:
              transposed.Add(new List<DateTime>() { dt });
              break;
            case string s:
              transposed.Add(new List<string>() { s });
              break;
            case DoubleVector dv:
              transposed.Add(new List<DoubleVector>() { dv });
              break;
            default:
              throw new ArgumentException("Data contains types that are not allowed.", nameof(data));
          }
        }
        if (transposed.Count < vnames.Count) throw new ArgumentException("There are less variables in data, than variable names.", nameof(variableNames));

        while (iter.MoveNext()) {
          var row = iter.Current;
          // reject ragged rows here instead of failing with an index error (too long)
          // or with an unrelated message during dataset construction (too short)
          if (row.Count != transposed.Count)
            throw new ArgumentException("All rows must contain the same number of values.", nameof(data));

          for (var i = 0; i < row.Count; i++) {
            if (transposed[i].Add(row[i]) < 0)
              throw new ArgumentException(string.Format("Variable {0} has invalid value ({1})", vnames[i], row[i]), nameof(data));
          }
        }
      }
      return new Dataset(vnames, transposed);
    }

    /// <summary>
    /// Creates a <see cref="ModifiableDataset"/> from the variable names and columns of this dataset.
    /// </summary>
    public ModifiableDataset ToModifiable() {
      return new ModifiableDataset(variableNames, variableNames.Select(v => variableValues[v]), true);
    }

    /// <summary>
    /// Shuffle a dataset's rows
    /// </summary>
    /// <param name="random">Random number generator used for shuffling.</param>
    /// <returns>A shuffled copy of the current dataset.</returns>
    public Dataset Shuffle(IRandom random) {
      var values = variableNames.Select(x => variableValues[x]).ToList();
      return new Dataset(variableNames, values.ShuffleLists(random));
    }

    #region Backwards compatible code, remove with 3.5
    private double[,] storableData;
    //name alias used to support backwards compatibility
    [Storable(OldName = "data")]
    private double[,] StorableData { set { storableData = value; } }

    /// <summary>
    /// If no column dictionary was restored, rebuilds the columns (as lists of double)
    /// from the legacy [row, column] matrix and then drops the matrix.
    /// </summary>
    [StorableHook(HookType.AfterDeserialization)]
    private void AfterDeserialization() {
      if (variableValues == null) {
        rows = storableData.GetLength(0);
        variableValues = new Dictionary<string, IList>();
        for (int col = 0; col < storableData.GetLength(1); col++) {
          string columnName = variableNames[col];
          var values = new List<double>(rows);
          for (int row = 0; row < rows; row++) {
            values.Add(storableData[row, col]);
          }
          variableValues.Add(columnName, values);
        }
        storableData = null;
      }
    }
    #endregion

    [Storable(Name = "VariableValues")]
    protected Dictionary<string, IList> variableValues;

    protected List<string> variableNames;

    /// <summary>
    /// The variable names in column order. The setter copies the given names and may only
    /// be used once; it throws <see cref="InvalidOperationException"/> if names are already set.
    /// </summary>
    [Storable]
    public IEnumerable<string> VariableNames {
      get { return variableNames; }
      protected set {
        if (variableNames != null) throw new InvalidOperationException();
        variableNames = new List<string>(value);
      }
    }

    public bool ContainsVariable(string variableName) {
      return variableValues.ContainsKey(variableName);
    }

    /// <summary>Names of all variables whose column is an <see cref="IList{T}"/> of double.</summary>
    public IEnumerable<string> DoubleVariables {
      get { return variableValues.Where(p => p.Value is IList<double>).Select(p => p.Key); }
    }

    /// <summary>Names of all variables whose column is an <see cref="IList{T}"/> of string.</summary>
    public IEnumerable<string> StringVariables {
      get { return variableValues.Where(p => p.Value is IList<string>).Select(p => p.Key); }
    }

    /// <summary>Names of all variables whose column is an <see cref="IList{T}"/> of DateTime.</summary>
    public IEnumerable<string> DateTimeVariables {
      get { return variableValues.Where(p => p.Value is IList<DateTime>).Select(p => p.Key); }
    }

    /// <summary>Names of all variables whose column is an <see cref="IList{T}"/> of DoubleVector.</summary>
    public IEnumerable<string> DoubleVectorVariables {
      get { return variableValues.Where(p => p.Value is IList<DoubleVector>).Select(p => p.Key); }
    }

    public IEnumerable<double> GetDoubleValues(string variableName) {
      return GetValues<double>(variableName);
    }
    public IEnumerable<string> GetStringValues(string variableName) {
      return GetValues<string>(variableName);
    }
    public IEnumerable<DateTime> GetDateTimeValues(string variableName) {
      return GetValues<DateTime>(variableName);
    }
    public IEnumerable<DoubleVector> GetDoubleVectorValues(string variableName) {
      return GetValues<DoubleVector>(variableName);
    }

    public ReadOnlyCollection<double> GetReadOnlyDoubleValues(string variableName) {
      var values = GetValues<double>(variableName);
      return new ReadOnlyCollection<double>(values);
    }
    public double GetDoubleValue(string variableName, int row) {
      var values = GetValues<double>(variableName);
      return values[row];
    }
    public IEnumerable<double> GetDoubleValues(string variableName, IEnumerable<int> rows) {
      return GetValues<double>(variableName, rows);
    }

    public string GetStringValue(string variableName, int row) {
      var values = GetValues<string>(variableName);
      return values[row];
    }
    public IEnumerable<string> GetStringValues(string variableName, IEnumerable<int> rows) {
      return GetValues<string>(variableName, rows);
    }
    public ReadOnlyCollection<string> GetReadOnlyStringValues(string variableName) {
      var values = GetValues<string>(variableName);
      return new ReadOnlyCollection<string>(values);
    }

    public DateTime GetDateTimeValue(string variableName, int row) {
      var values = GetValues<DateTime>(variableName);
      return values[row];
    }
    public IEnumerable<DateTime> GetDateTimeValues(string variableName, IEnumerable<int> rows) {
      return GetValues<DateTime>(variableName, rows);
    }
    public ReadOnlyCollection<DateTime> GetReadOnlyDateTimeValues(string variableName) {
      var values = GetValues<DateTime>(variableName);
      return new ReadOnlyCollection<DateTime>(values);
    }

    public DoubleVector GetDoubleVectorValue(string variableName, int row) {
      var values = GetValues<DoubleVector>(variableName);
      return values[row];
    }
    public IEnumerable<DoubleVector> GetDoubleVectorValues(string variableName, IEnumerable<int> rows) {
      return GetValues<DoubleVector>(variableName, rows);
    }
    public ReadOnlyCollection<DoubleVector> GetReadOnlyDoubleVectorValues(string variableName) {
      var values = GetValues<DoubleVector>(variableName);
      return new ReadOnlyCollection<DoubleVector>(values);
    }

    /// <summary>
    /// Returns the values of a variable at the given row indices, in the order of <paramref name="rows"/>.
    /// The variable lookup happens immediately, the selection of rows is evaluated lazily.
    /// </summary>
    private IEnumerable<T> GetValues<T>(string variableName, IEnumerable<int> rows) {
      var values = GetValues<T>(variableName);
      return rows.Select(x => values[x]);
    }

    /// <summary>
    /// Returns the column of a variable as a typed list.
    /// </summary>
    /// <exception cref="ArgumentException">The variable does not exist or its column is not an <see cref="IList{T}"/> of <typeparamref name="T"/>.</exception>
    private IList<T> GetValues<T>(string variableName) {
      IList list;
      if (!variableValues.TryGetValue(variableName, out list))
        throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");
      IList<T> values = list as IList<T>;
      if (values == null) throw new ArgumentException("The variable " + variableName + " is not a " + typeof(T) + " variable.");
      return values;
    }

    public bool VariableHasType<T>(string variableName) {
      return variableValues[variableName] is IList<T>;
    }

    /// <summary>
    /// Returns the element type of the column of the given variable.
    /// </summary>
    /// <exception cref="ArgumentException">The variable does not exist in the dataset.</exception>
    protected Type GetVariableType(string variableName) {
      IList list;
      variableValues.TryGetValue(variableName, out list);
      if (list == null)
        throw new ArgumentException("The variable " + variableName + " does not exist in the dataset.");
      return GetElementType(list);
    }

    /// <summary>
    /// Returns the first generic type argument of a generic list type, otherwise the
    /// array element type as reported by <see cref="Type.GetElementType"/>.
    /// </summary>
    protected static Type GetElementType(IList list) {
      var type = list.GetType();
      return type.IsGenericType ? type.GetGenericArguments()[0] : type.GetElementType();
    }

    protected static bool IsAllowedType(IList list) {
      var type = GetElementType(list);
      return IsAllowedType(type);
    }

    /// <summary>
    /// Returns true if <paramref name="type"/> is exactly double, string, DateTime or DoubleVector.
    /// </summary>
    protected static bool IsAllowedType(Type type) {
      return type == typeof(double) || type == typeof(string) || type == typeof(DateTime) || type == typeof(DoubleVector);
    }

    /// <summary>
    /// Checks that names and columns are consistent: same number of names and columns,
    /// same number of values in every column, no duplicate names, only supported element types.
    /// </summary>
    /// <exception cref="ArgumentException">One of the checks failed.</exception>
    protected static void CheckArguments(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) {
      // materialize both sequences once; they are counted and traversed several times below
      var names = variableNames as IList<string> ?? variableNames.ToList();
      var columns = variableValues as IList<IList> ?? variableValues.ToList();

      if (names.Count != columns.Count) {
        throw new ArgumentException("Number of variable names doesn't match the number of columns of variableValues");
      }
      if (columns.Count > 0) {
        int expectedCount = columns[0].Count;
        if (columns.Any(list => list.Count != expectedCount)) {
          throw new ArgumentException("The number of values must be equal for every variable");
        }
      }
      CheckForDuplicateVariableNames(names);

      // check if all the variables are supported
      for (int i = 0; i < names.Count; i++) {
        if (!IsAllowedType(columns[i])) {
          throw new ArgumentException(string.Format("Unsupported type {0} for variable {1}.", GetElementType(columns[i]), names[i]));
        }
      }
    }

    /// <summary>
    /// Throws an <see cref="ArgumentException"/> that lists every name occurring more than once.
    /// </summary>
    private static void CheckForDuplicateVariableNames(IEnumerable<string> variableNames) {
      var duplicateVariableNames = variableNames.GroupBy(v => v).Where(g => g.Count() > 1).Select(g => g.Key).ToList();
      if (duplicateVariableNames.Count == 0) return;

      string message = "The dataset cannot contain duplicate variables names: " + Environment.NewLine;
      foreach (var duplicateVariableName in duplicateVariableNames)
        message += duplicateVariableName + Environment.NewLine;
      throw new ArgumentException(message);
    }

    protected static Dictionary<string, IList> CloneValues(Dictionary<string, IList> variableValues) {
      return variableValues.ToDictionary(x => x.Key, x => CloneValues(x.Value));
    }

    protected static Dictionary<string, IList> CloneValues(IEnumerable<string> variableNames, IEnumerable<IList> variableValues) {
      return variableNames.Zip(variableValues, Tuple.Create).ToDictionary(x => x.Item1, x => CloneValues(x.Item2));
    }

    /// <summary>
    /// Copies a column into a new list. Double, string and DateTime values are copied as they are;
    /// every DoubleVector element is cloned individually.
    /// </summary>
    /// <exception cref="ArgumentException">The list is not a typed list of one of the supported types.</exception>
    protected static IList CloneValues(IList values) {
      if (values is IList<double> doubleValues) return new List<double>(doubleValues);

      if (values is IList<string> stringValues) return new List<string>(stringValues);

      if (values is IList<DateTime> dateTimeValues) return new List<DateTime>(dateTimeValues);

      if (values is IList<DoubleVector> doubleVectorValues) return doubleVectorValues.Select(x => x.Clone()).ToList();

      throw new ArgumentException(string.Format("Unsupported variable type {0}.", GetElementType(values)));
    }

    #region IStringConvertibleMatrix Members
    [Storable]
    private int rows;
    public int Rows {
      get { return rows; }
      protected set { rows = value; }
    }
    int IStringConvertibleMatrix.Rows {
      get { return Rows; }
      set { throw new NotSupportedException(); }
    }

    public int Columns {
      get { return variableNames.Count; }
    }
    int IStringConvertibleMatrix.Columns {
      get { return Columns; }
      set { throw new NotSupportedException(); }
    }
    bool IStringConvertibleMatrix.SortableView {
      get { return false; }
      set { throw new NotSupportedException(); }
    }
    bool IStringConvertibleMatrix.ReadOnly {
      get { return true; }
    }
    IEnumerable<string> IStringConvertibleMatrix.ColumnNames {
      get { return this.VariableNames; }
      set { throw new NotSupportedException(); }
    }
    IEnumerable<string> IStringConvertibleMatrix.RowNames {
      get { return Enumerable.Empty<string>(); }
      set { throw new NotSupportedException(); }
    }

    /// <summary>
    /// Returns the text for one cell. DoubleVector values are rendered with
    /// <c>ToVectorString(10, 80)</c> inside square brackets, all other values with <c>ToString()</c>.
    /// </summary>
    string IStringConvertibleMatrix.GetValue(int rowIndex, int columnIndex) {
      var value = variableValues[variableNames[columnIndex]][rowIndex];
      if (value is DoubleVector vector) {
        return $"[{vector.ToVectorString(10, 80)}]";
      }
      return value.ToString();
    }

    bool IStringConvertibleMatrix.SetValue(string value, int rowIndex, int columnIndex) {
      throw new NotSupportedException();
    }
    bool IStringConvertibleMatrix.Validate(string value, out string errorMessage) {
      throw new NotSupportedException();
    }

    // The accessors of the following events are intentionally empty:
    // this class neither stores the handlers nor raises the events.
    public virtual event EventHandler ColumnsChanged { add { } remove { } }
    public virtual event EventHandler RowsChanged { add { } remove { } }
    public virtual event EventHandler ColumnNamesChanged { add { } remove { } }
    public virtual event EventHandler RowNamesChanged { add { } remove { } }
    public virtual event EventHandler SortableViewChanged { add { } remove { } }
    public virtual event EventHandler<EventArgs<int, int>> ItemChanged { add { } remove { } }
    public virtual event EventHandler Reset { add { } remove { } }
    #endregion
  }
}