Free cookie consent management tool by TermsFeed Policy Generator

Ignore:
Timestamp:
03/27/17 15:15:23 (7 years ago)
Author:
gkronber
Message:

#2700: changes made while reviewing

File:
1 moved

Legend:

Unmodified
Added
Removed
  • branches/TSNE/HeuristicLab.Algorithms.DataAnalysis/3.4/TSNE/TSNEAlgorithm.cs

    r14784 r14785  
    3838namespace HeuristicLab.Algorithms.DataAnalysis {
    3939  /// <summary>
    40   /// t-distributed stochastic neighbourhood embedding (TSNE) projects the data in a low dimensional
     40  /// t-distributed stochastic neighbourhood embedding (tSNE) projects the data in a low dimensional
    4141  /// space to allow visual cluster identification.
    4242  /// </summary>
    43   [Item("TSNE", "t-distributed stochastic neighbourhood embedding projects the data in a low " +
     43  [Item("tSNE", "t-distributed stochastic neighbourhood embedding projects the data in a low " +
    4444                "dimensional space to allow visual cluster identification.")]
    4545  [Creatable(CreatableAttribute.Categories.DataAnalysis, Priority = 100)]
    4646  [StorableClass]
    47   public sealed class TSNEAnalysis : BasicAlgorithm {
     47  public sealed class TSNEAlgorithm : BasicAlgorithm {
    4848    public override bool SupportsPause {
    4949      get { return false; }
     
    5757    }
    5858
    59     #region Parameternames
     59    #region parameter names
    6060    private const string DistanceParameterName = "DistanceFunction";
    6161    private const string PerplexityParameterName = "Perplexity";
     
    7474    #endregion
    7575
    76     #region Parameterproperties
     76    #region parameter properties
    7777    public IFixedValueParameter<DoubleValue> PerplexityParameter {
    7878      get { return Parameters[PerplexityParameterName] as IFixedValueParameter<DoubleValue>; }
    7979    }
    80     public OptionalValueParameter<DoubleValue> ThetaParameter {
    81       get { return Parameters[ThetaParameterName] as OptionalValueParameter<DoubleValue>; }
     80    public IFixedValueParameter<DoubleValue> ThetaParameter {
     81      get { return Parameters[ThetaParameterName] as IFixedValueParameter<DoubleValue>; }
    8282    }
    8383    public IFixedValueParameter<IntValue> NewDimensionsParameter {
    8484      get { return Parameters[NewDimensionsParameterName] as IFixedValueParameter<IntValue>; }
    8585    }
    86     public IValueParameter<IDistance<RealVector>> DistanceParameter {
    87       get { return Parameters[DistanceParameterName] as IValueParameter<IDistance<RealVector>>; }
     86    public IValueParameter<IDistance<double[]>> DistanceParameter {
     87      get { return Parameters[DistanceParameterName] as IValueParameter<IDistance<double[]>>; }
    8888    }
    8989    public IFixedValueParameter<IntValue> MaxIterationsParameter {
     
    120120
    121121    #region  Properties
    122     public IDistance<RealVector> Distance {
     122    public IDistance<double[]> Distance {
    123123      get { return DistanceParameter.Value; }
    124124    }
    125125    public double Perplexity {
    126126      get { return PerplexityParameter.Value.Value; }
     127      set { PerplexityParameter.Value.Value = value; }
    127128    }
    128129    public double Theta {
    129       get { return ThetaParameter.Value == null ? 0 : ThetaParameter.Value.Value; }
     130      get { return ThetaParameter.Value.Value; }
     131      set { ThetaParameter.Value.Value = value; }
    130132    }
    131133    public int NewDimensions {
    132134      get { return NewDimensionsParameter.Value.Value; }
     135      set { NewDimensionsParameter.Value.Value = value; }
    133136    }
    134137    public int MaxIterations {
    135138      get { return MaxIterationsParameter.Value.Value; }
     139      set { MaxIterationsParameter.Value.Value = value; }
    136140    }
    137141    public int StopLyingIteration {
    138142      get { return StopLyingIterationParameter.Value.Value; }
     143      set { StopLyingIterationParameter.Value.Value = value; }
    139144    }
    140145    public int MomentumSwitchIteration {
    141146      get { return MomentumSwitchIterationParameter.Value.Value; }
     147      set { MomentumSwitchIterationParameter.Value.Value = value; }
    142148    }
    143149    public double InitialMomentum {
    144150      get { return InitialMomentumParameter.Value.Value; }
     151      set { InitialMomentumParameter.Value.Value = value; }
    145152    }
    146153    public double FinalMomentum {
    147154      get { return FinalMomentumParameter.Value.Value; }
     155      set { FinalMomentumParameter.Value.Value = value; }
    148156    }
    149157    public double Eta {
    150       get {
    151         return EtaParameter.Value == null ? 0 : EtaParameter.Value.Value;
    152       }
     158      get { return EtaParameter.Value.Value; }
     159      set { EtaParameter.Value.Value = value; }
    153160    }
    154161    public bool SetSeedRandomly {
    155162      get { return SetSeedRandomlyParameter.Value.Value; }
    156     }
    157     public uint Seed {
    158       get { return (uint)SeedParameter.Value.Value; }
     163      set { SetSeedRandomlyParameter.Value.Value = value; }
     164    }
     165    public int Seed {
     166      get { return SeedParameter.Value.Value; }
     167      set { SeedParameter.Value.Value = value; }
    159168    }
    160169    public string Classes {
    161170      get { return ClassesParameter.Value.Value; }
     171      set { ClassesParameter.Value.Value = value; }
    162172    }
    163173    public bool Normalization {
    164174      get { return NormalizationParameter.Value.Value; }
     175      set { NormalizationParameter.Value.Value = value; }
    165176    }
    166177    [Storable]
    167     public TSNE<RealVector> tsne;
     178    public TSNE<double[]> tsne;
    168179    #endregion
    169180
    170181    #region Constructors & Cloning
    171182    [StorableConstructor]
    172     private TSNEAnalysis(bool deserializing) : base(deserializing) { }
    173     private TSNEAnalysis(TSNEAnalysis original, Cloner cloner) : base(original, cloner) { }
    174     public override IDeepCloneable Clone(Cloner cloner) { return new TSNEAnalysis(this, cloner); }
    175     public TSNEAnalysis() {
     183    private TSNEAlgorithm(bool deserializing) : base(deserializing) { }
     184    private TSNEAlgorithm(TSNEAlgorithm original, Cloner cloner) : base(original, cloner) { }
     185    public override IDeepCloneable Clone(Cloner cloner) { return new TSNEAlgorithm(this, cloner); }
     186    public TSNEAlgorithm() {
    176187      Problem = new RegressionProblem();
    177       Parameters.Add(new ValueParameter<IDistance<RealVector>>(DistanceParameterName, "The distance function used to differentiate similar from non-similar points", new EuclideanDistance()));
    178       Parameters.Add(new FixedValueParameter<DoubleValue>(PerplexityParameterName, "Perplexity-Parameter of TSNE. Comparable to k in a k-nearest neighbour algorithm. Recommended Value is Floor(number of points /3) or lower", new DoubleValue(25)));
    179       Parameters.Add(new OptionalValueParameter<DoubleValue>(ThetaParameterName, "Value describing how much appoximated gradients my differ from exact gradients. Set to 0 for exact calculation and in [0,1] otherwise \n CAUTION: exact calculation of forces requires building a non-sparse N*N matrix where N is the number of data points\n This may exceed memory limitations", new DoubleValue(0.1)));
    180       Parameters.Add(new FixedValueParameter<IntValue>(NewDimensionsParameterName, "Dimensionality of projected space (usually 2 for easy visual analysis", new IntValue(2)));
     188      Parameters.Add(new ValueParameter<IDistance<double[]>>(DistanceParameterName, "The distance function used to differentiate similar from non-similar points", new EuclideanDistance()));
     189      Parameters.Add(new FixedValueParameter<DoubleValue>(PerplexityParameterName, "Perplexity-Parameter of tSNE. Comparable to k in a k-nearest neighbour algorithm. Recommended value is floor(number of points /3) or lower", new DoubleValue(25)));
     190      Parameters.Add(new FixedValueParameter<DoubleValue>(ThetaParameterName, "Value describing how much appoximated gradients my differ from exact gradients. Set to 0 for exact calculation and in [0,1] otherwise \n CAUTION: exact calculation of forces requires building a non-sparse N*N matrix where N is the number of data points\n This may exceed memory limitations", new DoubleValue(0)));
     191      Parameters.Add(new FixedValueParameter<IntValue>(NewDimensionsParameterName, "Dimensionality of projected space (usually 2 for easy visual analysis)", new IntValue(2)));
    181192      Parameters.Add(new FixedValueParameter<IntValue>(MaxIterationsParameterName, "Maximum number of iterations for gradient descent", new IntValue(1000)));
    182193      Parameters.Add(new FixedValueParameter<IntValue>(StopLyingIterationParameterName, "Number of iterations after which p is no longer approximated", new IntValue(0)));
     
    184195      Parameters.Add(new FixedValueParameter<DoubleValue>(InitialMomentumParameterName, "The initial momentum in the gradient descent", new DoubleValue(0.5)));
    185196      Parameters.Add(new FixedValueParameter<DoubleValue>(FinalMomentumParameterName, "The final momentum", new DoubleValue(0.8)));
    186       Parameters.Add(new FixedValueParameter<DoubleValue>(EtaParameterName, "Gradient Descent learning rate", new DoubleValue(200)));
     197      Parameters.Add(new FixedValueParameter<DoubleValue>(EtaParameterName, "Gradient descent learning rate", new DoubleValue(200)));
    187198      Parameters.Add(new FixedValueParameter<BoolValue>(SetSeedRandomlyParameterName, "If the seed should be random", new BoolValue(true)));
    188199      Parameters.Add(new FixedValueParameter<IntValue>(SeedParameterName, "The seed used if it should not be random", new IntValue(0)));
    189       Parameters.Add(new FixedValueParameter<StringValue>(ClassesParameterName, "name of the column specifying the class lables of each data point. \n if the lable column can not be found Training/Test is used as labels", new StringValue("none")));
    190       Parameters.Add(new FixedValueParameter<BoolValue>(NormalizationParameterName, "Wether the data should be zero centered and have variance of 1 for each variable, so different scalings are ignored", new BoolValue(true)));
     200      Parameters.Add(new FixedValueParameter<StringValue>(ClassesParameterName, "name of the column specifying the class lables of each data point. \n if the lable column can not be found training/test is used as labels", new StringValue("none")));
     201      Parameters.Add(new FixedValueParameter<BoolValue>(NormalizationParameterName, "Whether the data should be zero centered and have variance of 1 for each variable, so different scalings are ignored", new BoolValue(true)));
    191202
    192203      MomentumSwitchIterationParameter.Hidden = true;
     
    200211    public override void Stop() {
    201212      base.Stop();
    202       if(tsne != null) tsne.Running = false;
     213      if (tsne != null) tsne.Running = false;
    203214    }
    204215
     
    208219      var problemData = Problem.ProblemData;
    209220
    210       //color datapoints acording to Classes-Variable (be it double or string)
    211       if(problemData.Dataset.VariableNames.Contains(Classes)) {
    212         if((problemData.Dataset as Dataset).VariableHasType<string>(Classes)) {
     221      //color datapoints acording to classes variable (be it double or string)
     222      if (problemData.Dataset.VariableNames.Contains(Classes)) {
     223        if ((problemData.Dataset as Dataset).VariableHasType<string>(Classes)) {
    213224          var classes = problemData.Dataset.GetStringValues(Classes).ToArray();
    214           for(var i = 0; i < classes.Length; i++) {
    215             if(!dataRowNames.ContainsKey(classes[i])) dataRowNames.Add(classes[i], new List<int>());
     225          for (var i = 0; i < classes.Length; i++) {
     226            if (!dataRowNames.ContainsKey(classes[i])) dataRowNames.Add(classes[i], new List<int>());
    216227            dataRowNames[classes[i]].Add(i);
    217228          }
    218         } else if((problemData.Dataset as Dataset).VariableHasType<double>(Classes)) {
     229        } else if ((problemData.Dataset as Dataset).VariableHasType<double>(Classes)) {
    219230          var classValues = problemData.Dataset.GetDoubleValues(Classes).ToArray();
    220           var max = classValues.Max() + 0.1;
     231          var max = classValues.Max() + 0.1;     // TODO consts
    221232          var min = classValues.Min() - 0.1;
    222233          const int contours = 8;
    223           for(var i = 0; i < contours; i++) {
     234          for (var i = 0; i < contours; i++) {
    224235            var contourname = GetContourName(i, min, max, contours);
    225236            dataRowNames.Add(contourname, new List<int>());
     
    228239            rows[contourname].VisualProperties.PointSize = i + 3;
    229240          }
    230           for(var i = 0; i < classValues.Length; i++) {
     241          for (var i = 0; i < classValues.Length; i++) {
    231242            dataRowNames[GetContourName(classValues[i], min, max, contours)].Add(i);
    232243          }
     
    237248      }
    238249
    239       //Set up and run TSNE
    240       if(SetSeedRandomly) SeedParameter.Value.Value = new System.Random().Next();
    241       var random = new MersenneTwister(Seed);
    242       tsne = new TSNE<RealVector>(Distance, random, Results, MaxIterations, StopLyingIteration, MomentumSwitchIteration, InitialMomentum, FinalMomentum, Eta, dataRowNames, rows);
     250      // set up and run tSNE
     251      if (SetSeedRandomly) Seed = new System.Random().Next();
     252      var random = new MersenneTwister((uint)Seed);
     253      tsne = new TSNE<double[]>(Distance, random, Results, MaxIterations, StopLyingIteration, MomentumSwitchIteration, InitialMomentum, FinalMomentum, Eta, dataRowNames, rows);
    243254      var dataset = problemData.Dataset;
    244255      var allowedInputVariables = problemData.AllowedInputVariables.ToArray();
    245       var data = new RealVector[dataset.Rows];
    246       for(var row = 0; row < dataset.Rows; row++) data[row] = new RealVector(allowedInputVariables.Select(col => dataset.GetDoubleValue(col, row)).ToArray());
    247       if(Normalization) data = NormalizeData(data);
     256      var data = new double[dataset.Rows][];
     257      for (var row = 0; row < dataset.Rows; row++) data[row] = allowedInputVariables.Select(col => dataset.GetDoubleValue(col, row)).ToArray();
     258      if (Normalization) data = NormalizeData(data);
    248259      tsne.Run(data, NewDimensions, Perplexity, Theta);
    249260    }
    250261
    251     private static RealVector[] NormalizeData(IReadOnlyList<RealVector> data) {
     262    private static double[][] NormalizeData(IReadOnlyList<double[]> data) {
    252263      var n = data[0].Length;
    253264      var mean = new double[n];
    254265      var sd = new double[n];
    255       var nData = new RealVector[data.Count];
    256       for(var i = 0; i < n; i++) {
     266      var nData = new double[data.Count][];
     267      for (var i = 0; i < n; i++) {
    257268        var i1 = i;
    258269        sd[i] = Enumerable.Range(0, data.Count).Select(x => data[x][i1]).StandardDeviation();
    259270        mean[i] = Enumerable.Range(0, data.Count).Select(x => data[x][i1]).Average();
    260271      }
    261       for(var i = 0; i < data.Count; i++) {
    262         nData[i] = new RealVector(n);
    263         for(var j = 0; j < n; j++) nData[i][j] = (data[i][j] - mean[j]) / sd[j];
     272      for (var i = 0; i < data.Count; i++) {
     273        nData[i] = new double[n];
     274        for (var j = 0; j < n; j++) nData[i][j] = (data[i][j] - mean[j]) / sd[j];
    264275      }
    265276      return nData;
Note: See TracChangeset for help on using the changeset viewer.