Free cookie consent management tool by TermsFeed Policy Generator

Changeset 15842


Ignore:
Timestamp:
03/15/18 10:41:20 (7 years ago)
Author:
gkronber
Message:

#2886: added clustering of functions and output of clusters, fixed bug in evaluation (the += and *= operator cases now accumulate into stack[topOfStack - 1] and decrement topOfStack)

Location:
branches/2886_SymRegGrammarEnumeration
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • branches/2886_SymRegGrammarEnumeration/ExpressionClustering/ExpressionClustering.csproj

    r15840 r15842  
    3838      <HintPath>..\..\..\trunk\bin\ALGLIB-3.7.0.dll</HintPath>
    3939    </Reference>
     40    <Reference Include="HeuristicLab.Analysis-3.3">
     41      <HintPath>..\..\..\trunk\bin\HeuristicLab.Analysis-3.3.dll</HintPath>
     42    </Reference>
     43    <Reference Include="HeuristicLab.Analysis.Views-3.3">
     44      <HintPath>..\..\..\trunk\bin\HeuristicLab.Analysis.Views-3.3.dll</HintPath>
     45    </Reference>
     46    <Reference Include="HeuristicLab.Collections-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
     47      <SpecificVersion>False</SpecificVersion>
     48      <HintPath>..\..\..\trunk\bin\HeuristicLab.Collections-3.3.dll</HintPath>
     49    </Reference>
    4050    <Reference Include="HeuristicLab.Common-3.3">
    4151      <HintPath>..\..\..\trunk\bin\HeuristicLab.Common-3.3.dll</HintPath>
     52    </Reference>
     53    <Reference Include="HeuristicLab.Core-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec" />
     54    <Reference Include="HeuristicLab.Core.Views-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
     55      <SpecificVersion>False</SpecificVersion>
     56      <HintPath>..\..\..\trunk\bin\HeuristicLab.Core.Views-3.3.dll</HintPath>
     57    </Reference>
     58    <Reference Include="HeuristicLab.MainForm-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
     59      <SpecificVersion>False</SpecificVersion>
     60      <HintPath>..\..\..\trunk\bin\HeuristicLab.MainForm-3.3.dll</HintPath>
     61    </Reference>
     62    <Reference Include="HeuristicLab.MainForm.WindowsForms-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
     63      <SpecificVersion>False</SpecificVersion>
     64      <HintPath>..\..\..\trunk\bin\HeuristicLab.MainForm.WindowsForms-3.3.dll</HintPath>
    4265    </Reference>
    4366    <Reference Include="HeuristicLab.Problems.DataAnalysis-3.4">
    4467      <HintPath>..\..\..\trunk\bin\HeuristicLab.Problems.DataAnalysis-3.4.dll</HintPath>
    4568    </Reference>
     69    <Reference Include="HeuristicLab.Visualization.ChartControlsExtensions-3.3, Version=3.3.0.0, Culture=neutral, PublicKeyToken=ba48961d6f65dcec, processorArchitecture=MSIL">
     70      <SpecificVersion>False</SpecificVersion>
     71      <HintPath>..\..\..\trunk\bin\HeuristicLab.Visualization.ChartControlsExtensions-3.3.dll</HintPath>
     72    </Reference>
    4673    <Reference Include="System" />
    4774    <Reference Include="System.Core" />
     75    <Reference Include="System.Drawing" />
     76    <Reference Include="System.Windows" />
     77    <Reference Include="System.Windows.Forms" />
    4878    <Reference Include="System.Xml.Linq" />
    4979    <Reference Include="System.Data.DataSetExtensions" />
  • branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Flann.cs

    r15841 r15842  
    8686      public int checks;                /* how many leafs (features) to check in one search */
    8787      public float cb_index;            /* cluster boundary index. Used when searching the kmeans tree */
    88       public float eps; 
     88      public float eps;
    8989
    9090      /*  kdtree index parameters */
     
    153153    [DllImport("flann-1.7.1.dll")]
    154154    public static extern int flann_compute_cluster_centers(float[] dataset, int rows, int cols, int clusters, float[] result, ref FLANNParameters flann_params);
     155
     156    public static int FindClusters(List<double[]> dataset, out List<int> results, out List<double> distances, int nClusters) {
     157      var _nRows = dataset.Count;
     158      var _dists = new float[_nRows];
     159      var _result = new int[_nRows];
     160      var _dim = dataset.First().Length;
     161      FLANNParameters p = DEFAULT_FLANN_PARAMETERS;
     162      p.algorithm = flann_algorithm_t.FLANN_INDEX_LINEAR;
     163      p.centers_init = flann_centers_init_t.FLANN_CENTERS_RANDOM;
     164      p.target_precision = 0.9f;
     165      p.log_level = flann_log_level_t.FLANN_LOG_INFO;
     166      // copy training set
     167      var _ds = new float[dataset.Count * _dim];
     168      var i = 0;
     169      foreach (var e in dataset) {
     170        for (int d = 0; d < _dim; d++) {
     171          _ds[i++] = (float)e[d];
     172        }
     173      }
     174
     175      flann_set_distance_type(flann_distance_t.FLANN_DIST_EUCLIDEAN, 0);
     176
     177      float[] centers = new float[nClusters * _dim];
     178      int actualClusters = flann_compute_cluster_centers(_ds, rows: dataset.Count, cols: _dim, clusters: nClusters, result: centers, flann_params: ref p);
     179
     180      var res = flann_find_nearest_neighbors(centers, actualClusters, _dim, _ds, _nRows, _result, _dists, 1, ref p);
     181
     182
     183      distances = _dists.Select(fi => (double)fi).ToList();
     184      results = _result.ToList();
     185      return res;
     186    }
    155187
    156188    public static int FindNearestNeighbours(List<double[]> dataset, List<double[]> queryset, out List<int> results, out List<double> distances, int nearestNeighbours = 3) {
     
    176208      flann_set_distance_type(flann_distance_t.FLANN_DIST_EUCLIDEAN, 0);
    177209
    178       int nClusters = 100;
    179       float[] centers = new float[nClusters * _dim];
    180       flann_compute_cluster_centers(_ds, rows: dataset.Count, cols: _dim, clusters: nClusters, result: centers, flann_params: ref p);
    181 
    182       float speedup = -1.0f;
     210      // int nClusters = 100;
     211      // float[] centers = new float[nClusters * _dim];
     212      // flann_compute_cluster_centers(_ds, rows: dataset.Count, cols: _dim, clusters: nClusters, result: centers, flann_params: ref p);
     213
     214
     215      // for each point in the training set find the nearest cluster
     216
     217      // float speedup = -1.0f;
    183218      // _ds must be a rows × cols matrix stored in row-major order (one feature on each row)                         
    184219      //var index = flann_build_index(_ds, rows: dataset.Count, cols: _dim, speedup: ref speedup, flann_params: ref p);
    185      
     220
    186221
    187222      // copy testset
    188223      var _testset = new float[_tRows * _dim];
    189224      i = 0;
    190       for(int d = 0; d < _dim; d++) {
    191         foreach(var e in queryset) {
     225      for (int d = 0; d < _dim; d++) {
     226        foreach (var e in queryset) {
    192227          _testset[i++] = (float)e[d];
    193228        }
  • branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Program.cs

    r15840 r15842  
    11using System;
    22using System.Collections.Generic;
     3using System.Drawing;
    34using System.IO;
    45using System.Linq;
     6using HeuristicLab.Analysis;
     7using HeuristicLab.Analysis.Views;
     8using System.Windows.Forms;
    59
    610namespace ExpressionClustering {
    711  class Program {
    812    private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
     13    private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");
    914    private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences.csv");
    1015    private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences.csv");
    1116    private static readonly string outputFileName = Path.Combine(folder, "evaluations.csv.gz");
    12     private static readonly string columnDelimiter = ";";
    1317    private static int N = 100;
    1418    private static double[] evalBuf = new double[N];
     
    3135    static void Main(string[] args) {
    3236
    33       TestFLANN();
    34 
    3537      var hash2Sentences = new Dictionary<string, List<string>>();
     38      // for debugging only
     39      var postfix2infix = new Dictionary<string, string>();
    3640
    3741
     
    5256          }
    5357          ls.Add(toks[1]);
     58          postfix2infix.Add(toks[1], toks[0]);
    5459          nSentences++;
    5560        }
     
    5964      }
    6065
    61       using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress)))
    62         foreach (var kvp in hash2Sentences) {
    63           var ls = kvp.Value;
    64           var sentence = FindShortest(ls);
    65           Evaluate(sentence, xs, evalBuf);
    66           Add(writer, sentence, evalBuf);
    67         }
    68     }
    69 
    70     private static void TestFLANN() {
    71       var rand = new Random(1234);
    72       var dim = 100;
    73       var N = 10000;
    74       var dataset = new List<double[]>();
    75       for(int i=0;i<N;i++) {
    76         var x = new double[dim];
    77         for (int j = 0; j < dim; j++) x[j] = rand.NextDouble();
    78         dataset.Add(x);
    79       }
    80       List<int> nnResults;
    81       List<double> nnDists;
    82       Flann.FindNearestNeighbours(dataset, dataset, out nnResults, out nnDists);
    83 
    84     }
     66      List<double[]> functions = new List<double[]>();
     67      List<string> sentences = new List<string>();
     68      List<double> qualities = new List<double>();
     69
     70      foreach (var kvp in hash2Sentences) {
     71        var ls = kvp.Value;
     72        var sentence = FindShortest(ls);
     73        Evaluate(sentence, xs, evalBuf);
     74        if (evalBuf.Any(ei => float.IsInfinity((float)ei) || float.IsNaN((float)ei))) {
     75          Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence);
     76        } else {
     77          try {
     78            Scale(evalBuf, ys);
     79            functions.Add((double[])evalBuf.Clone());
     80            sentences.Add(sentence);
     81            HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
     82            qualities.Add(HeuristicLab.Problems.DataAnalysis.OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys, out error));
     83          } catch (ArgumentException e) {
     84            // scaling failed
     85          }
     86        }
     87      }
     88
     89      List<int> clusters;
     90      List<double> distances;
     91      Flann.FindClusters(functions, out clusters, out distances, 100);
     92     
     93      // output all clusters and functions
     94      using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) {
     95        for (int i = 0; i < functions.Count; i++) {
     96          writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], qualities[i], sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString())));
     97        }
     98      }
     99
     100      var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2);
     101      var dtView = new DataTableView();
     102      dtView.Size = new Size(800, 600);
     103
     104      foreach (var funCluster in funClusters) {
     105        // draw the functions for each cluster into a separate png
     106        var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average());
     107        var dt = new DataTable(dtName, dtName);
     108        var rows = new List<DataRow>();
     109        int i = 0;
     110        foreach (var fun in funCluster.Select(t => t.Item1)) {
     111          var name = i.ToString();
     112          var dr = new DataRow(name, name, fun);
     113          rows.Add(dr);
     114          i++;
     115        }
     116        dt.Rows.AddRange(rows);
     117        dtView.Content = dt;
     118        using (var bm = new Bitmap(800, 600)) {
     119          dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600));
     120          bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key)));
     121        }
     122      }
     123    }
     124
     125
    85126
    86127    private static string FindShortest(List<string> ls) {
     
    91132      return minElem;
    92133    }
     134
    93135
    94136
     
    102144      HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
    103145      if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
    104         alpha = 0.0;
    105         beta = 1.0;
    106         Console.WriteLine("WARNING: error in scaling");
     146        throw new ArgumentException();
    107147      }
    108148
     
    110150        evalBuf[i] = beta * evalBuf[i] + alpha;
    111151      }
    112 
    113 
    114       //
    115       //   var meanE = 0.0;
    116       //   var meanE2 = 0.0;
    117       //   var meanY = 0.0;
    118       //   var meanY2 = 0.0;
    119       //   for(int i=0;i<evalBuf.Length;i++) {
    120       //     var deltaE = evalBuf[i] - meanE;
    121       //     meanE +=  deltaE / (i+1);
    122       //     var deltaE2 = evalBuf[i] - meanE;
    123       //     meanE2 += deltaE * deltaE2;
    124       //
    125       //
    126       //     var deltaY = ys[i] - meanY;
    127       //     meanY += deltaY / (i + 1);
    128       //     var deltaY2 = ys[i] - meanY;
    129       //     meanY2 += deltaY * deltaY2;
    130       //     TODO COVARIANCE
    131       //     Linear Scaling: b = cov(y,e) / var(e); a = meanY - b*meanE;
    132       //   }
    133       //
    134       //   var varE = meanE2 / evalBuf.Length;
    135       //   var varY = meanY2 / evalBuf.Length;
    136       //   Console.WriteLine("{0} {1} {2} {3}", meanE, evalBuf.Average(), meanY, ys.Average());
    137       //   Console.WriteLine("{0} {1}", varE, varY);
    138       //
    139       //   var factor = varY / varE;
    140       //   for(int i=0;i<evalBuf.Length;i++) {
    141       //     evalBuf[i] = (evalBuf[i] - meanE) * factor + meanY;
    142       //   }
    143152    }
    144153
     
    158167              var b = stack[topOfStack - 1];
    159168              for (int i = 0; i < N; i++) {
    160                 a[i] += b[i];
    161               }
     169                b[i] += a[i];
     170              }
     171              topOfStack--;
    162172              break;
    163173            }
     
    167177              var b = stack[topOfStack - 1];
    168178              for (int i = 0; i < N; i++) {
    169                 a[i] *= b[i];
    170               }
     179                b[i] *= a[i];
     180              }
     181              topOfStack--;
    171182              break;
    172183            }
     
    231242    }
    232243    #endregion
    233 
    234 
    235 
    236     // add the line with it's evaluation result to a data structure for clustering
    237     private static void Add(StreamWriter writer, string line, double[] evalBuf) {
    238       var avg = evalBuf.Average();
    239       if (double.IsNaN(avg) || double.IsInfinity(avg)) {
    240         Console.WriteLine("skipping {0} {1}", evalBuf.Average(), line);
    241       } else {
    242         Scale(evalBuf, ys);
    243 
    244         writer.WriteLine(string.Join("\t", evalBuf.Select(ei => ei.ToString(System.Globalization.CultureInfo.InvariantCulture))));
    245       }
    246     }
    247244  }
    248245}
  • branches/2886_SymRegGrammarEnumeration/HeuristicLab.Algorithms.DataAnalysis.SymRegGrammarEnumeration/GrammarEnumeration/GrammarEnumerationAlgorithm.cs

    r15834 r15842  
    115115      Analyzers.CheckedItemsChanged += AnalyzersOnCheckedItemsChanged;
    116116      Analyzers.SetItemCheckedState(Analyzers.First(analyzer => analyzer is RSquaredEvaluator), true);
     117      Analyzers.SetItemCheckedState(Analyzers.First(analyzer => analyzer is SentenceLogger), true);
    117118    }
    118119
Note: See TracChangeset for help on using the changeset viewer.