source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Program.cs @ 15842

Last change on this file since 15842 was 15842, checked in by gkronber, 4 years ago

#2886: added clustering of functions and output of clusters, fixed bug in evaluation

File size: 8.7 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Drawing;
4using System.IO;
5using System.Linq;
6using HeuristicLab.Analysis;
7using HeuristicLab.Analysis.Views;
8using System.Windows.Forms;
9
10namespace ExpressionClustering {
11  class Program {
12    private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
13    private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");
14    private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences.csv");
15    private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences.csv");
16    private static readonly string outputFileName = Path.Combine(folder, "evaluations.csv.gz");
17    private static int N = 100;
18    private static double[] evalBuf = new double[N];
19
20    // pagie-1 (univariate)
21    private static double min = -5.0;
22    private static double max = +5.0;
23    private static double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray(); // input
24    private static double[] ys = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray(); // target (necessary for scaling and clustering
25
26
27    public static int MAX_STACK = 20;
28    public static double[][] stack = new double[MAX_STACK][];
29    static Program() {
30      for (int i = 0; i < MAX_STACK; i++)
31        stack[i] = new double[N];
32    }
33
34    // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions
35    static void Main(string[] args) {
36
37      var hash2Sentences = new Dictionary<string, List<string>>();
38      // for debugging only
39      var postfix2infix = new Dictionary<string, string>();
40
41
42
43      // read all sentences and determine shortest sentences
44      using (var reader = new StreamReader(allSentencesFileName)) {
45        // read header
46        reader.ReadLine();
47        int nSentences = 0;
48        while (!reader.EndOfStream) {
49          var line = reader.ReadLine();
50          var toks = line.Split(';');
51          var hash = toks[2];
52          List<string> ls;
53          if (!hash2Sentences.TryGetValue(hash, out ls)) {
54            ls = new List<string>(1);
55            hash2Sentences.Add(hash, ls);
56          }
57          ls.Add(toks[1]);
58          postfix2infix.Add(toks[1], toks[0]);
59          nSentences++;
60        }
61
62        Console.WriteLine("{0} {1}", nSentences, hash2Sentences.Count);
63        //Evaluate(toks[1], xs, evalBuf);
64      }
65
66      List<double[]> functions = new List<double[]>();
67      List<string> sentences = new List<string>();
68      List<double> qualities = new List<double>();
69
70      foreach (var kvp in hash2Sentences) {
71        var ls = kvp.Value;
72        var sentence = FindShortest(ls);
73        Evaluate(sentence, xs, evalBuf);
74        if (evalBuf.Any(ei => float.IsInfinity((float)ei) || float.IsNaN((float)ei))) {
75          Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence);
76        } else {
77          try {
78            Scale(evalBuf, ys);
79            functions.Add((double[])evalBuf.Clone());
80            sentences.Add(sentence);
81            HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
82            qualities.Add(HeuristicLab.Problems.DataAnalysis.OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys, out error));
83          } catch (ArgumentException e) {
84            // scaling failed
85          }
86        }
87      }
88
89      List<int> clusters;
90      List<double> distances;
91      Flann.FindClusters(functions, out clusters, out distances, 100);
92     
93      // output all clusters and functions
94      using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) {
95        for (int i = 0; i < functions.Count; i++) {
96          writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], qualities[i], sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString())));
97        }
98      }
99
100      var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2);
101      var dtView = new DataTableView();
102      dtView.Size = new Size(800, 600);
103
104      foreach (var funCluster in funClusters) {
105        // draw the functions for each cluster into a separate png
106        var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average());
107        var dt = new DataTable(dtName, dtName);
108        var rows = new List<DataRow>();
109        int i = 0;
110        foreach (var fun in funCluster.Select(t => t.Item1)) {
111          var name = i.ToString();
112          var dr = new DataRow(name, name, fun);
113          rows.Add(dr);
114          i++;
115        }
116        dt.Rows.AddRange(rows);
117        dtView.Content = dt;
118        using (var bm = new Bitmap(800, 600)) {
119          dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600));
120          bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key)));
121        }
122      }
123    }
124
125
126
127    private static string FindShortest(List<string> ls) {
128      var minElem = ls.First();
129      for (int i = 1; i < ls.Count; i++) {
130        if (ls[i].Length < minElem.Length) minElem = ls[i];
131      }
132      return minElem;
133    }
134
135
136
137    #region evaluation
138    // linear scaling
139    private static void Scale(double[] evalBuf, double[] ys) {
140      double alpha;
141      double beta;
142      HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
143
144      HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
145      if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
146        throw new ArgumentException();
147      }
148
149      for (int i = 0; i < evalBuf.Length; i++) {
150        evalBuf[i] = beta * evalBuf[i] + alpha;
151      }
152    }
153
154    // evaluates postfix expressions (only for a very specific format)
155    private static void Evaluate(string postfixExpr, double[] xs, double[] evalBuf) {
156      int topOfStack = -1;
157      Evaluate(postfixExpr, 0, xs, ref topOfStack);
158      Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length);
159    }
160
161    private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) {
162      while (exprPos < postfixExpr.Length) {
163        switch (postfixExpr[exprPos]) {
164          case '+': {
165              exprPos += 2;
166              var a = stack[topOfStack];
167              var b = stack[topOfStack - 1];
168              for (int i = 0; i < N; i++) {
169                b[i] += a[i];
170              }
171              topOfStack--;
172              break;
173            }
174          case '*': {
175              exprPos += 2;
176              var a = stack[topOfStack];
177              var b = stack[topOfStack - 1];
178              for (int i = 0; i < N; i++) {
179                b[i] *= a[i];
180              }
181              topOfStack--;
182              break;
183            }
184          case 'X': {
185              exprPos += 2;
186              topOfStack++;
187              Array.Copy(xs, stack[topOfStack], N);
188              break;
189            }
190          case 'c': {
191              // cos
192              exprPos += 4;
193              var a = stack[topOfStack];
194              for (int i = 0; i < N; i++) {
195                a[i] = Math.Cos(a[i]);
196              }
197              break;
198            }
199          case 's': {
200              // sin
201              exprPos += 4;
202              var a = stack[topOfStack];
203              for (int i = 0; i < N; i++) {
204                a[i] = Math.Sin(a[i]);
205              }
206              break;
207            }
208          case 'l': {
209              // log
210              exprPos += 4;
211              var a = stack[topOfStack];
212              for (int i = 0; i < N; i++) {
213                a[i] = Math.Log(a[i]);
214              }
215
216              break;
217            }
218          case 'e': {
219              // exp
220              exprPos += 4;
221              var a = stack[topOfStack];
222              for (int i = 0; i < N; i++) {
223                a[i] = Math.Exp(a[i]);
224              }
225
226              break;
227            }
228          case 'i': {
229              // inv
230              exprPos += 4;
231              var a = stack[topOfStack];
232              for (int i = 0; i < N; i++) {
233                a[i] = 1.0 / a[i];
234              }
235              break;
236            }
237          default: {
238              throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr));
239            }
240        }
241      }
242    }
243    #endregion
244  }
245}
Note: See TracBrowser for help on using the repository browser.