source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Program.cs @ 15903

Last change on this file since 15903 was 15903, checked in by gkronber, 4 years ago

#2886 worked on cluster analysis / visualization for GPTP

File size: 11.9 KB
Line 
1using System;
2using System.Collections;
3using System.Collections.Generic;
4using System.Drawing;
5using System.IO;
6using System.Linq;
7using HeuristicLab.Analysis;
8using HeuristicLab.Analysis.Views;
9using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
10using HeuristicLab.Problems.DataAnalysis;
11using HeuristicLab.Problems.DataAnalysis.Symbolic;
12
13// Evaluates sentences on randomly generated data
14namespace ExpressionClustering {
15  class Program {
16    private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
17    private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");
18    private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences_2018-04-13_09-52_TreeSize-10.csv");
19    private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences_2018-04-13_09-52_TreeSize-10.csv");
20    private static readonly string outputFileName = Path.Combine(folder, "evaluations_2018-04-13_09-52_TreeSize-10.csv.gz");
21    private static int N = 100;
22    private static double[] evalBuf = new double[N];
23
24    // pagie-1 (univariate)
25    private static double min = -5.0;
26    private static double max = +5.0;
27    private static double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray(); // input
28
29    private static double[] ys_pagie = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray(); // a potential target (not used for search)
30
31    // x³  * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1)
32    // for keijzer x should be in scale 0 - 10 inclusive
33    private static double[] ys_keijzer4 = xs
34      .Select(xi => xi + 10.0) // scale
35      .Select(xi => xi * xi * xi + Math.Exp(-xi) * Math.Cos(xi) * Math.Sin(xi) * (Math.Sin(xi) * Math.Sin(xi) * Math.Cos(xi) - 1))
36      .ToArray();
37
38
39    public static int MAX_STACK = 20;
40    public static double[][] stack = new double[MAX_STACK][];
41    static Program() {
42      for (int i = 0; i < MAX_STACK; i++)
43        stack[i] = new double[N];
44    }
45
46    // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions
47    static void Main(string[] args) {
48
49      var hash2Postfix = new Dictionary<string, List<string>>();
50      var postfix2infix = new Dictionary<string, string>();
51
52      // read all sentences and determine shortest sentences
53      using (var reader = new StreamReader(allSentencesFileName)) {
54        // read header
55        reader.ReadLine();
56        int nSentences = 0;
57        while (!reader.EndOfStream) {
58          var line = reader.ReadLine();
59          var toks = line.Split(';');
60          var hash = toks[0];
61          var length = toks[1];
62          var postfix = toks[2];
63          var infix = toks[3];
64          List<string> alternativesList;
65          if (!hash2Postfix.TryGetValue(hash, out alternativesList)) {
66            alternativesList = new List<string>(1);
67            hash2Postfix.Add(hash, alternativesList);
68          }
69          alternativesList.Add(postfix);
70          postfix2infix.Add(postfix, infix);
71          nSentences++;
72        }
73
74        Console.WriteLine("{0} {1}", nSentences, hash2Postfix.Count);
75        //Evaluate(toks[1], xs, evalBuf);
76      }
77
78      List<double[]> functions = new List<double[]>();
79      List<string> sentences = new List<string>();
80      List<double[]> qualities = new List<double[]>(); // we might have multiple target functions to which we might compare
81
82      var ds = new Dataset(new string[] { "X" }, new IList[] { xs });
83      foreach (var kvp in hash2Postfix) {
84        var ls = kvp.Value;
85        var sentence = FindShortest(ls);
86        //EvaluatePostfix(sentence, xs, evalBuf);
87        evalBuf = EvaluateInfix(postfix2infix[sentence], ds).ToArray();
88        if (evalBuf.Any(ei => double.IsInfinity(ei) || double.IsNaN(ei))) {
89          Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence);
90        } else {
91          try {
92            Scale(evalBuf);
93            functions.Add((double[])evalBuf.Clone());
94            sentences.Add(sentence);
95            OnlineCalculatorError error;
96            var r2_pagie = OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys_pagie, out error);
97            if (error != OnlineCalculatorError.None) r2_pagie = 0.0;
98            var r2_keijzer4 = OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys_keijzer4, out error);
99            if (error != OnlineCalculatorError.None) r2_keijzer4 = 0.0;
100            qualities.Add(new double[] { r2_pagie, r2_keijzer4});
101          } catch (ArgumentException e) {
102            // scaling failed
103          }
104        }
105      }
106
107
108      List<int> clusters;
109      List<double> distances;
110      // DEACTIVATED FOR NOW -> USE LARGEVIS in R instead
111      // Flann.FindClusters(functions, out clusters, out distances, 100);
112      clusters = functions.Select(_ => 0).ToList();
113      distances = functions.Select(_ => 0.0).ToList();
114      //
115      // output all clusters and functions
116      using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) {
117        for (int i = 0; i < functions.Count; i++) {
118          writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], string.Join(";", qualities[i]), sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString())));
119        }
120      }
121      //
122      // var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2);
123      // var dtView = new DataTableView();
124      // dtView.Size = new Size(800, 600);
125      //
126      // foreach (var funCluster in funClusters) {
127      //   // draw the functions for each cluster into a separate png
128      //   // var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average());
129      //   var dtName = "Cluster";
130      //   var dt = new DataTable(dtName, dtName);
131      //   var rows = new List<DataRow>();
132      //   int i = 0;
133      //   foreach (var fun in funCluster.Select(t => t.Item1)) {
134      //     var name = i.ToString();
135      //     var dr = new DataRow(name, name, fun);
136      //     rows.Add(dr);
137      //     i++;
138      //   }
139      //   dt.Rows.AddRange(rows);
140      //   dtView.Content = dt;
141      //   using (var bm = new Bitmap(800, 600)) {
142      //     dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600));
143      //     bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key)));
144      //   }
145      // }
146    }
147
148
149
150    private static string FindShortest(List<string> ls) {
151      var minElem = ls.First();
152      for (int i = 1; i < ls.Count; i++) {
153        if (ls[i].Length < minElem.Length) minElem = ls[i];
154      }
155      return minElem;
156    }
157
158
159
160    #region evaluation
161
162    // scaling to zero-mean unit variance
163    private static void Scale(double[] evalBuf) {
164      double mean;
165      double variance;
166      var max = evalBuf.Max();
167      for (int i = 0; i < evalBuf.Length; i++) {
168        evalBuf[i] /= max;
169      }
170
171      OnlineCalculatorError error, varError;
172      OnlineMeanAndVarianceCalculator.Calculate(evalBuf, out mean, out variance, out error, out varError);
173      if(error!=OnlineCalculatorError.None || varError != OnlineCalculatorError.None) {
174        throw new ArgumentException("Cannot scale vector");
175      }
176
177      for (int i = 0; i < evalBuf.Length; i++) {
178        evalBuf[i] = 1.0 / variance * evalBuf[i] + mean;
179      }
180    }
181
182    // linear scaling to match target
183    private static void Scale(double[] evalBuf, double[] ys) {
184      double alpha;
185      double beta;
186      HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
187
188      HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
189      if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
190        throw new ArgumentException();
191      }
192
193      for (int i = 0; i < evalBuf.Length; i++) {
194        evalBuf[i] = beta * evalBuf[i] + alpha;
195      }
196    }
197
198    // evaluates infix expressions (using the infix parser)
199    private static IEnumerable<double> EvaluateInfix(string infixExpr, Dataset ds) {
200      var parser = new HeuristicLab.Problems.DataAnalysis.Symbolic.InfixExpressionParser();
201      var tree = parser.Parse(infixExpr);
202      var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
203      return interpreter.GetSymbolicExpressionTreeValues(tree, ds, Enumerable.Range(0, ds.Rows));
204    }                                                 
205
206    /*
207    // evaluates postfix expressions (only for a very specific format)
208    private static void EvaluatePostfix(string postfixExpr, double[] xs, double[] evalBuf) {
209      int topOfStack = -1;
210      Evaluate(postfixExpr, 0, xs, ref topOfStack);
211      Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length);
212    }
213
214   
215    private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) {
216      while (exprPos < postfixExpr.Length) {
217        switch (postfixExpr[exprPos]) {
218          case '+': {
219              exprPos += 2;
220              var a = stack[topOfStack];
221              var b = stack[topOfStack - 1];
222              for (int i = 0; i < N; i++) {
223                b[i] += a[i];
224              }
225              topOfStack--;
226              break;
227            }
228          case '*': {
229              exprPos += 2;
230              var a = stack[topOfStack];
231              var b = stack[topOfStack - 1];
232              for (int i = 0; i < N; i++) {
233                b[i] *= a[i];
234              }
235              topOfStack--;
236              break;
237            }
238          case 'X': {
239              exprPos += 2;
240              topOfStack++;
241              Array.Copy(xs, stack[topOfStack], N);
242              break;
243            }
244          case 'c': {
245              if (postfixExpr[exprPos + 1] == 'o') {
246                // cos
247                exprPos += 4;
248                var a = stack[topOfStack];
249                for (int i = 0; i < N; i++) {
250                  a[i] = Math.Cos(a[i]);
251                }
252                break;
253              } else {
254                exprPos += 2;
255                // put 1 onto top of stack     // BUG!
256                topOfStack++;
257                var a = stack[topOfStack];
258                for (int i = 0; i < N; i++) a[i] = 1.0;
259                break;
260              }
261            }
262          case 's': {
263              // sin
264              exprPos += 4;
265              var a = stack[topOfStack];
266              for (int i = 0; i < N; i++) {
267                a[i] = Math.Sin(a[i]);
268              }
269              break;
270            }
271          case 'l': {
272              // log
273              exprPos += 4;
274              var a = stack[topOfStack];
275              for (int i = 0; i < N; i++) {
276                a[i] = Math.Log(a[i]);
277              }
278
279              break;
280            }
281          case 'e': {
282              // exp
283              exprPos += 4;
284              var a = stack[topOfStack];
285              for (int i = 0; i < N; i++) {
286                a[i] = Math.Exp(a[i]);
287              }
288
289              break;
290            }
291          case 'i': {
292              // inv
293              exprPos += 4;
294              var a = stack[topOfStack];
295              for (int i = 0; i < N; i++) {
296                a[i] = 1.0 / a[i];
297              }
298              break;
299            }
300          default: {
301              throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr));
302            }
303        }
304      }
305    }
306    */
307    #endregion
308  }
309}
Note: See TracBrowser for help on using the repository browser.