using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Windows.Forms;
using HeuristicLab.Analysis;
using HeuristicLab.Analysis.Views;

namespace ExpressionClustering {
  /// <summary>
  /// Console tool that reads symbolic expressions in postfix notation from a CSV file on the
  /// desktop, evaluates one representative expression per hash on a fixed grid of inputs,
  /// linearly scales the result to the target function, clusters the resulting value vectors
  /// and writes the evaluations (gzip-compressed CSV) and one chart image per cluster.
  /// </summary>
  class Program {
    private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
    private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");
    private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences.csv");
    private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences.csv");
    private static readonly string outputFileName = Path.Combine(folder, "evaluations.csv.gz");

    // NOTE: static field initializers run in textual order; the fields below depend on each other
    // (N -> evalBuf/xs, min/max -> xs, xs -> ys, MAX_STACK -> stack), so do not reorder them.

    // number of sample points
    private static readonly int N = 100;
    // scratch buffer that receives the values of the expression evaluated last
    private static readonly double[] evalBuf = new double[N];

    // pagie-1 (univariate)
    private static readonly double min = -5.0;
    private static readonly double max = +5.0;
    // input: N equidistant points in (min, max]
    private static readonly double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray();
    // target (necessary for scaling and clustering)
    private static readonly double[] ys = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray();

    // operand stack used by Evaluate; every slot is a vector with one value per sample point
    public static int MAX_STACK = 20;
    public static double[][] stack = new double[MAX_STACK][];

    static Program() {
      for (int i = 0; i < MAX_STACK; i++) {
        stack[i] = new double[N];
      }
    }

    /// <summary>
    /// Loads symbolic expressions in postfix notation from <c>allSentences.csv</c> and identifies
    /// clusters of expressions.
    /// </summary>
    /// <param name="args">Command line arguments (not used).</param>
    static void Main(string[] args) {
      // hash (column 2) -> all postfix sentences (column 1) that share this hash
      var hash2Sentences = new Dictionary<string, List<string>>();
      // postfix sentence (column 1) -> infix form (column 0); only used to annotate the output file
      var postfix2infix = new Dictionary<string, string>();

      // read all sentences
      using (var reader = new StreamReader(allSentencesFileName)) {
        reader.ReadLine(); // skip header

        int nSentences = 0;
        string line;
        while ((line = reader.ReadLine()) != null) {
          if (string.IsNullOrWhiteSpace(line)) continue; // tolerate blank lines

          var toks = line.Split(';');
          if (toks.Length < 3) {
            throw new FormatException(string.Format("Expected at least 3 ';'-separated columns in {0} but got: {1}", allSentencesFileName, line));
          }

          var hash = toks[2];
          List<string> ls;
          if (!hash2Sentences.TryGetValue(hash, out ls)) {
            ls = new List<string>(1);
            hash2Sentences.Add(hash, ls);
          }
          ls.Add(toks[1]);

          // indexer instead of Add(): a postfix sentence that is listed twice must not abort the run
          postfix2infix[toks[1]] = toks[0];
          nSentences++;
        }
        Console.WriteLine("{0} {1}", nSentences, hash2Sentences.Count);
      }

      // The three lists are index-aligned: entry i describes the same expression in each of them.
      var functions = new List<double[]>();
      var sentences = new List<string>();
      var qualities = new List<double>();
      foreach (var kvp in hash2Sentences) {
        // the shortest sentence represents all sentences with the same hash
        var sentence = FindShortest(kvp.Value);
        Evaluate(sentence, xs, evalBuf);

        // the cast to float additionally rejects finite doubles that overflow the float range
        if (evalBuf.Any(ei => float.IsInfinity((float)ei) || float.IsNaN((float)ei))) {
          Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence);
          continue;
        }

        try {
          Scale(evalBuf, ys);
          HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
          double quality = HeuristicLab.Problems.DataAnalysis.OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys, out error);

          // append to all lists only after everything that may throw has succeeded,
          // otherwise the lists would get out of sync
          functions.Add((double[])evalBuf.Clone());
          sentences.Add(sentence);
          qualities.Add(quality);
        } catch (ArgumentException) {
          // scaling failed -> the expression is left out
          Console.WriteLine("skipping (scaling failed) {0}", sentence);
        }
      }

      // NOTE(review): the element types of clusters/distances are dictated by Flann.FindClusters,
      // which is not part of this file; int cluster ids and double distances are assumed - confirm.
      List<int> clusters;
      List<double> distances;
      Flann.FindClusters(functions, out clusters, out distances, 100);

      // output all clusters and functions
      // FileMode.Create truncates an existing file; OpenOrCreate would leave stale trailing bytes
      // of a previous (longer) run behind and thereby corrupt the gzip stream.
      using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.Create), System.IO.Compression.CompressionMode.Compress))) {
        for (int i = 0; i < functions.Count; i++) {
          writer.WriteLine("{0};{1};{2};{3};{4};{5}",
            clusters[i],
            distances[i],
            qualities[i],
            sentences[i],
            postfix2infix[sentences[i]],
            string.Join(";", functions[i].Select(fi => fi.ToString())));
        }
      }

      // draw the functions of each cluster into a separate png
      const int imageWidth = 800;
      const int imageHeight = 600;
      Directory.CreateDirectory(clusterFolder); // no-op if the folder exists already

      var dtView = new DataTableView();
      dtView.Size = new Size(imageWidth, imageHeight);

      // group the function indices once by cluster id (GroupBy keeps the order of first occurrence
      // of the keys and the order of the elements within each group)
      var clusterMembers = Enumerable.Range(0, functions.Count).GroupBy(idx => clusters[idx]);
      foreach (var cluster in clusterMembers) {
        // the chart is titled with the average R² of the cluster members
        var dtName = string.Format("R² {0}", cluster.Average(idx => qualities[idx]));
        var dt = new DataTable(dtName, dtName);

        var rows = new List<DataRow>();
        int rowNumber = 0;
        foreach (var funIdx in cluster) {
          var name = rowNumber.ToString();
          rows.Add(new DataRow(name, name, functions[funIdx]));
          rowNumber++;
        }
        dt.Rows.AddRange(rows);
        dtView.Content = dt;

        using (var bm = new Bitmap(imageWidth, imageHeight)) {
          dtView.DrawToBitmap(bm, new Rectangle(0, 0, imageWidth, imageHeight));
          bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", cluster.Key)));
        }
      }
    }

    /// <summary>
    /// Returns the shortest string of <paramref name="ls"/>; on ties the first one wins.
    /// </summary>
    /// <param name="ls">Non-empty list of sentences.</param>
    /// <returns>The element with the smallest <c>Length</c>.</returns>
    /// <exception cref="InvalidOperationException">The list is empty.</exception>
    private static string FindShortest(List<string> ls) {
      var minElem = ls.First();
      for (int i = 1; i < ls.Count; i++) {
        if (ls[i].Length < minElem.Length) minElem = ls[i];
      }
      return minElem;
    }

    #region evaluation
    /// <summary>
    /// Linear scaling: replaces every value v of <paramref name="evalBuf"/> in place by
    /// <c>beta * v + alpha</c>, with alpha and beta determined by
    /// <c>OnlineLinearScalingParameterCalculator</c> for the target <paramref name="ys"/>.
    /// </summary>
    /// <param name="evalBuf">Values to scale (modified in place).</param>
    /// <param name="ys">Target values.</param>
    /// <exception cref="ArgumentException">The calculator reported an error.</exception>
    private static void Scale(double[] evalBuf, double[] ys) {
      double alpha;
      double beta;
      HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
      HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
      if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
        throw new ArgumentException();
      }

      for (int i = 0; i < evalBuf.Length; i++) {
        evalBuf[i] = beta * evalBuf[i] + alpha;
      }
    }

    /// <summary>
    /// Evaluates a postfix expression (only for a very specific format, see the overload below)
    /// and copies the vector on top of the operand stack into <paramref name="evalBuf"/>.
    /// </summary>
    /// <param name="postfixExpr">Expression in postfix notation.</param>
    /// <param name="xs">Values substituted for the variable X.</param>
    /// <param name="evalBuf">Receives the result; <c>evalBuf.Length</c> values are copied.</param>
    /// <exception cref="InvalidOperationException">
    /// The expression contains an unknown symbol, needs more than <c>stack.Length</c> stack slots,
    /// or an operator (or the final result) is missing an operand.
    /// </exception>
    private static void Evaluate(string postfixExpr, double[] xs, double[] evalBuf) {
      int topOfStack = -1;
      Evaluate(postfixExpr, 0, xs, ref topOfStack);
      RequireOperands(postfixExpr, topOfStack, 1); // e.g. empty expression
      Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length);
    }

    /// <summary>
    /// Interprets <paramref name="postfixExpr"/> starting at <paramref name="exprPos"/> using the
    /// shared operand <see cref="stack"/>. Only the first character of a token is inspected; the
    /// position is then advanced by 2 for '+', '*' and 'X' and by 4 for the unary functions
    /// 'c' (cos), 's' (sin), 'l' (log), 'e' (exp) and 'i' (inv), i.e. tokens are expected to be
    /// followed by exactly one separator character.
    /// </summary>
    /// <param name="postfixExpr">Expression in postfix notation.</param>
    /// <param name="exprPos">Index of the first character to interpret.</param>
    /// <param name="xs">Values pushed for the variable X (the first N elements are used).</param>
    /// <param name="topOfStack">Index of the top stack slot (-1 = empty); updated by this method.</param>
    /// <exception cref="InvalidOperationException">Unknown symbol, stack overflow or stack underflow.</exception>
    private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) {
      while (exprPos < postfixExpr.Length) {
        switch (postfixExpr[exprPos]) {
          case '+': {
              exprPos += 2;
              RequireOperands(postfixExpr, topOfStack, 2);
              var a = stack[topOfStack];
              var b = stack[topOfStack - 1];
              for (int i = 0; i < N; i++) {
                b[i] += a[i];
              }
              topOfStack--; // the result replaces the lower operand
              break;
            }
          case '*': {
              exprPos += 2;
              RequireOperands(postfixExpr, topOfStack, 2);
              var a = stack[topOfStack];
              var b = stack[topOfStack - 1];
              for (int i = 0; i < N; i++) {
                b[i] *= a[i];
              }
              topOfStack--; // the result replaces the lower operand
              break;
            }
          case 'X': {
              exprPos += 2;
              if (topOfStack + 1 >= stack.Length) {
                throw new InvalidOperationException(string.Format("Stack overflow (more than {0} operands) in {1}", stack.Length, postfixExpr));
              }
              topOfStack++;
              Array.Copy(xs, stack[topOfStack], N);
              break;
            }
          case 'c': { // cos
              exprPos += 4;
              ApplyUnary(postfixExpr, topOfStack, Math.Cos);
              break;
            }
          case 's': { // sin
              exprPos += 4;
              ApplyUnary(postfixExpr, topOfStack, Math.Sin);
              break;
            }
          case 'l': { // log
              exprPos += 4;
              ApplyUnary(postfixExpr, topOfStack, Math.Log);
              break;
            }
          case 'e': { // exp
              exprPos += 4;
              ApplyUnary(postfixExpr, topOfStack, Math.Exp);
              break;
            }
          case 'i': { // inv
              exprPos += 4;
              ApplyUnary(postfixExpr, topOfStack, v => 1.0 / v);
              break;
            }
          default: {
              throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr));
            }
        }
      }
    }

    // Applies op in place to the first N values of the vector on top of the operand stack.
    private static void ApplyUnary(string postfixExpr, int topOfStack, Func<double, double> op) {
      RequireOperands(postfixExpr, topOfStack, 1);
      var a = stack[topOfStack];
      for (int i = 0; i < N; i++) {
        a[i] = op(a[i]);
      }
    }

    // Throws if fewer than 'required' operands are on the stack (topOfStack is -1 for an empty stack).
    private static void RequireOperands(string postfixExpr, int topOfStack, int required) {
      if (topOfStack + 1 < required) {
        throw new InvalidOperationException(string.Format("Stack underflow (missing operand) in {0}", postfixExpr));
      }
    }
    #endregion
  }
}