using System; using System.Collections; using System.Collections.Generic; using System.Drawing; using System.IO; using System.Linq; using HeuristicLab.Analysis; using HeuristicLab.Analysis.Views; using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding; using HeuristicLab.Problems.DataAnalysis; using HeuristicLab.Problems.DataAnalysis.Symbolic; // Evaluates sentences on randomly generated data namespace ExpressionClustering { class Program { private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory); private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters"); private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences_2018-04-13_09-52_TreeSize-10.csv"); private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences_2018-04-13_09-52_TreeSize-10.csv"); private static readonly string outputFileName = Path.Combine(folder, "evaluations_2018-04-13_09-52_TreeSize-10.csv.gz"); private static int N = 100; private static double[] evalBuf = new double[N]; // pagie-1 (univariate) private static double min = -5.0; private static double max = +5.0; private static double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray(); // input private static double[] ys_pagie = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray(); // a potential target (not used for search) // x³ * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1) // for keijzer x should be in scale 0 - 10 inclusive private static double[] ys_keijzer4 = xs .Select(xi => xi + 10.0) // scale .Select(xi => xi * xi * xi + Math.Exp(-xi) * Math.Cos(xi) * Math.Sin(xi) * (Math.Sin(xi) * Math.Sin(xi) * Math.Cos(xi) - 1)) .ToArray(); public static int MAX_STACK = 20; public static double[][] stack = new double[MAX_STACK][]; static Program() { for (int i = 0; i < MAX_STACK; i++) stack[i] = new double[N]; } // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions static void Main(string[] args) { var hash2Postfix = new Dictionary>(); var postfix2infix = new Dictionary(); // read all sentences and determine shortest sentences using (var reader = new StreamReader(allSentencesFileName)) { // read header reader.ReadLine(); int nSentences = 0; while (!reader.EndOfStream) { var line = reader.ReadLine(); var toks = line.Split(';'); var hash = toks[0]; var length = toks[1]; var postfix = toks[2]; var infix = toks[3]; List alternativesList; if (!hash2Postfix.TryGetValue(hash, out alternativesList)) { alternativesList = new List(1); hash2Postfix.Add(hash, alternativesList); } alternativesList.Add(postfix); postfix2infix.Add(postfix, infix); nSentences++; } Console.WriteLine("{0} {1}", nSentences, hash2Postfix.Count); //Evaluate(toks[1], xs, evalBuf); } List functions = new List(); List sentences = new List(); List qualities = new List(); // we might have multiple target functions to which we might compare var ds = new Dataset(new string[] { "X" }, new IList[] { xs }); foreach (var kvp in hash2Postfix) { var ls = kvp.Value; var sentence = FindShortest(ls); //EvaluatePostfix(sentence, xs, evalBuf); evalBuf = EvaluateInfix(postfix2infix[sentence], ds).ToArray(); if (evalBuf.Any(ei => double.IsInfinity(ei) || double.IsNaN(ei))) { Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence); } else { try { Scale(evalBuf); functions.Add((double[])evalBuf.Clone()); sentences.Add(sentence); OnlineCalculatorError error; var r2_pagie = OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys_pagie, out error); if (error != OnlineCalculatorError.None) r2_pagie = 0.0; var r2_keijzer4 = OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys_keijzer4, out error); if (error != OnlineCalculatorError.None) r2_keijzer4 = 0.0; qualities.Add(new double[] { r2_pagie, r2_keijzer4}); } catch (ArgumentException e) { // scaling failed } } } List clusters; List distances; // DEACTIVATED FOR NOW -> USE LARGEVIS in R instead // Flann.FindClusters(functions, out clusters, out distances, 100); clusters = functions.Select(_ => 0).ToList(); distances = functions.Select(_ => 0.0).ToList(); // // output all clusters and functions using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) { for (int i = 0; i < functions.Count; i++) { writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], string.Join(";", qualities[i]), sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString()))); } } // // var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2); // var dtView = new DataTableView(); // dtView.Size = new Size(800, 600); // // foreach (var funCluster in funClusters) { // // draw the functions for each cluster into a separate png // // var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average()); // var dtName = "Cluster"; // var dt = new DataTable(dtName, dtName); // var rows = new List(); // int i = 0; // foreach (var fun in funCluster.Select(t => t.Item1)) { // var name = i.ToString(); // var dr = new DataRow(name, name, fun); // rows.Add(dr); // i++; // } // dt.Rows.AddRange(rows); // dtView.Content = dt; // using (var bm = new Bitmap(800, 600)) { // dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600)); // bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key))); // } // } } private static string FindShortest(List ls) { var minElem = ls.First(); for (int i = 1; i < ls.Count; i++) { if (ls[i].Length < minElem.Length) minElem = ls[i]; } return minElem; } #region evaluation // scaling to zero-mean unit variance private static void Scale(double[] evalBuf) { double mean; double variance; var max = evalBuf.Max(); for (int i = 0; i < evalBuf.Length; i++) { evalBuf[i] /= max; } OnlineCalculatorError error, varError; OnlineMeanAndVarianceCalculator.Calculate(evalBuf, out mean, out variance, out error, out varError); if(error!=OnlineCalculatorError.None || varError != OnlineCalculatorError.None) { throw new ArgumentException("Cannot scale vector"); } for (int i = 0; i < evalBuf.Length; i++) { evalBuf[i] = 1.0 / variance * evalBuf[i] + mean; } } // linear scaling to match target private static void Scale(double[] evalBuf, double[] ys) { double alpha; double beta; HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error; HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error); if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) { throw new ArgumentException(); } for (int i = 0; i < evalBuf.Length; i++) { evalBuf[i] = beta * evalBuf[i] + alpha; } } // evaluates infix expressions (using the infix parser) private static IEnumerable EvaluateInfix(string infixExpr, Dataset ds) { var parser = new HeuristicLab.Problems.DataAnalysis.Symbolic.InfixExpressionParser(); var tree = parser.Parse(infixExpr); var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter(); return interpreter.GetSymbolicExpressionTreeValues(tree, ds, Enumerable.Range(0, ds.Rows)); } /* // evaluates postfix expressions (only for a very specific format) private static void EvaluatePostfix(string postfixExpr, double[] xs, double[] evalBuf) { int topOfStack = -1; Evaluate(postfixExpr, 0, xs, ref topOfStack); Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length); } private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) { while (exprPos < postfixExpr.Length) { switch (postfixExpr[exprPos]) { case '+': { exprPos += 2; var a = stack[topOfStack]; var b = stack[topOfStack - 1]; for (int i = 0; i < N; i++) { b[i] += a[i]; } topOfStack--; break; } case '*': { exprPos += 2; var a = stack[topOfStack]; var b = stack[topOfStack - 1]; for (int i = 0; i < N; i++) { b[i] *= a[i]; } topOfStack--; break; } case 'X': { exprPos += 2; topOfStack++; Array.Copy(xs, stack[topOfStack], N); break; } case 'c': { if (postfixExpr[exprPos + 1] == 'o') { // cos exprPos += 4; var a = stack[topOfStack]; for (int i = 0; i < N; i++) { a[i] = Math.Cos(a[i]); } break; } else { exprPos += 2; // put 1 onto top of stack // BUG! topOfStack++; var a = stack[topOfStack]; for (int i = 0; i < N; i++) a[i] = 1.0; break; } } case 's': { // sin exprPos += 4; var a = stack[topOfStack]; for (int i = 0; i < N; i++) { a[i] = Math.Sin(a[i]); } break; } case 'l': { // log exprPos += 4; var a = stack[topOfStack]; for (int i = 0; i < N; i++) { a[i] = Math.Log(a[i]); } break; } case 'e': { // exp exprPos += 4; var a = stack[topOfStack]; for (int i = 0; i < N; i++) { a[i] = Math.Exp(a[i]); } break; } case 'i': { // inv exprPos += 4; var a = stack[topOfStack]; for (int i = 0; i < N; i++) { a[i] = 1.0 / a[i]; } break; } default: { throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr)); } } } } */ #endregion } }