[15840] | 1 | using System;
|
---|
| 2 | using System.Collections.Generic;
|
---|
[15842] | 3 | using System.Drawing;
|
---|
[15840] | 4 | using System.IO;
|
---|
| 5 | using System.Linq;
|
---|
[15842] | 6 | using HeuristicLab.Analysis;
|
---|
| 7 | using HeuristicLab.Analysis.Views;
|
---|
| 8 | using System.Windows.Forms;
|
---|
[15840] | 9 |
|
---|
| 10 | namespace ExpressionClustering {
|
---|
| 11 | class Program {
|
---|
| 12 | private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
|
---|
[15842] | 13 | private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");
|
---|
[15840] | 14 | private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences.csv");
|
---|
| 15 | private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences.csv");
|
---|
| 16 | private static readonly string outputFileName = Path.Combine(folder, "evaluations.csv.gz");
|
---|
| 17 | private static int N = 100;
|
---|
| 18 | private static double[] evalBuf = new double[N];
|
---|
| 19 |
|
---|
| 20 | // pagie-1 (univariate)
|
---|
| 21 | private static double min = -5.0;
|
---|
| 22 | private static double max = +5.0;
|
---|
| 23 | private static double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray(); // input
|
---|
| 24 | private static double[] ys = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray(); // target (necessary for scaling and clustering
|
---|
| 25 |
|
---|
| 26 |
|
---|
| 27 | public static int MAX_STACK = 20;
|
---|
| 28 | public static double[][] stack = new double[MAX_STACK][];
|
---|
| 29 | static Program() {
|
---|
| 30 | for (int i = 0; i < MAX_STACK; i++)
|
---|
| 31 | stack[i] = new double[N];
|
---|
| 32 | }
|
---|
| 33 |
|
---|
| 34 | // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions
|
---|
| 35 | static void Main(string[] args) {
|
---|
| 36 |
|
---|
| 37 | var hash2Sentences = new Dictionary<string, List<string>>();
|
---|
[15842] | 38 | // for debugging only
|
---|
| 39 | var postfix2infix = new Dictionary<string, string>();
|
---|
[15840] | 40 |
|
---|
| 41 |
|
---|
| 42 |
|
---|
| 43 | // read all sentences and determine shortest sentences
|
---|
| 44 | using (var reader = new StreamReader(allSentencesFileName)) {
|
---|
| 45 | // read header
|
---|
| 46 | reader.ReadLine();
|
---|
| 47 | int nSentences = 0;
|
---|
| 48 | while (!reader.EndOfStream) {
|
---|
| 49 | var line = reader.ReadLine();
|
---|
| 50 | var toks = line.Split(';');
|
---|
| 51 | var hash = toks[2];
|
---|
| 52 | List<string> ls;
|
---|
| 53 | if (!hash2Sentences.TryGetValue(hash, out ls)) {
|
---|
| 54 | ls = new List<string>(1);
|
---|
| 55 | hash2Sentences.Add(hash, ls);
|
---|
| 56 | }
|
---|
| 57 | ls.Add(toks[1]);
|
---|
[15842] | 58 | postfix2infix.Add(toks[1], toks[0]);
|
---|
[15840] | 59 | nSentences++;
|
---|
| 60 | }
|
---|
| 61 |
|
---|
| 62 | Console.WriteLine("{0} {1}", nSentences, hash2Sentences.Count);
|
---|
| 63 | //Evaluate(toks[1], xs, evalBuf);
|
---|
| 64 | }
|
---|
| 65 |
|
---|
[15842] | 66 | List<double[]> functions = new List<double[]>();
|
---|
| 67 | List<string> sentences = new List<string>();
|
---|
| 68 | List<double> qualities = new List<double>();
|
---|
| 69 |
|
---|
| 70 | foreach (var kvp in hash2Sentences) {
|
---|
| 71 | var ls = kvp.Value;
|
---|
| 72 | var sentence = FindShortest(ls);
|
---|
| 73 | Evaluate(sentence, xs, evalBuf);
|
---|
| 74 | if (evalBuf.Any(ei => float.IsInfinity((float)ei) || float.IsNaN((float)ei))) {
|
---|
| 75 | Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence);
|
---|
| 76 | } else {
|
---|
| 77 | try {
|
---|
| 78 | Scale(evalBuf, ys);
|
---|
| 79 | functions.Add((double[])evalBuf.Clone());
|
---|
| 80 | sentences.Add(sentence);
|
---|
| 81 | HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
|
---|
| 82 | qualities.Add(HeuristicLab.Problems.DataAnalysis.OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys, out error));
|
---|
| 83 | } catch (ArgumentException e) {
|
---|
| 84 | // scaling failed
|
---|
| 85 | }
|
---|
[15840] | 86 | }
|
---|
[15842] | 87 | }
|
---|
[15840] | 88 |
|
---|
[15842] | 89 | List<int> clusters;
|
---|
| 90 | List<double> distances;
|
---|
| 91 | Flann.FindClusters(functions, out clusters, out distances, 100);
|
---|
| 92 |
|
---|
| 93 | // output all clusters and functions
|
---|
| 94 | using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) {
|
---|
| 95 | for (int i = 0; i < functions.Count; i++) {
|
---|
| 96 | writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], qualities[i], sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString())));
|
---|
| 97 | }
|
---|
[15840] | 98 | }
|
---|
| 99 |
|
---|
[15842] | 100 | var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2);
|
---|
| 101 | var dtView = new DataTableView();
|
---|
| 102 | dtView.Size = new Size(800, 600);
|
---|
| 103 |
|
---|
| 104 | foreach (var funCluster in funClusters) {
|
---|
| 105 | // draw the functions for each cluster into a separate png
|
---|
| 106 | var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average());
|
---|
| 107 | var dt = new DataTable(dtName, dtName);
|
---|
| 108 | var rows = new List<DataRow>();
|
---|
| 109 | int i = 0;
|
---|
| 110 | foreach (var fun in funCluster.Select(t => t.Item1)) {
|
---|
| 111 | var name = i.ToString();
|
---|
| 112 | var dr = new DataRow(name, name, fun);
|
---|
| 113 | rows.Add(dr);
|
---|
| 114 | i++;
|
---|
| 115 | }
|
---|
| 116 | dt.Rows.AddRange(rows);
|
---|
| 117 | dtView.Content = dt;
|
---|
| 118 | using (var bm = new Bitmap(800, 600)) {
|
---|
| 119 | dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600));
|
---|
| 120 | bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key)));
|
---|
| 121 | }
|
---|
| 122 | }
|
---|
[15840] | 123 | }
|
---|
| 124 |
|
---|
[15842] | 125 |
|
---|
| 126 |
|
---|
[15840] | 127 | private static string FindShortest(List<string> ls) {
|
---|
| 128 | var minElem = ls.First();
|
---|
| 129 | for (int i = 1; i < ls.Count; i++) {
|
---|
| 130 | if (ls[i].Length < minElem.Length) minElem = ls[i];
|
---|
| 131 | }
|
---|
| 132 | return minElem;
|
---|
| 133 | }
|
---|
| 134 |
|
---|
| 135 |
|
---|
[15842] | 136 |
|
---|
[15840] | 137 | #region evaluation
|
---|
| 138 | // linear scaling
|
---|
| 139 | private static void Scale(double[] evalBuf, double[] ys) {
|
---|
| 140 | double alpha;
|
---|
| 141 | double beta;
|
---|
| 142 | HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
|
---|
| 143 |
|
---|
| 144 | HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
|
---|
| 145 | if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
|
---|
[15842] | 146 | throw new ArgumentException();
|
---|
[15840] | 147 | }
|
---|
| 148 |
|
---|
| 149 | for (int i = 0; i < evalBuf.Length; i++) {
|
---|
| 150 | evalBuf[i] = beta * evalBuf[i] + alpha;
|
---|
| 151 | }
|
---|
| 152 | }
|
---|
| 153 |
|
---|
| 154 | // evaluates postfix expressions (only for a very specific format)
|
---|
| 155 | private static void Evaluate(string postfixExpr, double[] xs, double[] evalBuf) {
|
---|
| 156 | int topOfStack = -1;
|
---|
| 157 | Evaluate(postfixExpr, 0, xs, ref topOfStack);
|
---|
| 158 | Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length);
|
---|
| 159 | }
|
---|
| 160 |
|
---|
| 161 | private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) {
|
---|
| 162 | while (exprPos < postfixExpr.Length) {
|
---|
| 163 | switch (postfixExpr[exprPos]) {
|
---|
| 164 | case '+': {
|
---|
| 165 | exprPos += 2;
|
---|
| 166 | var a = stack[topOfStack];
|
---|
| 167 | var b = stack[topOfStack - 1];
|
---|
| 168 | for (int i = 0; i < N; i++) {
|
---|
[15842] | 169 | b[i] += a[i];
|
---|
[15840] | 170 | }
|
---|
[15842] | 171 | topOfStack--;
|
---|
[15840] | 172 | break;
|
---|
| 173 | }
|
---|
| 174 | case '*': {
|
---|
| 175 | exprPos += 2;
|
---|
| 176 | var a = stack[topOfStack];
|
---|
| 177 | var b = stack[topOfStack - 1];
|
---|
| 178 | for (int i = 0; i < N; i++) {
|
---|
[15842] | 179 | b[i] *= a[i];
|
---|
[15840] | 180 | }
|
---|
[15842] | 181 | topOfStack--;
|
---|
[15840] | 182 | break;
|
---|
| 183 | }
|
---|
| 184 | case 'X': {
|
---|
| 185 | exprPos += 2;
|
---|
| 186 | topOfStack++;
|
---|
| 187 | Array.Copy(xs, stack[topOfStack], N);
|
---|
| 188 | break;
|
---|
| 189 | }
|
---|
| 190 | case 'c': {
|
---|
| 191 | // cos
|
---|
| 192 | exprPos += 4;
|
---|
| 193 | var a = stack[topOfStack];
|
---|
| 194 | for (int i = 0; i < N; i++) {
|
---|
| 195 | a[i] = Math.Cos(a[i]);
|
---|
| 196 | }
|
---|
| 197 | break;
|
---|
| 198 | }
|
---|
| 199 | case 's': {
|
---|
| 200 | // sin
|
---|
| 201 | exprPos += 4;
|
---|
| 202 | var a = stack[topOfStack];
|
---|
| 203 | for (int i = 0; i < N; i++) {
|
---|
| 204 | a[i] = Math.Sin(a[i]);
|
---|
| 205 | }
|
---|
| 206 | break;
|
---|
| 207 | }
|
---|
| 208 | case 'l': {
|
---|
| 209 | // log
|
---|
| 210 | exprPos += 4;
|
---|
| 211 | var a = stack[topOfStack];
|
---|
| 212 | for (int i = 0; i < N; i++) {
|
---|
| 213 | a[i] = Math.Log(a[i]);
|
---|
| 214 | }
|
---|
| 215 |
|
---|
| 216 | break;
|
---|
| 217 | }
|
---|
| 218 | case 'e': {
|
---|
| 219 | // exp
|
---|
| 220 | exprPos += 4;
|
---|
| 221 | var a = stack[topOfStack];
|
---|
| 222 | for (int i = 0; i < N; i++) {
|
---|
| 223 | a[i] = Math.Exp(a[i]);
|
---|
| 224 | }
|
---|
| 225 |
|
---|
| 226 | break;
|
---|
| 227 | }
|
---|
| 228 | case 'i': {
|
---|
| 229 | // inv
|
---|
| 230 | exprPos += 4;
|
---|
| 231 | var a = stack[topOfStack];
|
---|
| 232 | for (int i = 0; i < N; i++) {
|
---|
| 233 | a[i] = 1.0 / a[i];
|
---|
| 234 | }
|
---|
| 235 | break;
|
---|
| 236 | }
|
---|
| 237 | default: {
|
---|
| 238 | throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr));
|
---|
| 239 | }
|
---|
| 240 | }
|
---|
| 241 | }
|
---|
| 242 | }
|
---|
| 243 | #endregion
|
---|
| 244 | }
|
---|
| 245 | }
|
---|