1 | using System;
|
---|
2 | using System.Collections;
|
---|
3 | using System.Collections.Generic;
|
---|
4 | using System.Drawing;
|
---|
5 | using System.IO;
|
---|
6 | using System.Linq;
|
---|
7 | using HeuristicLab.Analysis;
|
---|
8 | using HeuristicLab.Analysis.Views;
|
---|
9 | using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
|
---|
10 | using HeuristicLab.Problems.DataAnalysis;
|
---|
11 | using HeuristicLab.Problems.DataAnalysis.Symbolic;
|
---|
12 |
|
---|
13 | // Evaluates sentences on randomly generated data
|
---|
14 | namespace ExpressionClustering {
|
---|
15 | class Program {
|
---|
16 | private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
|
---|
17 | private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");
|
---|
18 | private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences_2018-04-13_09-52_TreeSize-10.csv");
|
---|
19 | private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences_2018-04-13_09-52_TreeSize-10.csv");
|
---|
20 | private static readonly string outputFileName = Path.Combine(folder, "evaluations_2018-04-13_09-52_TreeSize-10.csv.gz");
|
---|
21 | private static int N = 100;
|
---|
22 | private static double[] evalBuf = new double[N];
|
---|
23 |
|
---|
24 | // pagie-1 (univariate)
|
---|
25 | private static double min = -5.0;
|
---|
26 | private static double max = +5.0;
|
---|
27 | private static double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray(); // input
|
---|
28 |
|
---|
29 | private static double[] ys_pagie = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray(); // a potential target (not used for search)
|
---|
30 |
|
---|
31 | // x³ * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1)
|
---|
32 | // for keijzer x should be in scale 0 - 10 inclusive
|
---|
33 | private static double[] ys_keijzer4 = xs
|
---|
34 | .Select(xi => xi + 10.0) // scale
|
---|
35 | .Select(xi => xi * xi * xi + Math.Exp(-xi) * Math.Cos(xi) * Math.Sin(xi) * (Math.Sin(xi) * Math.Sin(xi) * Math.Cos(xi) - 1))
|
---|
36 | .ToArray();
|
---|
37 |
|
---|
38 |
|
---|
39 | public static int MAX_STACK = 20;
|
---|
40 | public static double[][] stack = new double[MAX_STACK][];
|
---|
41 | static Program() {
|
---|
42 | for (int i = 0; i < MAX_STACK; i++)
|
---|
43 | stack[i] = new double[N];
|
---|
44 | }
|
---|
45 |
|
---|
46 | // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions
|
---|
47 | static void Main(string[] args) {
|
---|
48 |
|
---|
49 | var hash2Postfix = new Dictionary<string, List<string>>();
|
---|
50 | var postfix2infix = new Dictionary<string, string>();
|
---|
51 |
|
---|
52 | // read all sentences and determine shortest sentences
|
---|
53 | using (var reader = new StreamReader(allSentencesFileName)) {
|
---|
54 | // read header
|
---|
55 | reader.ReadLine();
|
---|
56 | int nSentences = 0;
|
---|
57 | while (!reader.EndOfStream) {
|
---|
58 | var line = reader.ReadLine();
|
---|
59 | var toks = line.Split(';');
|
---|
60 | var hash = toks[0];
|
---|
61 | var length = toks[1];
|
---|
62 | var postfix = toks[2];
|
---|
63 | var infix = toks[3];
|
---|
64 | List<string> alternativesList;
|
---|
65 | if (!hash2Postfix.TryGetValue(hash, out alternativesList)) {
|
---|
66 | alternativesList = new List<string>(1);
|
---|
67 | hash2Postfix.Add(hash, alternativesList);
|
---|
68 | }
|
---|
69 | alternativesList.Add(postfix);
|
---|
70 | postfix2infix.Add(postfix, infix);
|
---|
71 | nSentences++;
|
---|
72 | }
|
---|
73 |
|
---|
74 | Console.WriteLine("{0} {1}", nSentences, hash2Postfix.Count);
|
---|
75 | //Evaluate(toks[1], xs, evalBuf);
|
---|
76 | }
|
---|
77 |
|
---|
78 | List<double[]> functions = new List<double[]>();
|
---|
79 | List<string> sentences = new List<string>();
|
---|
80 | List<double[]> qualities = new List<double[]>(); // we might have multiple target functions to which we might compare
|
---|
81 |
|
---|
82 | var ds = new Dataset(new string[] { "X" }, new IList[] { xs });
|
---|
83 | foreach (var kvp in hash2Postfix) {
|
---|
84 | var ls = kvp.Value;
|
---|
85 | var sentence = FindShortest(ls);
|
---|
86 | //EvaluatePostfix(sentence, xs, evalBuf);
|
---|
87 | evalBuf = EvaluateInfix(postfix2infix[sentence], ds).ToArray();
|
---|
88 | if (evalBuf.Any(ei => double.IsInfinity(ei) || double.IsNaN(ei))) {
|
---|
89 | Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence);
|
---|
90 | } else {
|
---|
91 | try {
|
---|
92 | Scale(evalBuf);
|
---|
93 | functions.Add((double[])evalBuf.Clone());
|
---|
94 | sentences.Add(sentence);
|
---|
95 | OnlineCalculatorError error;
|
---|
96 | var r2_pagie = OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys_pagie, out error);
|
---|
97 | if (error != OnlineCalculatorError.None) r2_pagie = 0.0;
|
---|
98 | var r2_keijzer4 = OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys_keijzer4, out error);
|
---|
99 | if (error != OnlineCalculatorError.None) r2_keijzer4 = 0.0;
|
---|
100 | qualities.Add(new double[] { r2_pagie, r2_keijzer4});
|
---|
101 | } catch (ArgumentException e) {
|
---|
102 | // scaling failed
|
---|
103 | }
|
---|
104 | }
|
---|
105 | }
|
---|
106 |
|
---|
107 |
|
---|
108 | List<int> clusters;
|
---|
109 | List<double> distances;
|
---|
110 | // DEACTIVATED FOR NOW -> USE LARGEVIS in R instead
|
---|
111 | // Flann.FindClusters(functions, out clusters, out distances, 100);
|
---|
112 | clusters = functions.Select(_ => 0).ToList();
|
---|
113 | distances = functions.Select(_ => 0.0).ToList();
|
---|
114 | //
|
---|
115 | // output all clusters and functions
|
---|
116 | using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) {
|
---|
117 | for (int i = 0; i < functions.Count; i++) {
|
---|
118 | writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], string.Join(";", qualities[i]), sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString())));
|
---|
119 | }
|
---|
120 | }
|
---|
121 | //
|
---|
122 | // var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2);
|
---|
123 | // var dtView = new DataTableView();
|
---|
124 | // dtView.Size = new Size(800, 600);
|
---|
125 | //
|
---|
126 | // foreach (var funCluster in funClusters) {
|
---|
127 | // // draw the functions for each cluster into a separate png
|
---|
128 | // // var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average());
|
---|
129 | // var dtName = "Cluster";
|
---|
130 | // var dt = new DataTable(dtName, dtName);
|
---|
131 | // var rows = new List<DataRow>();
|
---|
132 | // int i = 0;
|
---|
133 | // foreach (var fun in funCluster.Select(t => t.Item1)) {
|
---|
134 | // var name = i.ToString();
|
---|
135 | // var dr = new DataRow(name, name, fun);
|
---|
136 | // rows.Add(dr);
|
---|
137 | // i++;
|
---|
138 | // }
|
---|
139 | // dt.Rows.AddRange(rows);
|
---|
140 | // dtView.Content = dt;
|
---|
141 | // using (var bm = new Bitmap(800, 600)) {
|
---|
142 | // dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600));
|
---|
143 | // bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key)));
|
---|
144 | // }
|
---|
145 | // }
|
---|
146 | }
|
---|
147 |
|
---|
148 |
|
---|
149 |
|
---|
150 | private static string FindShortest(List<string> ls) {
|
---|
151 | var minElem = ls.First();
|
---|
152 | for (int i = 1; i < ls.Count; i++) {
|
---|
153 | if (ls[i].Length < minElem.Length) minElem = ls[i];
|
---|
154 | }
|
---|
155 | return minElem;
|
---|
156 | }
|
---|
157 |
|
---|
158 |
|
---|
159 |
|
---|
160 | #region evaluation
|
---|
161 |
|
---|
162 | // scaling to zero-mean unit variance
|
---|
163 | private static void Scale(double[] evalBuf) {
|
---|
164 | double mean;
|
---|
165 | double variance;
|
---|
166 | var max = evalBuf.Max();
|
---|
167 | for (int i = 0; i < evalBuf.Length; i++) {
|
---|
168 | evalBuf[i] /= max;
|
---|
169 | }
|
---|
170 |
|
---|
171 | OnlineCalculatorError error, varError;
|
---|
172 | OnlineMeanAndVarianceCalculator.Calculate(evalBuf, out mean, out variance, out error, out varError);
|
---|
173 | if(error!=OnlineCalculatorError.None || varError != OnlineCalculatorError.None) {
|
---|
174 | throw new ArgumentException("Cannot scale vector");
|
---|
175 | }
|
---|
176 |
|
---|
177 | for (int i = 0; i < evalBuf.Length; i++) {
|
---|
178 | evalBuf[i] = 1.0 / variance * evalBuf[i] + mean;
|
---|
179 | }
|
---|
180 | }
|
---|
181 |
|
---|
182 | // linear scaling to match target
|
---|
183 | private static void Scale(double[] evalBuf, double[] ys) {
|
---|
184 | double alpha;
|
---|
185 | double beta;
|
---|
186 | HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
|
---|
187 |
|
---|
188 | HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
|
---|
189 | if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
|
---|
190 | throw new ArgumentException();
|
---|
191 | }
|
---|
192 |
|
---|
193 | for (int i = 0; i < evalBuf.Length; i++) {
|
---|
194 | evalBuf[i] = beta * evalBuf[i] + alpha;
|
---|
195 | }
|
---|
196 | }
|
---|
197 |
|
---|
198 | // evaluates infix expressions (using the infix parser)
|
---|
199 | private static IEnumerable<double> EvaluateInfix(string infixExpr, Dataset ds) {
|
---|
200 | var parser = new HeuristicLab.Problems.DataAnalysis.Symbolic.InfixExpressionParser();
|
---|
201 | var tree = parser.Parse(infixExpr);
|
---|
202 | var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
|
---|
203 | return interpreter.GetSymbolicExpressionTreeValues(tree, ds, Enumerable.Range(0, ds.Rows));
|
---|
204 | }
|
---|
205 |
|
---|
206 | /*
|
---|
207 | // evaluates postfix expressions (only for a very specific format)
|
---|
208 | private static void EvaluatePostfix(string postfixExpr, double[] xs, double[] evalBuf) {
|
---|
209 | int topOfStack = -1;
|
---|
210 | Evaluate(postfixExpr, 0, xs, ref topOfStack);
|
---|
211 | Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length);
|
---|
212 | }
|
---|
213 |
|
---|
214 |
|
---|
215 | private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) {
|
---|
216 | while (exprPos < postfixExpr.Length) {
|
---|
217 | switch (postfixExpr[exprPos]) {
|
---|
218 | case '+': {
|
---|
219 | exprPos += 2;
|
---|
220 | var a = stack[topOfStack];
|
---|
221 | var b = stack[topOfStack - 1];
|
---|
222 | for (int i = 0; i < N; i++) {
|
---|
223 | b[i] += a[i];
|
---|
224 | }
|
---|
225 | topOfStack--;
|
---|
226 | break;
|
---|
227 | }
|
---|
228 | case '*': {
|
---|
229 | exprPos += 2;
|
---|
230 | var a = stack[topOfStack];
|
---|
231 | var b = stack[topOfStack - 1];
|
---|
232 | for (int i = 0; i < N; i++) {
|
---|
233 | b[i] *= a[i];
|
---|
234 | }
|
---|
235 | topOfStack--;
|
---|
236 | break;
|
---|
237 | }
|
---|
238 | case 'X': {
|
---|
239 | exprPos += 2;
|
---|
240 | topOfStack++;
|
---|
241 | Array.Copy(xs, stack[topOfStack], N);
|
---|
242 | break;
|
---|
243 | }
|
---|
244 | case 'c': {
|
---|
245 | if (postfixExpr[exprPos + 1] == 'o') {
|
---|
246 | // cos
|
---|
247 | exprPos += 4;
|
---|
248 | var a = stack[topOfStack];
|
---|
249 | for (int i = 0; i < N; i++) {
|
---|
250 | a[i] = Math.Cos(a[i]);
|
---|
251 | }
|
---|
252 | break;
|
---|
253 | } else {
|
---|
254 | exprPos += 2;
|
---|
255 | // put 1 onto top of stack // BUG!
|
---|
256 | topOfStack++;
|
---|
257 | var a = stack[topOfStack];
|
---|
258 | for (int i = 0; i < N; i++) a[i] = 1.0;
|
---|
259 | break;
|
---|
260 | }
|
---|
261 | }
|
---|
262 | case 's': {
|
---|
263 | // sin
|
---|
264 | exprPos += 4;
|
---|
265 | var a = stack[topOfStack];
|
---|
266 | for (int i = 0; i < N; i++) {
|
---|
267 | a[i] = Math.Sin(a[i]);
|
---|
268 | }
|
---|
269 | break;
|
---|
270 | }
|
---|
271 | case 'l': {
|
---|
272 | // log
|
---|
273 | exprPos += 4;
|
---|
274 | var a = stack[topOfStack];
|
---|
275 | for (int i = 0; i < N; i++) {
|
---|
276 | a[i] = Math.Log(a[i]);
|
---|
277 | }
|
---|
278 |
|
---|
279 | break;
|
---|
280 | }
|
---|
281 | case 'e': {
|
---|
282 | // exp
|
---|
283 | exprPos += 4;
|
---|
284 | var a = stack[topOfStack];
|
---|
285 | for (int i = 0; i < N; i++) {
|
---|
286 | a[i] = Math.Exp(a[i]);
|
---|
287 | }
|
---|
288 |
|
---|
289 | break;
|
---|
290 | }
|
---|
291 | case 'i': {
|
---|
292 | // inv
|
---|
293 | exprPos += 4;
|
---|
294 | var a = stack[topOfStack];
|
---|
295 | for (int i = 0; i < N; i++) {
|
---|
296 | a[i] = 1.0 / a[i];
|
---|
297 | }
|
---|
298 | break;
|
---|
299 | }
|
---|
300 | default: {
|
---|
301 | throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr));
|
---|
302 | }
|
---|
303 | }
|
---|
304 | }
|
---|
305 | }
|
---|
306 | */
|
---|
307 | #endregion
|
---|
308 | }
|
---|
309 | }
|
---|