1 | using System;
|
---|
2 | using System.Collections.Generic;
|
---|
3 | using System.Drawing;
|
---|
4 | using System.IO;
|
---|
5 | using System.Linq;
|
---|
6 | using HeuristicLab.Analysis;
|
---|
7 | using HeuristicLab.Analysis.Views;
|
---|
8 | using System.Windows.Forms;
|
---|
9 |
|
---|
10 | namespace ExpressionClustering {
|
---|
11 | class Program {
|
---|
12 | private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
|
---|
13 | private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");
|
---|
14 | private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences.csv");
|
---|
15 | private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences.csv");
|
---|
16 | private static readonly string outputFileName = Path.Combine(folder, "evaluations.csv.gz");
|
---|
17 | private static int N = 100;
|
---|
18 | private static double[] evalBuf = new double[N];
|
---|
19 |
|
---|
20 | // pagie-1 (univariate)
|
---|
21 | private static double min = -5.0;
|
---|
22 | private static double max = +5.0;
|
---|
23 | private static double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray(); // input
|
---|
24 | private static double[] ys = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray(); // target (necessary for scaling and clustering
|
---|
25 |
|
---|
26 |
|
---|
27 | public static int MAX_STACK = 20;
|
---|
28 | public static double[][] stack = new double[MAX_STACK][];
|
---|
29 | static Program() {
|
---|
30 | for (int i = 0; i < MAX_STACK; i++)
|
---|
31 | stack[i] = new double[N];
|
---|
32 | }
|
---|
33 |
|
---|
34 | // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions
|
---|
35 | static void Main(string[] args) {
|
---|
36 |
|
---|
37 | var hash2Sentences = new Dictionary<string, List<string>>();
|
---|
38 | // for debugging only
|
---|
39 | var postfix2infix = new Dictionary<string, string>();
|
---|
40 |
|
---|
41 |
|
---|
42 |
|
---|
43 | // read all sentences and determine shortest sentences
|
---|
44 | using (var reader = new StreamReader(allSentencesFileName)) {
|
---|
45 | // read header
|
---|
46 | reader.ReadLine();
|
---|
47 | int nSentences = 0;
|
---|
48 | while (!reader.EndOfStream) {
|
---|
49 | var line = reader.ReadLine();
|
---|
50 | var toks = line.Split(';');
|
---|
51 | var hash = toks[2];
|
---|
52 | List<string> ls;
|
---|
53 | if (!hash2Sentences.TryGetValue(hash, out ls)) {
|
---|
54 | ls = new List<string>(1);
|
---|
55 | hash2Sentences.Add(hash, ls);
|
---|
56 | }
|
---|
57 | ls.Add(toks[1]);
|
---|
58 | postfix2infix.Add(toks[1], toks[0]);
|
---|
59 | nSentences++;
|
---|
60 | }
|
---|
61 |
|
---|
62 | Console.WriteLine("{0} {1}", nSentences, hash2Sentences.Count);
|
---|
63 | //Evaluate(toks[1], xs, evalBuf);
|
---|
64 | }
|
---|
65 |
|
---|
66 | List<double[]> functions = new List<double[]>();
|
---|
67 | List<string> sentences = new List<string>();
|
---|
68 | List<double> qualities = new List<double>();
|
---|
69 |
|
---|
70 | foreach (var kvp in hash2Sentences) {
|
---|
71 | var ls = kvp.Value;
|
---|
72 | var sentence = FindShortest(ls);
|
---|
73 | Evaluate(sentence, xs, evalBuf);
|
---|
74 | if (evalBuf.Any(ei => float.IsInfinity((float)ei) || float.IsNaN((float)ei))) {
|
---|
75 | Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence);
|
---|
76 | } else {
|
---|
77 | try {
|
---|
78 | Scale(evalBuf, ys);
|
---|
79 | functions.Add((double[])evalBuf.Clone());
|
---|
80 | sentences.Add(sentence);
|
---|
81 | HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
|
---|
82 | qualities.Add(HeuristicLab.Problems.DataAnalysis.OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys, out error));
|
---|
83 | } catch (ArgumentException e) {
|
---|
84 | // scaling failed
|
---|
85 | }
|
---|
86 | }
|
---|
87 | }
|
---|
88 |
|
---|
89 | List<int> clusters;
|
---|
90 | List<double> distances;
|
---|
91 | Flann.FindClusters(functions, out clusters, out distances, 100);
|
---|
92 |
|
---|
93 | // output all clusters and functions
|
---|
94 | using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) {
|
---|
95 | for (int i = 0; i < functions.Count; i++) {
|
---|
96 | writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], qualities[i], sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString())));
|
---|
97 | }
|
---|
98 | }
|
---|
99 |
|
---|
100 | var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2);
|
---|
101 | var dtView = new DataTableView();
|
---|
102 | dtView.Size = new Size(800, 600);
|
---|
103 |
|
---|
104 | foreach (var funCluster in funClusters) {
|
---|
105 | // draw the functions for each cluster into a separate png
|
---|
106 | var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average());
|
---|
107 | var dt = new DataTable(dtName, dtName);
|
---|
108 | var rows = new List<DataRow>();
|
---|
109 | int i = 0;
|
---|
110 | foreach (var fun in funCluster.Select(t => t.Item1)) {
|
---|
111 | var name = i.ToString();
|
---|
112 | var dr = new DataRow(name, name, fun);
|
---|
113 | rows.Add(dr);
|
---|
114 | i++;
|
---|
115 | }
|
---|
116 | dt.Rows.AddRange(rows);
|
---|
117 | dtView.Content = dt;
|
---|
118 | using (var bm = new Bitmap(800, 600)) {
|
---|
119 | dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600));
|
---|
120 | bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key)));
|
---|
121 | }
|
---|
122 | }
|
---|
123 | }
|
---|
124 |
|
---|
125 |
|
---|
126 |
|
---|
127 | private static string FindShortest(List<string> ls) {
|
---|
128 | var minElem = ls.First();
|
---|
129 | for (int i = 1; i < ls.Count; i++) {
|
---|
130 | if (ls[i].Length < minElem.Length) minElem = ls[i];
|
---|
131 | }
|
---|
132 | return minElem;
|
---|
133 | }
|
---|
134 |
|
---|
135 |
|
---|
136 |
|
---|
137 | #region evaluation
|
---|
138 | // linear scaling
|
---|
139 | private static void Scale(double[] evalBuf, double[] ys) {
|
---|
140 | double alpha;
|
---|
141 | double beta;
|
---|
142 | HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
|
---|
143 |
|
---|
144 | HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
|
---|
145 | if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
|
---|
146 | throw new ArgumentException();
|
---|
147 | }
|
---|
148 |
|
---|
149 | for (int i = 0; i < evalBuf.Length; i++) {
|
---|
150 | evalBuf[i] = beta * evalBuf[i] + alpha;
|
---|
151 | }
|
---|
152 | }
|
---|
153 |
|
---|
154 | // evaluates postfix expressions (only for a very specific format)
|
---|
155 | private static void Evaluate(string postfixExpr, double[] xs, double[] evalBuf) {
|
---|
156 | int topOfStack = -1;
|
---|
157 | Evaluate(postfixExpr, 0, xs, ref topOfStack);
|
---|
158 | Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length);
|
---|
159 | }
|
---|
160 |
|
---|
161 | private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) {
|
---|
162 | while (exprPos < postfixExpr.Length) {
|
---|
163 | switch (postfixExpr[exprPos]) {
|
---|
164 | case '+': {
|
---|
165 | exprPos += 2;
|
---|
166 | var a = stack[topOfStack];
|
---|
167 | var b = stack[topOfStack - 1];
|
---|
168 | for (int i = 0; i < N; i++) {
|
---|
169 | b[i] += a[i];
|
---|
170 | }
|
---|
171 | topOfStack--;
|
---|
172 | break;
|
---|
173 | }
|
---|
174 | case '*': {
|
---|
175 | exprPos += 2;
|
---|
176 | var a = stack[topOfStack];
|
---|
177 | var b = stack[topOfStack - 1];
|
---|
178 | for (int i = 0; i < N; i++) {
|
---|
179 | b[i] *= a[i];
|
---|
180 | }
|
---|
181 | topOfStack--;
|
---|
182 | break;
|
---|
183 | }
|
---|
184 | case 'X': {
|
---|
185 | exprPos += 2;
|
---|
186 | topOfStack++;
|
---|
187 | Array.Copy(xs, stack[topOfStack], N);
|
---|
188 | break;
|
---|
189 | }
|
---|
190 | case 'c': {
|
---|
191 | // cos
|
---|
192 | exprPos += 4;
|
---|
193 | var a = stack[topOfStack];
|
---|
194 | for (int i = 0; i < N; i++) {
|
---|
195 | a[i] = Math.Cos(a[i]);
|
---|
196 | }
|
---|
197 | break;
|
---|
198 | }
|
---|
199 | case 's': {
|
---|
200 | // sin
|
---|
201 | exprPos += 4;
|
---|
202 | var a = stack[topOfStack];
|
---|
203 | for (int i = 0; i < N; i++) {
|
---|
204 | a[i] = Math.Sin(a[i]);
|
---|
205 | }
|
---|
206 | break;
|
---|
207 | }
|
---|
208 | case 'l': {
|
---|
209 | // log
|
---|
210 | exprPos += 4;
|
---|
211 | var a = stack[topOfStack];
|
---|
212 | for (int i = 0; i < N; i++) {
|
---|
213 | a[i] = Math.Log(a[i]);
|
---|
214 | }
|
---|
215 |
|
---|
216 | break;
|
---|
217 | }
|
---|
218 | case 'e': {
|
---|
219 | // exp
|
---|
220 | exprPos += 4;
|
---|
221 | var a = stack[topOfStack];
|
---|
222 | for (int i = 0; i < N; i++) {
|
---|
223 | a[i] = Math.Exp(a[i]);
|
---|
224 | }
|
---|
225 |
|
---|
226 | break;
|
---|
227 | }
|
---|
228 | case 'i': {
|
---|
229 | // inv
|
---|
230 | exprPos += 4;
|
---|
231 | var a = stack[topOfStack];
|
---|
232 | for (int i = 0; i < N; i++) {
|
---|
233 | a[i] = 1.0 / a[i];
|
---|
234 | }
|
---|
235 | break;
|
---|
236 | }
|
---|
237 | default: {
|
---|
238 | throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr));
|
---|
239 | }
|
---|
240 | }
|
---|
241 | }
|
---|
242 | }
|
---|
243 | #endregion
|
---|
244 | }
|
---|
245 | }
|
---|