1 | using System;
|
---|
2 | using System.Collections.Generic;
|
---|
3 | using System.IO;
|
---|
4 | using System.Linq;
|
---|
5 |
|
---|
6 | namespace ExpressionClustering {
|
---|
7 | class Program {
|
---|
8 | private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
|
---|
9 | private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences.csv");
|
---|
10 | private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences.csv");
|
---|
11 | private static readonly string outputFileName = Path.Combine(folder, "evaluations.csv.gz");
|
---|
12 | private static readonly string columnDelimiter = ";";
|
---|
13 | private static int N = 100;
|
---|
14 | private static double[] evalBuf = new double[N];
|
---|
15 |
|
---|
16 | // pagie-1 (univariate)
|
---|
17 | private static double min = -5.0;
|
---|
18 | private static double max = +5.0;
|
---|
19 | private static double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray(); // input
|
---|
20 | private static double[] ys = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray(); // target (necessary for scaling and clustering
|
---|
21 |
|
---|
22 |
|
---|
23 | public static int MAX_STACK = 20;
|
---|
24 | public static double[][] stack = new double[MAX_STACK][];
|
---|
25 | static Program() {
|
---|
26 | for (int i = 0; i < MAX_STACK; i++)
|
---|
27 | stack[i] = new double[N];
|
---|
28 | }
|
---|
29 |
|
---|
30 | // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions
|
---|
31 | static void Main(string[] args) {
|
---|
32 |
|
---|
33 | TestFLANN();
|
---|
34 |
|
---|
35 | var hash2Sentences = new Dictionary<string, List<string>>();
|
---|
36 |
|
---|
37 |
|
---|
38 |
|
---|
39 | // read all sentences and determine shortest sentences
|
---|
40 | using (var reader = new StreamReader(allSentencesFileName)) {
|
---|
41 | // read header
|
---|
42 | reader.ReadLine();
|
---|
43 | int nSentences = 0;
|
---|
44 | while (!reader.EndOfStream) {
|
---|
45 | var line = reader.ReadLine();
|
---|
46 | var toks = line.Split(';');
|
---|
47 | var hash = toks[2];
|
---|
48 | List<string> ls;
|
---|
49 | if (!hash2Sentences.TryGetValue(hash, out ls)) {
|
---|
50 | ls = new List<string>(1);
|
---|
51 | hash2Sentences.Add(hash, ls);
|
---|
52 | }
|
---|
53 | ls.Add(toks[1]);
|
---|
54 | nSentences++;
|
---|
55 | }
|
---|
56 |
|
---|
57 | Console.WriteLine("{0} {1}", nSentences, hash2Sentences.Count);
|
---|
58 | //Evaluate(toks[1], xs, evalBuf);
|
---|
59 | }
|
---|
60 |
|
---|
61 | using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress)))
|
---|
62 | foreach (var kvp in hash2Sentences) {
|
---|
63 | var ls = kvp.Value;
|
---|
64 | var sentence = FindShortest(ls);
|
---|
65 | Evaluate(sentence, xs, evalBuf);
|
---|
66 | Add(writer, sentence, evalBuf);
|
---|
67 | }
|
---|
68 | }
|
---|
69 |
|
---|
70 | private static void TestFLANN() {
|
---|
71 | var rand = new Random(1234);
|
---|
72 | var dim = 100;
|
---|
73 | var N = 10000;
|
---|
74 | var dataset = new List<double[]>();
|
---|
75 | for(int i=0;i<N;i++) {
|
---|
76 | var x = new double[dim];
|
---|
77 | for (int j = 0; j < dim; j++) x[j] = rand.NextDouble();
|
---|
78 | dataset.Add(x);
|
---|
79 | }
|
---|
80 | List<int> nnResults;
|
---|
81 | List<double> nnDists;
|
---|
82 | Flann.FindNearestNeighbours(dataset, dataset, out nnResults, out nnDists);
|
---|
83 |
|
---|
84 | }
|
---|
85 |
|
---|
86 | private static string FindShortest(List<string> ls) {
|
---|
87 | var minElem = ls.First();
|
---|
88 | for (int i = 1; i < ls.Count; i++) {
|
---|
89 | if (ls[i].Length < minElem.Length) minElem = ls[i];
|
---|
90 | }
|
---|
91 | return minElem;
|
---|
92 | }
|
---|
93 |
|
---|
94 |
|
---|
95 | #region evaluation
|
---|
96 | // linear scaling
|
---|
97 | private static void Scale(double[] evalBuf, double[] ys) {
|
---|
98 | double alpha;
|
---|
99 | double beta;
|
---|
100 | HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
|
---|
101 |
|
---|
102 | HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
|
---|
103 | if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
|
---|
104 | alpha = 0.0;
|
---|
105 | beta = 1.0;
|
---|
106 | Console.WriteLine("WARNING: error in scaling");
|
---|
107 | }
|
---|
108 |
|
---|
109 | for (int i = 0; i < evalBuf.Length; i++) {
|
---|
110 | evalBuf[i] = beta * evalBuf[i] + alpha;
|
---|
111 | }
|
---|
112 |
|
---|
113 |
|
---|
114 | //
|
---|
115 | // var meanE = 0.0;
|
---|
116 | // var meanE2 = 0.0;
|
---|
117 | // var meanY = 0.0;
|
---|
118 | // var meanY2 = 0.0;
|
---|
119 | // for(int i=0;i<evalBuf.Length;i++) {
|
---|
120 | // var deltaE = evalBuf[i] - meanE;
|
---|
121 | // meanE += deltaE / (i+1);
|
---|
122 | // var deltaE2 = evalBuf[i] - meanE;
|
---|
123 | // meanE2 += deltaE * deltaE2;
|
---|
124 | //
|
---|
125 | //
|
---|
126 | // var deltaY = ys[i] - meanY;
|
---|
127 | // meanY += deltaY / (i + 1);
|
---|
128 | // var deltaY2 = ys[i] - meanY;
|
---|
129 | // meanY2 += deltaY * deltaY2;
|
---|
130 | // TODO COVARIANCE
|
---|
131 | // Linear Scaling: b = cov(y,e) / var(e); a = meanY - b*meanE;
|
---|
132 | // }
|
---|
133 | //
|
---|
134 | // var varE = meanE2 / evalBuf.Length;
|
---|
135 | // var varY = meanY2 / evalBuf.Length;
|
---|
136 | // Console.WriteLine("{0} {1} {2} {3}", meanE, evalBuf.Average(), meanY, ys.Average());
|
---|
137 | // Console.WriteLine("{0} {1}", varE, varY);
|
---|
138 | //
|
---|
139 | // var factor = varY / varE;
|
---|
140 | // for(int i=0;i<evalBuf.Length;i++) {
|
---|
141 | // evalBuf[i] = (evalBuf[i] - meanE) * factor + meanY;
|
---|
142 | // }
|
---|
143 | }
|
---|
144 |
|
---|
145 | // evaluates postfix expressions (only for a very specific format)
|
---|
146 | private static void Evaluate(string postfixExpr, double[] xs, double[] evalBuf) {
|
---|
147 | int topOfStack = -1;
|
---|
148 | Evaluate(postfixExpr, 0, xs, ref topOfStack);
|
---|
149 | Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length);
|
---|
150 | }
|
---|
151 |
|
---|
152 | private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) {
|
---|
153 | while (exprPos < postfixExpr.Length) {
|
---|
154 | switch (postfixExpr[exprPos]) {
|
---|
155 | case '+': {
|
---|
156 | exprPos += 2;
|
---|
157 | var a = stack[topOfStack];
|
---|
158 | var b = stack[topOfStack - 1];
|
---|
159 | for (int i = 0; i < N; i++) {
|
---|
160 | a[i] += b[i];
|
---|
161 | }
|
---|
162 | break;
|
---|
163 | }
|
---|
164 | case '*': {
|
---|
165 | exprPos += 2;
|
---|
166 | var a = stack[topOfStack];
|
---|
167 | var b = stack[topOfStack - 1];
|
---|
168 | for (int i = 0; i < N; i++) {
|
---|
169 | a[i] *= b[i];
|
---|
170 | }
|
---|
171 | break;
|
---|
172 | }
|
---|
173 | case 'X': {
|
---|
174 | exprPos += 2;
|
---|
175 | topOfStack++;
|
---|
176 | Array.Copy(xs, stack[topOfStack], N);
|
---|
177 | break;
|
---|
178 | }
|
---|
179 | case 'c': {
|
---|
180 | // cos
|
---|
181 | exprPos += 4;
|
---|
182 | var a = stack[topOfStack];
|
---|
183 | for (int i = 0; i < N; i++) {
|
---|
184 | a[i] = Math.Cos(a[i]);
|
---|
185 | }
|
---|
186 | break;
|
---|
187 | }
|
---|
188 | case 's': {
|
---|
189 | // sin
|
---|
190 | exprPos += 4;
|
---|
191 | var a = stack[topOfStack];
|
---|
192 | for (int i = 0; i < N; i++) {
|
---|
193 | a[i] = Math.Sin(a[i]);
|
---|
194 | }
|
---|
195 | break;
|
---|
196 | }
|
---|
197 | case 'l': {
|
---|
198 | // log
|
---|
199 | exprPos += 4;
|
---|
200 | var a = stack[topOfStack];
|
---|
201 | for (int i = 0; i < N; i++) {
|
---|
202 | a[i] = Math.Log(a[i]);
|
---|
203 | }
|
---|
204 |
|
---|
205 | break;
|
---|
206 | }
|
---|
207 | case 'e': {
|
---|
208 | // exp
|
---|
209 | exprPos += 4;
|
---|
210 | var a = stack[topOfStack];
|
---|
211 | for (int i = 0; i < N; i++) {
|
---|
212 | a[i] = Math.Exp(a[i]);
|
---|
213 | }
|
---|
214 |
|
---|
215 | break;
|
---|
216 | }
|
---|
217 | case 'i': {
|
---|
218 | // inv
|
---|
219 | exprPos += 4;
|
---|
220 | var a = stack[topOfStack];
|
---|
221 | for (int i = 0; i < N; i++) {
|
---|
222 | a[i] = 1.0 / a[i];
|
---|
223 | }
|
---|
224 | break;
|
---|
225 | }
|
---|
226 | default: {
|
---|
227 | throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr));
|
---|
228 | }
|
---|
229 | }
|
---|
230 | }
|
---|
231 | }
|
---|
232 | #endregion
|
---|
233 |
|
---|
234 |
|
---|
235 |
|
---|
236 | // add the line with it's evaluation result to a data structure for clustering
|
---|
237 | private static void Add(StreamWriter writer, string line, double[] evalBuf) {
|
---|
238 | var avg = evalBuf.Average();
|
---|
239 | if (double.IsNaN(avg) || double.IsInfinity(avg)) {
|
---|
240 | Console.WriteLine("skipping {0} {1}", evalBuf.Average(), line);
|
---|
241 | } else {
|
---|
242 | Scale(evalBuf, ys);
|
---|
243 |
|
---|
244 | writer.WriteLine(string.Join("\t", evalBuf.Select(ei => ei.ToString(System.Globalization.CultureInfo.InvariantCulture))));
|
---|
245 | }
|
---|
246 | }
|
---|
247 | }
|
---|
248 | }
|
---|