using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using HeuristicLab.Problems.DataAnalysis;
using HeuristicLab.Problems.DataAnalysis.Symbolic;

// Reads sentences from a gzip-compressed file, determines the shortest infix expression for each hash,
// evaluates those expressions on a fixed grid of input values, and writes the evaluation results to a file.
namespace ExpressionClustering {
  /// <summary>
  /// Console tool that reads semicolon-separated sentences from a gzip-compressed file, keeps the
  /// shortest infix expression per hash, evaluates every kept expression on a fixed grid of x values
  /// and writes the scaled evaluations together with two NMSE values to a gzip-compressed CSV file.
  /// </summary>
  class Program {
    // Number of grid points every expression is evaluated on.
    private const int N = 100;
    // Number of processed rows between two progress messages on the console.
    private const int PERF_STATS_UPDATE_INTERVAL = 100000;
    // Value written instead of the NMSE when the NMSE calculator reports an error.
    private const double NMSE_ON_ERROR = 10;

    // pagie-1 (univariate)
    private const double min = -5.0;
    private const double max = +5.0;

    // Input grid: N equidistant points in (min, max]. The static initializers below depend on
    // xs and therefore must stay after it in textual order.
    private static readonly double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray();

    // a potential target (not used for search)
    private static readonly double[] ys_pagie = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray();

    // x³ * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1)
    // for keijzer x should be in scale 0 - 10, so the grid is shifted by -min: (min, max] -> (0, max - min]
    private static readonly double[] ys_keijzer4 = xs
      .Select(xi => xi - min) // scale
      .Select(xi => xi * xi * xi * Math.Exp(-xi) * Math.Cos(xi) * Math.Sin(xi) * (Math.Sin(xi) * Math.Sin(xi) * Math.Cos(xi) - 1))
      .ToArray();

    /// <summary>
    /// Entry point. Expects the path of the gzip-compressed sentences file as first argument and
    /// writes "evaluations_&lt;input file name&gt;" into the same directory.
    /// </summary>
    /// <param name="args">Command line arguments; args[0] is the sentences file.</param>
    static void Main(string[] args) {
      if (args == null || args.Length < 1) {
        Console.Error.WriteLine("Usage: ExpressionClustering <sentences-file.csv.gz>");
        Environment.ExitCode = 1;
        return;
      }

      var sentencesFileName = args[0];
      // GetDirectoryName returns null for root paths; Path.Combine does not accept null.
      var outputFileName = Path.Combine(
        Path.GetDirectoryName(sentencesFileName) ?? string.Empty,
        "evaluations_" + Path.GetFileName(sentencesFileName));

      var hashToRowIdx = new Dictionary<string, int>();
      var hashToInfix = new Dictionary<string, string>();

      ReadShortestSentences(sentencesFileName, hashToInfix, hashToRowIdx);

      // The targets are scaled in place the same way as the evaluations they are compared to.
      Scale(ys_keijzer4);
      Scale(ys_pagie);

      WriteEvaluations(outputFileName, hashToInfix, hashToRowIdx);
    }

    /// <summary>
    /// Reads all sentences and keeps, for every hash, the shortest infix expression and the
    /// 1-based index of the data row it was found in (the header row is not counted).
    /// </summary>
    /// <param name="sentencesFileName">Gzip-compressed, semicolon-separated file; column 0 is the hash and column 3 the infix expression.</param>
    /// <param name="hashToInfix">Receives hash -> shortest infix expression.</param>
    /// <param name="hashToRowIdx">Receives hash -> row index of that shortest expression.</param>
    /// <exception cref="InvalidDataException">A non-empty row has fewer than four columns.</exception>
    private static void ReadShortestSentences(string sentencesFileName,
      Dictionary<string, string> hashToInfix, Dictionary<string, int> hashToRowIdx) {
      using (var reader = new StreamReader(
        new System.IO.Compression.GZipStream(
          new FileStream(sentencesFileName, FileMode.Open, FileAccess.Read),
          System.IO.Compression.CompressionMode.Decompress))) {
        // read header
        reader.ReadLine();
        int nSentences = 0;
        var sw = Stopwatch.StartNew();
        string line;
        while ((line = reader.ReadLine()) != null) {
          // Blank lines are counted as rows (so row indices keep matching the file) but not parsed.
          nSentences++;
          if (line.Length > 0) {
            var toks = line.Split(';');
            if (toks.Length < 4) {
              throw new InvalidDataException(string.Format(
                "Row {0} of '{1}' has {2} column(s) but at least 4 are expected.",
                nSentences, sentencesFileName, toks.Length));
            }
            var hash = toks[0];
            var infix = toks[3];
            string expr;
            // keep only shortest; on equal length the first occurrence wins
            if (!hashToInfix.TryGetValue(hash, out expr) || expr.Length > infix.Length) {
              hashToInfix[hash] = infix;
              hashToRowIdx[hash] = nSentences;
            }
          }
          if (nSentences % PERF_STATS_UPDATE_INTERVAL == 0) {
            Console.WriteLine("Read perf: {0} sentences in {1}ms", PERF_STATS_UPDATE_INTERVAL, sw.ElapsedMilliseconds);
            sw.Restart();
          }
        }

        Console.WriteLine("{0} {1}", nSentences, hashToInfix.Count);
      }
    }

    /// <summary>
    /// Evaluates every expression in <paramref name="hashToInfix"/> on the input grid and writes one
    /// CSV row per expression that could be evaluated and scaled.
    /// </summary>
    /// <param name="outputFileName">Gzip-compressed output file; an existing file is replaced.</param>
    /// <param name="hashToInfix">hash -> infix expression to evaluate.</param>
    /// <param name="hashToRowIdx">hash -> row index written next to the hash.</param>
    private static void WriteEvaluations(string outputFileName,
      Dictionary<string, string> hashToInfix, Dictionary<string, int> hashToRowIdx) {
      // FileMode.Create truncates an existing file. OpenOrCreate would leave stale trailing bytes
      // behind whenever the new output is shorter than the old one.
      using (var writer = new StreamWriter(
        new System.IO.Compression.GZipStream(
          new FileStream(outputFileName, FileMode.Create, FileAccess.Write),
          System.IO.Compression.CompressionMode.Compress))) {
        var sw = Stopwatch.StartNew();

        var ds = new Dataset(new string[] { "X" }, new IList[] { xs });
        writer.WriteLine("{0};{1};{2};{3};{4};{5}", "Hash", "RowIdx (in allSentences)", "NMSE pagie", "NMSE keijzer4", "infix",
          string.Join(";", Enumerable.Range(0, xs.Length).Select(i => "eval" + i)));

        int nSentences = 0;
        foreach (var kvp in hashToInfix) {
          var row = FormatEvaluationRow(kvp.Key, hashToRowIdx[kvp.Key], kvp.Value, ds);
          if (row != null) writer.WriteLine(row);

          nSentences++;
          if (nSentences % PERF_STATS_UPDATE_INTERVAL == 0) {
            Console.WriteLine("Eval perf: {0} sentences in {1}ms expected time remaining: {2}min",
              PERF_STATS_UPDATE_INTERVAL, sw.ElapsedMilliseconds,
              (hashToRowIdx.Count - nSentences) / (double)PERF_STATS_UPDATE_INTERVAL * sw.ElapsedMilliseconds / 1000 / 60);
            sw.Restart();
          }
        }
      }
    }

    /// <summary>
    /// Evaluates one infix expression, scales the result and formats the output row.
    /// </summary>
    /// <returns>
    /// The semicolon-separated row, or null when the evaluation contains NaN/Infinity or the
    /// evaluation cannot be scaled (such expressions are skipped).
    /// </returns>
    private static string FormatEvaluationRow(string hash, int rowIdx, string infixExpr, Dataset ds) {
      var values = EvaluateInfix(infixExpr, ds).ToArray();
      if (values.Any(v => double.IsInfinity(v) || double.IsNaN(v))) return null;

      try {
        Scale(values);
      } catch (ArgumentException) {
        // scaling failed
        return null;
      }

      OnlineCalculatorError error;
      var nmsePagie = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(values, ys_pagie, out error);
      if (error != OnlineCalculatorError.None) nmsePagie = NMSE_ON_ERROR;
      var nmseKeijzer = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(values, ys_keijzer4, out error);
      if (error != OnlineCalculatorError.None) nmseKeijzer = NMSE_ON_ERROR;

      // Numbers are written culture-invariant so the file does not depend on the machine's locale.
      var invariant = CultureInfo.InvariantCulture;
      return string.Format(invariant, "{0};{1};{2};{3};{4};{5}", hash, rowIdx, nmsePagie, nmseKeijzer, infixExpr,
        string.Join(";", values.Select(v => v.ToString(invariant))));
    }

    #region evaluation

    /// <summary>
    /// Scales the vector in place to zero mean and unit variance. NaN and +/-Infinity are ignored
    /// when the statistics are computed; afterwards NaN is treated as the mean and +/-Infinity as
    /// +/-10 (relative to values normalized to [-1, 1]).
    /// </summary>
    /// <param name="evalBuf">Vector to scale; modified in place.</param>
    /// <exception cref="ArgumentException">
    /// The vector has no finite non-zero element, the mean/variance cannot be calculated, or the
    /// variance is zero.
    /// </exception>
    private static void Scale(double[] evalBuf) {
      if (evalBuf == null) throw new ArgumentNullException("evalBuf");

      // Largest magnitude among the finite elements only; an infinite element must not turn the
      // normalization below into x / Infinity.
      double maxAbs = 0.0;
      foreach (var v in evalBuf) {
        if (double.IsNaN(v) || double.IsInfinity(v)) continue;
        maxAbs = Math.Max(maxAbs, Math.Abs(v));
      }
      if (maxAbs == 0.0) throw new ArgumentException("Cannot scale vector");

      // Normalize finite values to [-1, 1] first - presumably to keep the variance calculation
      // from overflowing for very large evaluations.
      for (int i = 0; i < evalBuf.Length; i++) {
        evalBuf[i] /= maxAbs;
      }

      double mean;
      double variance;
      OnlineCalculatorError error, varError;
      OnlineMeanAndVarianceCalculator.Calculate(evalBuf.Where(xi => !double.IsNaN(xi) && !double.IsInfinity(xi)), out mean, out variance, out error, out varError);
      if (error != OnlineCalculatorError.None || varError != OnlineCalculatorError.None) {
        throw new ArgumentException("Cannot scale vector");
      }
      // A constant vector cannot be brought to unit variance (also rejects a NaN variance).
      if (!(variance > 0.0)) {
        throw new ArgumentException("Cannot scale vector");
      }

      var stdDev = Math.Sqrt(variance);
      for (int i = 0; i < evalBuf.Length; i++) {
        if (double.IsNaN(evalBuf[i])) evalBuf[i] = mean;
        else if (double.IsPositiveInfinity(evalBuf[i])) evalBuf[i] = 10;
        else if (double.IsNegativeInfinity(evalBuf[i])) evalBuf[i] = -10.0;
        evalBuf[i] = (evalBuf[i] - mean) / stdDev;
      }
    }

    /// <summary>
    /// Linear scaling to match a target: replaces every element x by beta * x + alpha, with alpha
    /// and beta taken from <see cref="OnlineLinearScalingParameterCalculator"/>.
    /// </summary>
    /// <param name="evalBuf">Vector to scale; modified in place.</param>
    /// <param name="ys">Target vector.</param>
    /// <exception cref="ArgumentException">The scaling parameters cannot be calculated.</exception>
    private static void Scale(double[] evalBuf, double[] ys) {
      double alpha;
      double beta;
      OnlineCalculatorError error;

      OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
      if (error != OnlineCalculatorError.None) {
        throw new ArgumentException("Cannot linearly scale vector to target");
      }

      for (int i = 0; i < evalBuf.Length; i++) {
        evalBuf[i] = beta * evalBuf[i] + alpha;
      }
    }

    /// <summary>
    /// Evaluates an infix expression (using the infix parser) on all rows of the dataset.
    /// </summary>
    /// <param name="infixExpr">Expression in infix notation.</param>
    /// <param name="ds">Dataset providing the variable values.</param>
    /// <returns>One value per dataset row, as produced by the linear interpreter.</returns>
    private static IEnumerable<double> EvaluateInfix(string infixExpr, Dataset ds) {
      var parser = new InfixExpressionParser();
      var tree = parser.Parse(infixExpr);
      var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
      return interpreter.GetSymbolicExpressionTreeValues(tree, ds, Enumerable.Range(0, ds.Rows));
    }
    #endregion
  }
}