source: branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Program.cs @ 15924

Last change on this file since 15924 was 15924, checked in by gkronber, 4 years ago

#2886 remove obsolete code in C# program for the evaluation of sentences, switch to NMSE as quality measure. Tried plotting functions within clusters in R

File size: 8.5 KB
Line 
1using System;
2using System.Collections;
3using System.Collections.Generic;
4using System.Diagnostics;                                                                                             
5using System.IO;
6using System.Linq;                                                                                                     
7using HeuristicLab.Problems.DataAnalysis;
8using HeuristicLab.Problems.DataAnalysis.Symbolic;
9
// Reads sentences from a gzip-compressed, semicolon-separated file, determines the shortest infix expression
// for each hash, evaluates these expressions on a fixed grid of input values and writes the evaluation
// results (together with the NMSE to two reference targets) to a second gzip-compressed file.
namespace ExpressionClustering {
  class Program {
    private const int N = 100;                              // number of data points
    private const int PERF_STATS_UPDATE_INTERVAL = 100000;  // print throughput after this many sentences
    private const double NMSE_ON_ERROR = 10;                // NMSE written when the calculator reports an error

    // pagie-1 (univariate)
    private static readonly double min = -5.0;
    private static readonly double max = +5.0;
    // N equidistant points in (min, max]
    private static readonly double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max - min) + min).ToArray(); // input

    private static readonly double[] ys_pagie = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, -4))).ToArray(); // a potential target (not used for search)

    // x³  * exp(-x) * cos(x) * sin(x) * (sin(x)² * cos(x) - 1)
    // for keijzer x should be in scale 0 - 10 inclusive
    private static readonly double[] ys_keijzer4 = xs
      .Select(xi => (xi - min) / (max - min) * 10.0) // map [min, max] onto [0, 10]
      .Select(xi => xi * xi * xi * Math.Exp(-xi) * Math.Cos(xi) * Math.Sin(xi) * (Math.Sin(xi) * Math.Sin(xi) * Math.Cos(xi) - 1))
      .ToArray();


    // Entry point.
    // args[0]: path of the gzip-compressed sentences file. The result is written next to it as
    //          "evaluations_" + <file name of args[0]>.
    static void Main(string[] args) {
      if (args == null || args.Length < 1) {
        Console.Error.WriteLine("Usage: {0} <sentences file (.csv.gz)>", AppDomain.CurrentDomain.FriendlyName);
        Environment.ExitCode = 1;
        return;
      }

      var sentencesFileName = args[0];
      // GetDirectoryName returns null for root paths; Path.Combine must not be called with null
      var outputFileName = Path.Combine(Path.GetDirectoryName(sentencesFileName) ?? string.Empty, "evaluations_" + Path.GetFileName(sentencesFileName));

      var hashToRowIdx = new Dictionary<string, int>();
      var hashToInfix = new Dictionary<string, string>();

      // read all sentences and determine shortest sentences
      ReadShortestSentences(sentencesFileName, hashToInfix, hashToRowIdx);

      // targets are scaled in the same way as the evaluated expressions
      Scale(ys_keijzer4);
      Scale(ys_pagie);

      // output all functions
      WriteEvaluations(outputFileName, hashToInfix, hashToRowIdx);
    }

    // Reads the sentences file (one header line, then rows "hash;length;postfix;infix") and keeps, for each
    // hash, the shortest infix string together with the 1-based index of the data row it came from.
    private static void ReadShortestSentences(string fileName, Dictionary<string, string> hashToInfix, Dictionary<string, int> hashToRowIdx) {
      using (var reader = new StreamReader(
        new System.IO.Compression.GZipStream(
          new FileStream(fileName, FileMode.Open, FileAccess.Read),
          System.IO.Compression.CompressionMode.Decompress))) {
        // read header
        reader.ReadLine();
        int nSentences = 0;
        int nMalformed = 0;
        var sw = Stopwatch.StartNew();
        string line;
        while ((line = reader.ReadLine()) != null) {
          nSentences++; // malformed rows are counted as well so that row indices match the file
          var toks = line.Split(';');
          if (toks.Length < 4) {
            // e.g. a trailing blank line; previously this crashed with an IndexOutOfRangeException
            nMalformed++;
          } else {
            var hash = toks[0];
            var infix = toks[3];
            string expr;
            if (!hashToInfix.TryGetValue(hash, out expr) || expr.Length > infix.Length) {
              hashToInfix[hash] = infix;  // keep only shortest
              hashToRowIdx[hash] = nSentences;
            }
          }
          if (nSentences % PERF_STATS_UPDATE_INTERVAL == 0) {
            Console.WriteLine("Read perf: {0} sentences in {1}ms", PERF_STATS_UPDATE_INTERVAL, sw.ElapsedMilliseconds);
            sw.Restart();
          }
        }

        Console.WriteLine("{0} {1}", nSentences, hashToInfix.Count);
        if (nMalformed > 0) {
          Console.Error.WriteLine("Warning: skipped {0} rows with fewer than 4 fields", nMalformed);
        }
      }
    }

    // Evaluates every expression on xs, scales the result and writes one row per expression:
    // hash; row index; NMSE to pagie; NMSE to keijzer4; infix; N evaluated values.
    // Expressions that evaluate to NaN/Inf anywhere or that cannot be scaled are left out.
    // NOTE(review): numbers are formatted with the current culture (as before); readers of the file must use the same convention.
    private static void WriteEvaluations(string fileName, Dictionary<string, string> hashToInfix, Dictionary<string, int> hashToRowIdx) {
      // FileMode.Create truncates an existing file; OpenOrCreate would leave stale bytes behind the new gzip data
      using (var writer = new StreamWriter(
        new System.IO.Compression.GZipStream(
          new FileStream(fileName, FileMode.Create, FileAccess.Write),
          System.IO.Compression.CompressionMode.Compress))) {
        var sw = Stopwatch.StartNew();

        var ds = new Dataset(new string[] { "X" }, new IList[] { xs });
        writer.WriteLine("{0};{1};{2};{3};{4};{5}", "Hash", "RowIdx (in allSentences)", "NMSE pagie", "NMSE keijzer4", "infix",
          string.Join(";", Enumerable.Range(0, xs.Length).Select(i => "eval" + i)));
        int nSentences = 0;
        foreach (var kvp in hashToInfix) {
          var hash = kvp.Key;
          var infixExpr = kvp.Value;
          var evalBuf = EvaluateInfix(infixExpr, ds).ToArray();
          if (evalBuf.All(ei => IsFinite(ei))) {
            try {
              Scale(evalBuf);
              OnlineCalculatorError error;
              var nmse_pagie = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(evalBuf, ys_pagie, out error);
              if (error != OnlineCalculatorError.None) nmse_pagie = NMSE_ON_ERROR;
              var nmse_keijzer = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(evalBuf, ys_keijzer4, out error);
              if (error != OnlineCalculatorError.None) nmse_keijzer = NMSE_ON_ERROR;
              writer.WriteLine("{0};{1};{2};{3};{4};{5}", hash, hashToRowIdx[hash], nmse_pagie, nmse_keijzer, infixExpr,
                string.Join(";", evalBuf.Select(fi => fi.ToString())));
            } catch (ArgumentException) {
              // scaling failed (e.g. constant expression); the expression is deliberately left out of the output
            }
          }

          if (nSentences++ % PERF_STATS_UPDATE_INTERVAL == PERF_STATS_UPDATE_INTERVAL - 1) {
            Console.WriteLine("Eval perf: {0} sentences in {1}ms expected time remaining: {2}min",
              PERF_STATS_UPDATE_INTERVAL, sw.ElapsedMilliseconds,
              (hashToInfix.Count - nSentences) / (double)PERF_STATS_UPDATE_INTERVAL * sw.ElapsedMilliseconds / 1000 / 60);
            sw.Restart();
          }
        }
      }
    }


    #region evaluation

    // true if x is neither NaN nor +/-Inf
    private static bool IsFinite(double x) {
      return !double.IsNaN(x) && !double.IsInfinity(x);
    }

    // In-place scaling to zero-mean unit variance (mean and variance are calculated from finite values only).
    // NaN entries are replaced by the mean (-> 0 after scaling), +/-Inf by +/-10 before the final transformation.
    // Throws ArgumentException when the vector cannot be scaled (no finite non-zero value, or zero variance).
    private static void Scale(double[] evalBuf) {
      // normalize by the largest finite magnitude first; the maximum must ignore NaN/Inf,
      // otherwise a single infinite entry would wipe out all finite values
      var largest = evalBuf.Where(xi => IsFinite(xi)).Select(xi => Math.Abs(xi)).DefaultIfEmpty(0.0).Max();
      if (largest == 0.0) {
        throw new ArgumentException("Cannot scale vector");
      }
      for (int i = 0; i < evalBuf.Length; i++) {
        evalBuf[i] /= largest;
      }

      double mean;
      double variance;
      OnlineCalculatorError error, varError;
      OnlineMeanAndVarianceCalculator.Calculate(evalBuf.Where(xi => IsFinite(xi)), out mean, out variance, out error, out varError);
      // !(variance > 0) also rejects NaN
      if (error != OnlineCalculatorError.None || varError != OnlineCalculatorError.None || !(variance > 0.0)) {
        throw new ArgumentException("Cannot scale vector");
      }

      var stdDev = Math.Sqrt(variance);
      for (int i = 0; i < evalBuf.Length; i++) {
        var v = evalBuf[i];
        if (double.IsNaN(v)) v = mean;
        else if (double.IsPositiveInfinity(v)) v = 10;
        else if (double.IsNegativeInfinity(v)) v = -10.0;
        evalBuf[i] = (v - mean) / stdDev;
      }
    }

    // In-place linear scaling (beta * x + alpha) to match the target ys.
    // Throws ArgumentException when the scaling parameters cannot be calculated.
    private static void Scale(double[] evalBuf, double[] ys) {
      double alpha;
      double beta;
      OnlineCalculatorError error;

      OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
      if (error != OnlineCalculatorError.None) {
        throw new ArgumentException("Cannot linearly scale vector");
      }

      for (int i = 0; i < evalBuf.Length; i++) {
        evalBuf[i] = beta * evalBuf[i] + alpha;
      }
    }

    // evaluates infix expressions (using the infix parser) on all rows of ds
    private static IEnumerable<double> EvaluateInfix(string infixExpr, Dataset ds) {
      var parser = new InfixExpressionParser();
      var tree = parser.Parse(infixExpr);
      var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter();
      return interpreter.GetSymbolicExpressionTreeValues(tree, ds, Enumerable.Range(0, ds.Rows));
    }
    #endregion
  }
}
Note: See TracBrowser for help on using the repository browser.