1  using System;


2  using System.Collections.Generic;


3  using System.IO;


4  using System.Linq;


5 


6  namespace ExpressionClustering {


7  class Program {


8  private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);


9  private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences.csv");


10  private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences.csv");


11  private static readonly string outputFileName = Path.Combine(folder, "evaluations.csv.gz");


12  private static readonly string columnDelimiter = ";";


13  private static int N = 100;


14  private static double[] evalBuf = new double[N];


15 


16  // pagie1 (univariate)


17  private static double min = 5.0;


18  private static double max = +5.0;


19  private static double[] xs = Enumerable.Range(1, N).Select(xi => ((double)xi / N) * (max  min) + min).ToArray(); // input


20  private static double[] ys = xs.Select(xi => 1.0 / (1 + Math.Pow(xi, 4))).ToArray(); // target (necessary for scaling and clustering


21 


22 


23  public static int MAX_STACK = 20;


24  public static double[][] stack = new double[MAX_STACK][];


25  static Program() {


26  for (int i = 0; i < MAX_STACK; i++)


27  stack[i] = new double[N];


28  }


29 


30  // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions


31  static void Main(string[] args) {


32 


33  TestFLANN();


34 


35  var hash2Sentences = new Dictionary<string, List<string>>();


36 


37 


38 


39  // read all sentences and determine shortest sentences


40  using (var reader = new StreamReader(allSentencesFileName)) {


41  // read header


42  reader.ReadLine();


43  int nSentences = 0;


44  while (!reader.EndOfStream) {


45  var line = reader.ReadLine();


46  var toks = line.Split(';');


47  var hash = toks[2];


48  List<string> ls;


49  if (!hash2Sentences.TryGetValue(hash, out ls)) {


50  ls = new List<string>(1);


51  hash2Sentences.Add(hash, ls);


52  }


53  ls.Add(toks[1]);


54  nSentences++;


55  }


56 


57  Console.WriteLine("{0} {1}", nSentences, hash2Sentences.Count);


58  //Evaluate(toks[1], xs, evalBuf);


59  }


60 


61  using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress)))


62  foreach (var kvp in hash2Sentences) {


63  var ls = kvp.Value;


64  var sentence = FindShortest(ls);


65  Evaluate(sentence, xs, evalBuf);


66  Add(writer, sentence, evalBuf);


67  }


68  }


69 


70  private static void TestFLANN() {


71  var rand = new Random(1234);


72  var dim = 100;


73  var N = 10000;


74  var dataset = new List<double[]>();


75  for(int i=0;i<N;i++) {


76  var x = new double[dim];


77  for (int j = 0; j < dim; j++) x[j] = rand.NextDouble();


78  dataset.Add(x);


79  }


80  List<int> nnResults;


81  List<double> nnDists;


82  Flann.FindNearestNeighbours(dataset, dataset, out nnResults, out nnDists);


83 


84  }


85 


86  private static string FindShortest(List<string> ls) {


87  var minElem = ls.First();


88  for (int i = 1; i < ls.Count; i++) {


89  if (ls[i].Length < minElem.Length) minElem = ls[i];


90  }


91  return minElem;


92  }


93 


94 


95  #region evaluation


96  // linear scaling


97  private static void Scale(double[] evalBuf, double[] ys) {


98  double alpha;


99  double beta;


100  HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;


101 


102  HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);


103  if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {


104  alpha = 0.0;


105  beta = 1.0;


106  Console.WriteLine("WARNING: error in scaling");


107  }


108 


109  for (int i = 0; i < evalBuf.Length; i++) {


110  evalBuf[i] = beta * evalBuf[i] + alpha;


111  }


112 


113 


114  //


115  // var meanE = 0.0;


116  // var meanE2 = 0.0;


117  // var meanY = 0.0;


118  // var meanY2 = 0.0;


119  // for(int i=0;i<evalBuf.Length;i++) {


120  // var deltaE = evalBuf[i]  meanE;


121  // meanE += deltaE / (i+1);


122  // var deltaE2 = evalBuf[i]  meanE;


123  // meanE2 += deltaE * deltaE2;


124  //


125  //


126  // var deltaY = ys[i]  meanY;


127  // meanY += deltaY / (i + 1);


128  // var deltaY2 = ys[i]  meanY;


129  // meanY2 += deltaY * deltaY2;


130  // TODO COVARIANCE


131  // Linear Scaling: b = cov(y,e) / var(e); a = meanY  b*meanE;


132  // }


133  //


134  // var varE = meanE2 / evalBuf.Length;


135  // var varY = meanY2 / evalBuf.Length;


136  // Console.WriteLine("{0} {1} {2} {3}", meanE, evalBuf.Average(), meanY, ys.Average());


137  // Console.WriteLine("{0} {1}", varE, varY);


138  //


139  // var factor = varY / varE;


140  // for(int i=0;i<evalBuf.Length;i++) {


141  // evalBuf[i] = (evalBuf[i]  meanE) * factor + meanY;


142  // }


143  }


144 


145  // evaluates postfix expressions (only for a very specific format)


146  private static void Evaluate(string postfixExpr, double[] xs, double[] evalBuf) {


147  int topOfStack = 1;


148  Evaluate(postfixExpr, 0, xs, ref topOfStack);


149  Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length);


150  }


151 


152  private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) {


153  while (exprPos < postfixExpr.Length) {


154  switch (postfixExpr[exprPos]) {


155  case '+': {


156  exprPos += 2;


157  var a = stack[topOfStack];


158  var b = stack[topOfStack  1];


159  for (int i = 0; i < N; i++) {


160  a[i] += b[i];


161  }


162  break;


163  }


164  case '*': {


165  exprPos += 2;


166  var a = stack[topOfStack];


167  var b = stack[topOfStack  1];


168  for (int i = 0; i < N; i++) {


169  a[i] *= b[i];


170  }


171  break;


172  }


173  case 'X': {


174  exprPos += 2;


175  topOfStack++;


176  Array.Copy(xs, stack[topOfStack], N);


177  break;


178  }


179  case 'c': {


180  // cos


181  exprPos += 4;


182  var a = stack[topOfStack];


183  for (int i = 0; i < N; i++) {


184  a[i] = Math.Cos(a[i]);


185  }


186  break;


187  }


188  case 's': {


189  // sin


190  exprPos += 4;


191  var a = stack[topOfStack];


192  for (int i = 0; i < N; i++) {


193  a[i] = Math.Sin(a[i]);


194  }


195  break;


196  }


197  case 'l': {


198  // log


199  exprPos += 4;


200  var a = stack[topOfStack];


201  for (int i = 0; i < N; i++) {


202  a[i] = Math.Log(a[i]);


203  }


204 


205  break;


206  }


207  case 'e': {


208  // exp


209  exprPos += 4;


210  var a = stack[topOfStack];


211  for (int i = 0; i < N; i++) {


212  a[i] = Math.Exp(a[i]);


213  }


214 


215  break;


216  }


217  case 'i': {


218  // inv


219  exprPos += 4;


220  var a = stack[topOfStack];


221  for (int i = 0; i < N; i++) {


222  a[i] = 1.0 / a[i];


223  }


224  break;


225  }


226  default: {


227  throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr));


228  }


229  }


230  }


231  }


232  #endregion


233 


234 


235 


236  // add the line with it's evaluation result to a data structure for clustering


237  private static void Add(StreamWriter writer, string line, double[] evalBuf) {


238  var avg = evalBuf.Average();


239  if (double.IsNaN(avg)  double.IsInfinity(avg)) {


240  Console.WriteLine("skipping {0} {1}", evalBuf.Average(), line);


241  } else {


242  Scale(evalBuf, ys);


243 


244  writer.WriteLine(string.Join("\t", evalBuf.Select(ei => ei.ToString(System.Globalization.CultureInfo.InvariantCulture))));


245  }


246  }


247  }


248  }

