Changeset 15924 for branches/2886_SymRegGrammarEnumeration
- Timestamp:
- 04/30/18 20:24:17 (7 years ago)
- Location:
- branches/2886_SymRegGrammarEnumeration
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2886_SymRegGrammarEnumeration/ExpressionClustering/ExpressionClustering.csproj
r15903 r15924 32 32 <ErrorReport>prompt</ErrorReport> 33 33 <WarningLevel>4</WarningLevel> 34 </PropertyGroup> 35 <PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'"> 36 <DebugSymbols>true</DebugSymbols> 37 <OutputPath>bin\x64\Debug\</OutputPath> 38 <DefineConstants>DEBUG;TRACE</DefineConstants> 39 <DebugType>full</DebugType> 40 <PlatformTarget>x64</PlatformTarget> 41 <ErrorReport>prompt</ErrorReport> 42 <CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet> 43 <Prefer32Bit>true</Prefer32Bit> 44 </PropertyGroup> 45 <PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'"> 46 <OutputPath>bin\x64\Release\</OutputPath> 47 <DefineConstants>TRACE</DefineConstants> 48 <Optimize>true</Optimize> 49 <DebugType>pdbonly</DebugType> 50 <PlatformTarget>x64</PlatformTarget> 51 <ErrorReport>prompt</ErrorReport> 52 <CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet> 53 <Prefer32Bit>true</Prefer32Bit> 34 54 </PropertyGroup> 35 55 <ItemGroup> -
branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Program.cs
r15903 r15924 2 2 using System.Collections; 3 3 using System.Collections.Generic; 4 using System.D rawing;4 using System.Diagnostics; 5 5 using System.IO; 6 using System.Linq; 7 using HeuristicLab.Analysis; 8 using HeuristicLab.Analysis.Views; 9 using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding; 6 using System.Linq; 10 7 using HeuristicLab.Problems.DataAnalysis; 11 8 using HeuristicLab.Problems.DataAnalysis.Symbolic; 12 9 13 // Evaluates sentences on randomly generated data10 // Reads sentences from files, determines the shortest infix expression for a hash and evaluates sentences on randomly generated data, evaluation result is written to file 14 11 namespace ExpressionClustering { 15 12 class Program { 16 private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);17 private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");18 private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences_2018-04-13_09-52_TreeSize-10.csv");19 private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences_2018-04-13_09-52_TreeSize-10.csv");20 private static readonly string outputFileName = Path.Combine(folder, "evaluations_2018-04-13_09-52_TreeSize-10.csv.gz");13 //private static readonly string folder = @"D:\heal\documents\trunk\Publications\2018\GPTP\data"; 14 //private static readonly string clusterFolder = folder; 15 //private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences_2018-04-13_16-40_TreeSize-7.csv.gz"); 16 //private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences_2018-04-16_14-49_TreeSize-8_1d.csv.gz"); 17 //private static readonly string outputFileName = Path.Combine(folder, "evaluations_2018-04-16_14-49_TreeSize-8_1d.csv.gz"); 21 18 private static int N = 100; 19 private static int PERF_STATS_UPDATE_INTERVAL = 100000; 22 20 private static double[] evalBuf = new double[N]; 23 21 … … 36 34 .ToArray(); 37 35 38 39 public static int MAX_STACK = 20; 40 public static double[][] stack = new double[MAX_STACK][]; 41 static Program() { 42 for (int i = 0; i < MAX_STACK; i++) 43 stack[i] = new double[N]; 44 } 45 36 46 37 // loads symbolic expressions in postfix notation from a stream and identifies clusters of expressions 47 38 static void Main(string[] args) { 39 var sentencesFileName = args[0]; 40 var outputFileName = Path.Combine(Path.GetDirectoryName(sentencesFileName), "evaluations_" + Path.GetFileName(sentencesFileName)); 41 48 42 49 var hash 2Postfix = new Dictionary<string, List<string>>();50 var postfix2infix = new Dictionary<string, string>();43 var hashToRowIdx = new Dictionary<string, int>(); 44 var hashToInfix = new Dictionary<string, string>(); 51 45 52 46 // read all sentences and determine shortest sentences 53 using (var reader = new StreamReader(allSentencesFileName)) { 47 using (var reader = new StreamReader( 48 new System.IO.Compression.GZipStream( 49 new FileStream(sentencesFileName, FileMode.Open, FileAccess.Read), 50 System.IO.Compression.CompressionMode.Decompress))) { 54 51 // read header 55 52 reader.ReadLine(); 56 53 int nSentences = 0; 54 var sw = new Stopwatch(); 55 sw.Start(); 57 56 while (!reader.EndOfStream) { 58 57 var line = reader.ReadLine(); 58 nSentences++; 59 59 var toks = line.Split(';'); 60 60 var hash = toks[0]; 61 61 var length = toks[1]; 62 var postfix = toks[2];62 //var postfix = toks[2]; 63 63 var infix = toks[3]; 64 List<string> alternativesList;65 if (!hash 2Postfix.TryGetValue(hash, out alternativesList)) {66 alternativesList = new List<string>(1);67 hash 2Postfix.Add(hash, alternativesList);64 string expr; 65 if (!hashToInfix.TryGetValue(hash, out expr)) { 66 hashToInfix.Add(hash, infix); 67 hashToRowIdx.Add(hash, nSentences); 68 68 } 69 alternativesList.Add(postfix); 70 postfix2infix.Add(postfix, infix); 71 nSentences++; 69 else if(expr.Length > infix.Length) { 70 hashToInfix[hash] = infix; // keep only shortest 71 hashToRowIdx[hash] = nSentences; 72 } 73 if (nSentences % PERF_STATS_UPDATE_INTERVAL == PERF_STATS_UPDATE_INTERVAL-1) { 74 Console.WriteLine("Read perf: {0} sentences in {1}ms", PERF_STATS_UPDATE_INTERVAL, sw.ElapsedMilliseconds); 75 sw.Restart(); 76 } 72 77 } 73 78 74 Console.WriteLine("{0} {1}", nSentences, hash 2Postfix.Count);79 Console.WriteLine("{0} {1}", nSentences, hashToInfix.Count); 75 80 //Evaluate(toks[1], xs, evalBuf); 76 81 } 77 82 78 List<double[]> functions = new List<double[]>(); 79 List<string> sentences = new List<string>(); 80 List<double[]> qualities = new List<double[]>(); // we might have multiple target functions to which we might compare 83 Scale(ys_keijzer4); 84 Scale(ys_pagie); 81 85 82 var ds = new Dataset(new string[] { "X" }, new IList[] { xs }); 83 foreach (var kvp in hash2Postfix) { 84 var ls = kvp.Value; 85 var sentence = FindShortest(ls); 86 //EvaluatePostfix(sentence, xs, evalBuf); 87 evalBuf = EvaluateInfix(postfix2infix[sentence], ds).ToArray(); 88 if (evalBuf.Any(ei => double.IsInfinity(ei) || double.IsNaN(ei))) { 89 Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence); 90 } else { 91 try { 92 Scale(evalBuf); 93 functions.Add((double[])evalBuf.Clone()); 94 sentences.Add(sentence); 95 OnlineCalculatorError error; 96 var r2_pagie = OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys_pagie, out error); 97 if (error != OnlineCalculatorError.None) r2_pagie = 0.0; 98 var r2_keijzer4 = OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys_keijzer4, out error); 99 if (error != OnlineCalculatorError.None) r2_keijzer4 = 0.0; 100 qualities.Add(new double[] { r2_pagie, r2_keijzer4}); 101 } catch (ArgumentException e) { 102 // scaling failed 86 // output all functions 87 using (var writer = new StreamWriter( 88 new System.IO.Compression.GZipStream( 89 new FileStream(outputFileName, FileMode.OpenOrCreate), 90 System.IO.Compression.CompressionMode.Compress))) { 91 var sw = new Stopwatch(); 92 sw.Start(); 93 94 var ds = new Dataset(new string[] { "X" }, new IList[] { xs }); 95 writer.WriteLine("{0};{1};{2};{3};{4};{5}", "Hash", "RowIdx (in allSentences)", "NMSE pagie", "NMSE keijzer4", "infix", 96 string.Join(";", Enumerable.Range(0, xs.Length).Select(i => "eval" + i))); 97 int nSentences = 0; 98 foreach (var kvp in hashToInfix) { 99 var hash = kvp.Key; 100 var infixExpr = kvp.Value; 101 evalBuf = EvaluateInfix(infixExpr, ds).ToArray(); 102 if (evalBuf.Any(ei => double.IsInfinity(ei) || double.IsNaN(ei))) { 103 //Console.WriteLine("skipping {0} {1}", evalBuf.Average(), infixExpr); 104 //Console.Write("."); 105 } else { 106 try { 107 Scale(evalBuf); 108 // functions.Add((double[])evalBuf.Clone()); 109 // sentences.Add(sentence); 110 OnlineCalculatorError error; 111 var nmse_pagie = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(evalBuf, ys_pagie, out error); 112 if (error != OnlineCalculatorError.None) nmse_pagie = 10; 113 var nmse_keijzer = OnlineNormalizedMeanSquaredErrorCalculator.Calculate(evalBuf, ys_keijzer4, out error); 114 if (error != OnlineCalculatorError.None) nmse_keijzer = 10; 115 writer.WriteLine("{0};{1};{2};{3};{4};{5}", hash, hashToRowIdx[hash], nmse_pagie, nmse_keijzer, infixExpr, 116 string.Join(";", evalBuf.Select(fi => fi.ToString()))); 117 } catch (ArgumentException e) { 118 // scaling failed 119 } 120 } 121 122 if (nSentences++ % PERF_STATS_UPDATE_INTERVAL == PERF_STATS_UPDATE_INTERVAL-1) { 123 Console.WriteLine("Eval perf: {0} sentences in {1}ms expected time remaining: {2}min", 124 PERF_STATS_UPDATE_INTERVAL, sw.ElapsedMilliseconds, 125 (hashToRowIdx.Count - nSentences) / (double)PERF_STATS_UPDATE_INTERVAL * sw.ElapsedMilliseconds / 1000 / 60); 126 sw.Restart(); 103 127 } 104 128 } 105 129 } 106 107 108 List<int> clusters;109 List<double> distances;110 // DEACTIVATED FOR NOW -> USE LARGEVIS in R instead111 // Flann.FindClusters(functions, out clusters, out distances, 100);112 clusters = functions.Select(_ => 0).ToList();113 distances = functions.Select(_ => 0.0).ToList();114 //115 // output all clusters and functions116 using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) {117 for (int i = 0; i < functions.Count; i++) {118 writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], string.Join(";", qualities[i]), sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString())));119 }120 }121 //122 // var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2);123 // var dtView = new DataTableView();124 // dtView.Size = new Size(800, 600);125 //126 // foreach (var funCluster in funClusters) {127 // // draw the functions for each cluster into a separate png128 // // var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average());129 // var dtName = "Cluster";130 // var dt = new DataTable(dtName, dtName);131 // var rows = new List<DataRow>();132 // int i = 0;133 // foreach (var fun in funCluster.Select(t => t.Item1)) {134 // var name = i.ToString();135 // var dr = new DataRow(name, name, fun);136 // rows.Add(dr);137 // i++;138 // }139 // dt.Rows.AddRange(rows);140 // dtView.Content = dt;141 // using (var bm = new Bitmap(800, 600)) {142 // dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600));143 // bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key)));144 // }145 // }146 130 } 147 148 149 150 private static string FindShortest(List<string> ls) {151 var minElem = ls.First();152 for (int i = 1; i < ls.Count; i++) {153 if (ls[i].Length < minElem.Length) minElem = ls[i];154 }155 return minElem;156 }157 158 131 159 132 160 133 #region evaluation 161 134 162 // scaling to zero-mean unit variance 135 // scaling to zero-mean unit variance (ignore NaN and +/-Inf. 163 136 private static void Scale(double[] evalBuf) { 164 137 double mean; 165 138 double variance; 166 var max = evalBuf. Max();139 var max = evalBuf.Select(xi=>Math.Abs(xi)).Max(); 167 140 for (int i = 0; i < evalBuf.Length; i++) { 168 141 evalBuf[i] /= max; … … 170 143 171 144 OnlineCalculatorError error, varError; 172 OnlineMeanAndVarianceCalculator.Calculate(evalBuf , out mean, out variance, out error, out varError);145 OnlineMeanAndVarianceCalculator.Calculate(evalBuf.Where(xi => !double.IsNaN(xi) && !double.IsInfinity(xi)), out mean, out variance, out error, out varError); 173 146 if(error!=OnlineCalculatorError.None || varError != OnlineCalculatorError.None) { 174 147 throw new ArgumentException("Cannot scale vector"); … … 176 149 177 150 for (int i = 0; i < evalBuf.Length; i++) { 151 if (double.IsNaN(evalBuf[i])) evalBuf[i] = mean; 152 else if (double.IsPositiveInfinity(evalBuf[i])) evalBuf[i] = 10; 153 else if (double.IsNegativeInfinity(evalBuf[i])) evalBuf[i] = -10.0; 178 154 evalBuf[i] = 1.0 / variance * evalBuf[i] + mean; 179 155 } … … 202 178 var interpreter = new SymbolicDataAnalysisExpressionTreeLinearInterpreter(); 203 179 return interpreter.GetSymbolicExpressionTreeValues(tree, ds, Enumerable.Range(0, ds.Rows)); 204 } 205 206 /* 207 // evaluates postfix expressions (only for a very specific format) 208 private static void EvaluatePostfix(string postfixExpr, double[] xs, double[] evalBuf) { 209 int topOfStack = -1; 210 Evaluate(postfixExpr, 0, xs, ref topOfStack); 211 Array.Copy(stack[topOfStack], evalBuf, evalBuf.Length); 212 } 213 214 215 private static void Evaluate(string postfixExpr, int exprPos, double[] xs, ref int topOfStack) { 216 while (exprPos < postfixExpr.Length) { 217 switch (postfixExpr[exprPos]) { 218 case '+': { 219 exprPos += 2; 220 var a = stack[topOfStack]; 221 var b = stack[topOfStack - 1]; 222 for (int i = 0; i < N; i++) { 223 b[i] += a[i]; 224 } 225 topOfStack--; 226 break; 227 } 228 case '*': { 229 exprPos += 2; 230 var a = stack[topOfStack]; 231 var b = stack[topOfStack - 1]; 232 for (int i = 0; i < N; i++) { 233 b[i] *= a[i]; 234 } 235 topOfStack--; 236 break; 237 } 238 case 'X': { 239 exprPos += 2; 240 topOfStack++; 241 Array.Copy(xs, stack[topOfStack], N); 242 break; 243 } 244 case 'c': { 245 if (postfixExpr[exprPos + 1] == 'o') { 246 // cos 247 exprPos += 4; 248 var a = stack[topOfStack]; 249 for (int i = 0; i < N; i++) { 250 a[i] = Math.Cos(a[i]); 251 } 252 break; 253 } else { 254 exprPos += 2; 255 // put 1 onto top of stack // BUG! 256 topOfStack++; 257 var a = stack[topOfStack]; 258 for (int i = 0; i < N; i++) a[i] = 1.0; 259 break; 260 } 261 } 262 case 's': { 263 // sin 264 exprPos += 4; 265 var a = stack[topOfStack]; 266 for (int i = 0; i < N; i++) { 267 a[i] = Math.Sin(a[i]); 268 } 269 break; 270 } 271 case 'l': { 272 // log 273 exprPos += 4; 274 var a = stack[topOfStack]; 275 for (int i = 0; i < N; i++) { 276 a[i] = Math.Log(a[i]); 277 } 278 279 break; 280 } 281 case 'e': { 282 // exp 283 exprPos += 4; 284 var a = stack[topOfStack]; 285 for (int i = 0; i < N; i++) { 286 a[i] = Math.Exp(a[i]); 287 } 288 289 break; 290 } 291 case 'i': { 292 // inv 293 exprPos += 4; 294 var a = stack[topOfStack]; 295 for (int i = 0; i < N; i++) { 296 a[i] = 1.0 / a[i]; 297 } 298 break; 299 } 300 default: { 301 throw new InvalidOperationException(string.Format("Cannot handle {0} in {1}", postfixExpr[exprPos], postfixExpr)); 302 } 303 } 304 } 305 } 306 */ 180 } 307 181 #endregion 308 182 } -
branches/2886_SymRegGrammarEnumeration/ExpressionClustering_R/ClusteringScript.R
r15903 r15924 1 1 library(largeVis) 2 2 library(ggplot2) 3 4 eval7 <- read.csv2("c:/Users/P24581/Desktop/evaluations_2018-04-11_12-50_TreeSize-7.csv",header = TRUE, sep = ";", dec=",") 5 eval7_qs <- eval7[,3] 6 eval7_x <- eval7[,5:105] 7 lv <- largeVis(eval7_x, dim=2, K = 50, n_trees = 50) # TODO scale? 8 clusters <- hdbscan(lv, minPts = 3, K=30) 9 10 m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$clusters, q=eval7_qs) 3 library(dplyr) 11 4 12 5 13 ggplot(data=m, aes(x=x, y=y)) + geom_point(aes(color=q)) 6 7 sentenceFileName <- "D:/heal/documents/trunk/Publications/2018/GPTP/data/evaluations_allSentences_2018-04-13_16-40_TreeSize-7_1d.csv.gz"; 8 evalData <- read.csv(sentenceFileName,header = TRUE, sep = ";", dec=","); 9 qualities <- evalData$R2.keijzer4; 10 outputs <- evalData[,6:105]; 11 12 lv <- largeVis(outputs, dim=2, K = 50, n_trees = 50) # TODO scale? 13 clusters <- hdbscan(lv, minPts = 3, K=50); 14 15 16 17 m <- data.frame(x=t(lv$coords)[,1], y=t(lv$coords)[,2], c=clusters$clusters, q=qualities, outputs) 18 # plot mapped points 19 ggplot(data=m, aes(x=x, y=y)) + geom_point(aes(color=c)) + theme(legend.position = "none") ; 20 21 cluster_n <- dplyr::filter(m, c==5); 22 cluster_evals <- data.frame(x=seq(1,100,1), t(cluster_n[,5:104])) 23 evals_cluster_n <- tidyr::gather(cluster_evals,"f", "fx", 2:ncol(cluster_evals)) 24 25 p <- ggplot(evals_cluster_n, aes(x=x, y=fx,color=f)) + geom_line(); 26 p 27
Note: See TracChangeset
for help on using the changeset viewer.