Index: /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/ExpressionClustering.csproj
===================================================================
--- /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/ExpressionClustering.csproj (revision 15841)
+++ /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/ExpressionClustering.csproj (revision 15842)
@@ -38,12 +38,42 @@
..\..\..\trunk\bin\ALGLIB-3.7.0.dll
+
+ ..\..\..\trunk\bin\HeuristicLab.Analysis-3.3.dll
+
+
+ ..\..\..\trunk\bin\HeuristicLab.Analysis.Views-3.3.dll
+
+
+ False
+ ..\..\..\trunk\bin\HeuristicLab.Collections-3.3.dll
+
..\..\..\trunk\bin\HeuristicLab.Common-3.3.dll
+
+
+
+ False
+ ..\..\..\trunk\bin\HeuristicLab.Core.Views-3.3.dll
+
+
+ False
+ ..\..\..\trunk\bin\HeuristicLab.MainForm-3.3.dll
+
+
+ False
+ ..\..\..\trunk\bin\HeuristicLab.MainForm.WindowsForms-3.3.dll
..\..\..\trunk\bin\HeuristicLab.Problems.DataAnalysis-3.4.dll
+
+ False
+ ..\..\..\trunk\bin\HeuristicLab.Visualization.ChartControlsExtensions-3.3.dll
+
+
+
+
Index: /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Flann.cs
===================================================================
--- /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Flann.cs (revision 15841)
+++ /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Flann.cs (revision 15842)
@@ -86,5 +86,5 @@
public int checks; /* how many leafs (features) to check in one search */
public float cb_index; /* cluster boundary index. Used when searching the kmeans tree */
- public float eps;
+ public float eps;
/* kdtree index parameters */
@@ -153,4 +153,36 @@
[DllImport("flann-1.7.1.dll")]
public static extern int flann_compute_cluster_centers(float[] dataset, int rows, int cols, int clusters, float[] result, ref FLANNParameters flann_params);
+
+ public static int FindClusters(List dataset, out List results, out List distances, int nClusters) { // computes up to nClusters centers for dataset via FLANN, then looks up one nearest center per data point; outputs the per-point indices and distances
+ var _nRows = dataset.Count; // number of data points
+ var _dists = new float[_nRows]; // one distance slot per data point, passed to the nearest-neighbour call
+ var _result = new int[_nRows]; // one index slot per data point, passed to the nearest-neighbour call
+ var _dim = dataset.First().Length; // length of the first element is used for every row; NOTE(review): First() throws on an empty dataset, and equal row lengths are assumed - confirm with callers
+ FLANNParameters p = DEFAULT_FLANN_PARAMETERS; // start from the defaults, then override the fields below
+ p.algorithm = flann_algorithm_t.FLANN_INDEX_LINEAR;
+ p.centers_init = flann_centers_init_t.FLANN_CENTERS_RANDOM;
+ p.target_precision = 0.9f;
+ p.log_level = flann_log_level_t.FLANN_LOG_INFO;
+ // copy training set into a flat float array, one data point after another (row-major)
+ var _ds = new float[dataset.Count * _dim];
+ var i = 0; // running write position in _ds
+ foreach (var e in dataset) {
+ for (int d = 0; d < _dim; d++) {
+ _ds[i++] = (float)e[d]; // values are cast to float for the native call
+ }
+ }
+
+ flann_set_distance_type(flann_distance_t.FLANN_DIST_EUCLIDEAN, 0); // select Euclidean distance
+
+ float[] centers = new float[nClusters * _dim]; // room for nClusters centers of _dim values each
+ int actualClusters = flann_compute_cluster_centers(_ds, rows: dataset.Count, cols: _dim, clusters: nClusters, result: centers, flann_params: ref p); // NOTE(review): presumably the number of centers actually computed (possibly fewer than nClusters, or negative on error) - the value is not validated here; confirm against the FLANN manual
+
+ var res = flann_find_nearest_neighbors(centers, actualClusters, _dim, _ds, _nRows, _result, _dists, 1, ref p); // NOTE(review): assumes argument order (dataset, rows, cols, testset, trows, indices, dists, nn, params), i.e. the centers are searched and each data point is a query with 1 neighbour - confirm against the DllImport declaration
+
+
+ distances = _dists.Select(fi => (double)fi).ToList(); // convert the float distances to double for the caller
+ results = _result.ToList(); // per-point index buffer copied into a list
+ return res; // return value of the nearest-neighbour call is passed through unchanged
+ }
public static int FindNearestNeighbours(List dataset, List queryset, out List results, out List distances, int nearestNeighbours = 3) {
@@ -176,18 +208,21 @@
flann_set_distance_type(flann_distance_t.FLANN_DIST_EUCLIDEAN, 0);
- int nClusters = 100;
- float[] centers = new float[nClusters * _dim];
- flann_compute_cluster_centers(_ds, rows: dataset.Count, cols: _dim, clusters: nClusters, result: centers, flann_params: ref p);
-
- float speedup = -1.0f;
+ // int nClusters = 100;
+ // float[] centers = new float[nClusters * _dim];
+ // flann_compute_cluster_centers(_ds, rows: dataset.Count, cols: _dim, clusters: nClusters, result: centers, flann_params: ref p);
+
+
+ // for each point in the training set find the nearest cluster
+
+ // float speedup = -1.0f;
// _ds must be a rows × cols matrix stored in row-major order (one feature on each row)
//var index = flann_build_index(_ds, rows: dataset.Count, cols: _dim, speedup: ref speedup, flann_params: ref p);
-
+
// copy testset
var _testset = new float[_tRows * _dim];
i = 0;
- for(int d = 0; d < _dim; d++) {
- foreach(var e in queryset) {
+ for (int d = 0; d < _dim; d++) {
+ foreach (var e in queryset) {
_testset[i++] = (float)e[d];
}
Index: /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Program.cs
===================================================================
--- /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Program.cs (revision 15841)
+++ /branches/2886_SymRegGrammarEnumeration/ExpressionClustering/Program.cs (revision 15842)
@@ -1,14 +1,18 @@
using System;
using System.Collections.Generic;
+using System.Drawing;
using System.IO;
using System.Linq;
+using HeuristicLab.Analysis;
+using HeuristicLab.Analysis.Views;
+using System.Windows.Forms;
namespace ExpressionClustering {
class Program {
private static readonly string folder = Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory);
+ private static readonly string clusterFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "clusters");
private static readonly string distinctSentencesFileName = Path.Combine(folder, @"distinctSentences.csv");
private static readonly string allSentencesFileName = Path.Combine(folder, "allSentences.csv");
private static readonly string outputFileName = Path.Combine(folder, "evaluations.csv.gz");
- private static readonly string columnDelimiter = ";";
private static int N = 100;
private static double[] evalBuf = new double[N];
@@ -31,7 +35,7 @@
static void Main(string[] args) {
- TestFLANN();
-
var hash2Sentences = new Dictionary>();
+ // for debugging only
+ var postfix2infix = new Dictionary();
@@ -52,4 +56,5 @@
}
ls.Add(toks[1]);
+ postfix2infix.Add(toks[1], toks[0]);
nSentences++;
}
@@ -59,28 +64,64 @@
}
- using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress)))
- foreach (var kvp in hash2Sentences) {
- var ls = kvp.Value;
- var sentence = FindShortest(ls);
- Evaluate(sentence, xs, evalBuf);
- Add(writer, sentence, evalBuf);
- }
- }
-
- private static void TestFLANN() {
- var rand = new Random(1234);
- var dim = 100;
- var N = 10000;
- var dataset = new List();
- for(int i=0;i nnResults;
- List nnDists;
- Flann.FindNearestNeighbours(dataset, dataset, out nnResults, out nnDists);
-
- }
+ List functions = new List();
+ List sentences = new List();
+ List qualities = new List();
+
+ foreach (var kvp in hash2Sentences) {
+ var ls = kvp.Value;
+ var sentence = FindShortest(ls);
+ Evaluate(sentence, xs, evalBuf);
+ if (evalBuf.Any(ei => float.IsInfinity((float)ei) || float.IsNaN((float)ei))) {
+ Console.WriteLine("skipping {0} {1}", evalBuf.Average(), sentence);
+ } else {
+ try {
+ Scale(evalBuf, ys);
+ functions.Add((double[])evalBuf.Clone());
+ sentences.Add(sentence);
+ HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError error;
+ qualities.Add(HeuristicLab.Problems.DataAnalysis.OnlinePearsonsRSquaredCalculator.Calculate(evalBuf, ys, out error));
+ } catch (ArgumentException e) {
+ // scaling failed
+ }
+ }
+ }
+
+ List clusters;
+ List distances;
+ Flann.FindClusters(functions, out clusters, out distances, 100);
+
+ // output all clusters and functions
+ using (var writer = new StreamWriter(new System.IO.Compression.GZipStream(new FileStream(outputFileName, FileMode.OpenOrCreate), System.IO.Compression.CompressionMode.Compress))) {
+ for (int i = 0; i < functions.Count; i++) {
+ writer.WriteLine("{0};{1};{2};{3};{4};{5}", clusters[i], distances[i], qualities[i], sentences[i], postfix2infix[sentences[i]], string.Join(";", functions[i].Select(fi => fi.ToString())));
+ }
+ }
+
+ var funClusters = functions.Zip(clusters, (f, c) => Tuple.Create(f, c)).GroupBy(t => t.Item2);
+ var dtView = new DataTableView();
+ dtView.Size = new Size(800, 600);
+
+ foreach (var funCluster in funClusters) {
+ // draw the functions for each cluster into a separate png
+ var dtName = string.Format("R² {0}", Enumerable.Range(0, qualities.Count).Where(idx => clusters[idx] == funCluster.Key).Select(idx => qualities[idx]).Average());
+ var dt = new DataTable(dtName, dtName);
+ var rows = new List();
+ int i = 0;
+ foreach (var fun in funCluster.Select(t => t.Item1)) {
+ var name = i.ToString();
+ var dr = new DataRow(name, name, fun);
+ rows.Add(dr);
+ i++;
+ }
+ dt.Rows.AddRange(rows);
+ dtView.Content = dt;
+ using (var bm = new Bitmap(800, 600)) {
+ dtView.DrawToBitmap(bm, new Rectangle(0, 0, 800, 600));
+ bm.Save(Path.Combine(clusterFolder, string.Format("cluster_{0,3}.png", funCluster.Key)));
+ }
+ }
+ }
+
+
private static string FindShortest(List ls) {
@@ -91,4 +132,5 @@
return minElem;
}
+
@@ -102,7 +144,5 @@
HeuristicLab.Problems.DataAnalysis.OnlineLinearScalingParameterCalculator.Calculate(evalBuf, ys, out alpha, out beta, out error);
if (error != HeuristicLab.Problems.DataAnalysis.OnlineCalculatorError.None) {
- alpha = 0.0;
- beta = 1.0;
- Console.WriteLine("WARNING: error in scaling");
+ throw new ArgumentException();
}
@@ -110,35 +150,4 @@
evalBuf[i] = beta * evalBuf[i] + alpha;
}
-
-
- //
- // var meanE = 0.0;
- // var meanE2 = 0.0;
- // var meanY = 0.0;
- // var meanY2 = 0.0;
- // for(int i=0;i ei.ToString(System.Globalization.CultureInfo.InvariantCulture))));
- }
- }
}
}
Index: /branches/2886_SymRegGrammarEnumeration/HeuristicLab.Algorithms.DataAnalysis.SymRegGrammarEnumeration/GrammarEnumeration/GrammarEnumerationAlgorithm.cs
===================================================================
--- /branches/2886_SymRegGrammarEnumeration/HeuristicLab.Algorithms.DataAnalysis.SymRegGrammarEnumeration/GrammarEnumeration/GrammarEnumerationAlgorithm.cs (revision 15841)
+++ /branches/2886_SymRegGrammarEnumeration/HeuristicLab.Algorithms.DataAnalysis.SymRegGrammarEnumeration/GrammarEnumeration/GrammarEnumerationAlgorithm.cs (revision 15842)
@@ -115,4 +115,5 @@
Analyzers.CheckedItemsChanged += AnalyzersOnCheckedItemsChanged;
Analyzers.SetItemCheckedState(Analyzers.First(analyzer => analyzer is RSquaredEvaluator), true);
+ Analyzers.SetItemCheckedState(Analyzers.First(analyzer => analyzer is SentenceLogger), true);
}