source: branches/HeuristicLab.EvolutionTracking/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Tracking/Analyzers/SymbolicDataAnalysisSchemaFrequencyAnalyzer.cs @ 15351

Last change on this file since 15351 was 15351, checked in by bburlacu, 5 years ago

#1772: Merge trunk changes, add a couple of convenience parameters to the SchemaCreator.

File size: 16.0 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using System.Threading.Tasks;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Data;
29using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
30using HeuristicLab.EvolutionTracking;
31using HeuristicLab.Optimization;
32using HeuristicLab.Parameters;
33using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
34
35namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
36  [Item("SymbolicDataAnalysisSchemaFrequencyAnalyzer", "An analyzer which counts schema frequencies in the population.")]
37  [StorableClass]
38  public class SymbolicDataAnalysisSchemaFrequencyAnalyzer : EvolutionTrackingAnalyzer<ISymbolicExpressionTree> {
39    private const string MinimumSchemaLengthParameterName = "MinimumSchemaLength";
40    private const string StrictSchemaMatchingParameterName = "StrictSchemaMatching";
41    private const string ProblemDataParameterName = "ProblemData";
42    private const string InterpreterParameterName = "SymbolicExpressionTreeInterpreter";
43    private const string ExecuteInParallelParameterName = "ExecuteInParallel";
44    private const string MaximumDegreeOfParallelismParameterName = "MaximumDegreeOfParallelism";
45
46    private static readonly Dictionary<string, string> ShortNames = new Dictionary<string, string> {
47    { "Addition", "+" },
48    { "Subtraction", "-" },
49    { "Multiplication", "*" },
50    { "Division", "/" },
51    { "Exponential", "exp" },
52    { "Logarithm", "log" }
53  };
54
55    [Storable]
56    private readonly SymbolicExpressionTreePhenotypicSimilarityCalculator phenotypicSimilarityCalculator;
57
58    [Storable]
59    private readonly SymbolicExpressionTreeBottomUpSimilarityCalculator genotypicSimilarityCalculator;
60
61    [Storable]
62    private readonly ISymbolicExpressionTreeNodeEqualityComparer comparer;
63    private QueryMatch qm;
64
65    public IFixedValueParameter<BoolValue> ExecuteInParallelParameter {
66      get { return (IFixedValueParameter<BoolValue>)Parameters[ExecuteInParallelParameterName]; }
67    }
68
69    public IFixedValueParameter<IntValue> MaximumDegreeOfParallelismParameter {
70      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumDegreeOfParallelismParameterName]; }
71    }
72
73    public IFixedValueParameter<IntValue> MinimumSchemaLengthParameter {
74      get { return (IFixedValueParameter<IntValue>)Parameters[MinimumSchemaLengthParameterName]; }
75    }
76
77    public IFixedValueParameter<BoolValue> StrictSchemaMatchingParameter {
78      get { return (IFixedValueParameter<BoolValue>)Parameters[StrictSchemaMatchingParameterName]; }
79    }
80
81    public ILookupParameter<IRegressionProblemData> ProblemDataParameter {
82      get { return (ILookupParameter<IRegressionProblemData>)Parameters[ProblemDataParameterName]; }
83    }
84
85    public ILookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter> InterpreterParameter {
86      get { return (ILookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter>)Parameters[InterpreterParameterName]; }
87    }
88
89    public bool ExecuteInParallel {
90      get { return ExecuteInParallelParameter.Value.Value; }
91      set { ExecuteInParallelParameter.Value.Value = value; }
92    }
93
94    public int MaximumDegreeOfParallelism {
95      get { return MaximumDegreeOfParallelismParameter.Value.Value; }
96      set { MaximumDegreeOfParallelismParameter.Value.Value = value; }
97    }
98
99    public int MinimumSchemaLength {
100      get { return MinimumSchemaLengthParameter.Value.Value; }
101      set { MinimumSchemaLengthParameter.Value.Value = value; }
102    }
103
104    public bool StrictSchemaMatching {
105      get { return StrictSchemaMatchingParameter.Value.Value; }
106      set { StrictSchemaMatchingParameter.Value.Value = value; }
107    }
108
109    public SymbolicDataAnalysisSchemaFrequencyAnalyzer() {
110      comparer = new SymbolicExpressionTreeNodeEqualityComparer {
111        MatchConstantValues = false,
112        MatchVariableNames = true,
113        MatchVariableWeights = false
114      };
115      qm = new QueryMatch(comparer) { MatchParents = true };
116      phenotypicSimilarityCalculator = new SymbolicExpressionTreePhenotypicSimilarityCalculator();
117      genotypicSimilarityCalculator = new SymbolicExpressionTreeBottomUpSimilarityCalculator { SolutionVariableName = "SymbolicExpressionTree" };
118      Parameters.Add(new FixedValueParameter<IntValue>(MinimumSchemaLengthParameterName, new IntValue(10)));
119      Parameters.Add(new FixedValueParameter<IntValue>(MaximumDegreeOfParallelismParameterName, new IntValue(4)));
120      Parameters.Add(new FixedValueParameter<BoolValue>(StrictSchemaMatchingParameterName, new BoolValue(true)));
121      Parameters.Add(new FixedValueParameter<BoolValue>(ExecuteInParallelParameterName, new BoolValue(true)));
122      Parameters.Add(new LookupParameter<IRegressionProblemData>(ProblemDataParameterName));
123      Parameters.Add(new LookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter>(InterpreterParameterName));
124    }
125
126    protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(SymbolicDataAnalysisSchemaFrequencyAnalyzer original,
127      Cloner cloner) : base(original, cloner) {
128      comparer = cloner.Clone(original.comparer);
129      phenotypicSimilarityCalculator = cloner.Clone(original.phenotypicSimilarityCalculator);
130      genotypicSimilarityCalculator = cloner.Clone(original.genotypicSimilarityCalculator);
131      MinimumSchemaLength = original.MinimumSchemaLength;
132      StrictSchemaMatching = original.StrictSchemaMatching;
133      qm = new QueryMatch(comparer) { MatchParents = true };
134    }
135
136    public override IDeepCloneable Clone(Cloner cloner) {
137      return new SymbolicDataAnalysisSchemaFrequencyAnalyzer(this, cloner);
138    }
139
140    [StorableConstructor]
141    protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(bool deserializing) : base(deserializing) { }
142
143    [StorableHook(HookType.AfterDeserialization)]
144    private void AfterDeserialization() {
145      qm = new QueryMatch(comparer) { MatchParents = true };
146    }
147
148    public override IOperation Apply() {
149      if (PopulationGraph == null || Generation.Value == 0 ||
150          (Generation.Value > 1 && Generation.Value % UpdateInterval.Value != 0))
151        return base.Apply();
152
153      comparer.MatchVariableWeights = comparer.MatchConstantValues = StrictSchemaMatching;
154      phenotypicSimilarityCalculator.Interpreter = InterpreterParameter.ActualValue;
155      phenotypicSimilarityCalculator.ProblemData = ProblemDataParameter.ActualValue;
156      var generation = Generation.Value;
157
158      // use all offspring produced by crossover (including those in the intermediate rank)
159      var vertices = PopulationGraph.Vertices.Where(x => x.InDegree == 2 && x.Rank > generation - 1).OrderByDescending(x => x.Quality).ToList();
160
161      ResultCollection resultCollection;
162      if (Results.ContainsKey("Schema Frequencies")) {
163        resultCollection = (ResultCollection)Results["Schema Frequencies"].Value;
164      } else {
165        resultCollection = new ResultCollection();
166        var result = new Result("Schema Frequencies", resultCollection);
167        Results.Add(result);
168      }
169
170      var mostFrequentPerGeneration = new List<Tuple<string, double[]>>();
171
172      // match the obtained schemas against the whole population
173      var population = PopulationGraph.Vertices.Where(x => x.Rank.IsAlmost(generation)).ToList();
174      var trees = population.Select(x => x.Data).ToList();
175      var qualities = population.Select(x => x.Quality).ToList();
176      // cache similarity measures to speed up calculations
177      var genSimMatrix = new double[trees.Count, trees.Count];
178      var phenSimMatrix = new double[trees.Count, trees.Count];
179
180      for (int i = 0; i < trees.Count - 1; ++i) {
181        for (int j = i + 1; j < trees.Count; ++j) {
182          genSimMatrix[i, j] = double.NaN;
183          phenSimMatrix[i, j] = double.NaN;
184        }
185      }
186
187      var schemas = SchemaCreator.GenerateCombinedSchemas(vertices, MinimumSchemaLength, 0, StrictSchemaMatching).ToList();
188      var schemaStrings = schemas.Select(x => x.Root.GetSubtree(0).GetSubtree(0).FormatToString(StrictSchemaMatching)).ToList();
189      int[][] matchingIndices;
190      if (ExecuteInParallel) {
191        matchingIndices = new int[schemas.Count][];
192        Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => {
193          var schema = schemas[i];
194          matchingIndices[i] = Enumerable.Range(0, trees.Count).Where(v => qm.Match(trees[v], schema)).ToArray();
195        });
196      } else {
197        matchingIndices = schemas.Select(x => Enumerable.Range(0, trees.Count).Where(v => qm.Match(trees[v], x)).ToArray()).ToArray();
198      }
199
200      var schemaStatistics = new List<Tuple<string, double[]>>();
201      var avgPopQuality = qualities.Average();
202
203      if (ExecuteInParallel) {
204        var locker = new object();
205        Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => {
206          var indices = matchingIndices[i];
207          if (indices.Length > 1) {
208            var avgSchemaQuality = indices.Average(x => qualities[x]);
209            var avgLength = indices.Average(x => trees[x].Length);
210            var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity);
211            var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity);
212            var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality };
213            var t = new Tuple<string, double[]>(schemaStrings[i], array);
214            lock (locker) {
215              schemaStatistics.Add(t);
216            }
217          }
218        });
219      } else {
220        for (int i = 0; i < schemas.Count; ++i) {
221          var indices = matchingIndices[i];
222          if (indices.Length > 1) {
223            var avgSchemaQuality = indices.Average(x => qualities[x]);
224            var avgLength = indices.Average(x => trees[x].Length);
225            var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity);
226            var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity);
227            var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality };
228            var t = new Tuple<string, double[]>(schemaStrings[i], array);
229            schemaStatistics.Add(t);
230          }
231        }
232      }
233
234      if (!schemaStatistics.Any()) return base.Apply(); // shouldn't ever happen
235      var columnNames = new[] { "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" };
236      var mostFrequent = new DoubleMatrix(schemaStatistics.Count, schemaStatistics[0].Item2.Length) {
237        SortableView = true
238      };
239      schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); });
240      mostFrequentPerGeneration.Add(Tuple.Create(schemaStatistics[0].Item1, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray()));
241      mostFrequent.RowNames = schemaStatistics.Select(x => x.Item1);
242      mostFrequent.ColumnNames = columnNames;
243      for (int i = 0; i < schemaStatistics.Count; ++i) {
244        var values = schemaStatistics[i].Item2;
245        for (int j = 0; j < values.Length; ++j) {
246          mostFrequent[i, j] = values[j];
247        }
248      }
249      resultCollection.Add(new Result("Generation " + generation + " Most Frequent Schemas", mostFrequent));
250
251      columnNames = new[] { "Generation", "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" };
252      // sort according to quality, then count
253      schemaStatistics.Sort((a, b) => { if (a.Item2[1].Equals(b.Item2[1])) return b.Item2[0].CompareTo(a.Item2[0]); return b.Item2[1].CompareTo(a.Item2[1]); });
254      DoubleMatrix bestSchemasMatrix;
255      if (!resultCollection.ContainsKey("Best Schemas")) {
256        bestSchemasMatrix = new DoubleMatrix(1, 7);
257        bestSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 };
258        bestSchemasMatrix.ColumnNames = columnNames;
259        var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray();
260        for (int i = 0; i < values.Length; ++i)
261          bestSchemasMatrix[0, i] = values[i];
262        resultCollection.Add(new Result("Best Schemas", bestSchemasMatrix));
263      } else {
264        bestSchemasMatrix = (DoubleMatrix)resultCollection["Best Schemas"].Value;
265        resultCollection["Best Schemas"].Value = AddRow(bestSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1);
266      }
267
268      // sort according to count, then quality
269      schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); });
270      DoubleMatrix frequentSchemasMatrix;
271      if (!resultCollection.ContainsKey("Most Frequent Schemas")) {
272        frequentSchemasMatrix = new DoubleMatrix(1, 7);
273        frequentSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 };
274        frequentSchemasMatrix.ColumnNames = columnNames;
275        var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray();
276        for (int i = 0; i < values.Length; ++i)
277          frequentSchemasMatrix[0, i] = values[i];
278        resultCollection.Add(new Result("Most Frequent Schemas", frequentSchemasMatrix));
279      } else {
280        frequentSchemasMatrix = (DoubleMatrix)resultCollection["Most Frequent Schemas"].Value;
281        resultCollection["Most Frequent Schemas"].Value = AddRow(frequentSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1);
282      }
283      return base.Apply();
284    }
285
286    private static DoubleMatrix AddRow(DoubleMatrix m, double[] row, string rowName) {
287      if (row.Length != m.Columns)
288        throw new Exception("Row value count must match matrix column count.");
289      var x = new DoubleMatrix(m.Rows + 1, m.Columns);
290      x.RowNames = m.RowNames.Concat(new[] { rowName });
291      x.ColumnNames = m.ColumnNames;
292      for (int i = 0; i < m.Rows; ++i)
293        for (int j = 0; j < m.Columns; ++j)
294          x[i, j] = m[i, j];
295      for (int j = 0; j < m.Columns; ++j)
296        x[m.Rows, j] = row[j];
297      return x;
298    }
299
300    private static double AverageSimilarity(int[] indices, List<ISymbolicExpressionTree> trees, double[,] similarityMatrix, Func<ISymbolicExpressionTree, ISymbolicExpressionTree, double> similarityFunction) {
301      var agg = 0d;
302      int len = indices.Length;
303      var count = len * (len - 1) / 2d;
304      for (int i = 0; i < indices.Length - 1; ++i) {
305        var a = indices[i];
306        for (int j = i + 1; j < indices.Length; ++j) {
307          var b = indices[j];
308          if (double.IsNaN(similarityMatrix[a, b]))
309            similarityMatrix[a, b] = similarityFunction(trees[a], trees[b]);
310          agg += similarityMatrix[a, b];
311        }
312      }
313      return agg / count;
314    }
315  }
316}
Note: See TracBrowser for help on using the repository browser.