source: branches/HeuristicLab.EvolutionTracking/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Tracking/Analyzers/SymbolicDataAnalysisSchemaFrequencyAnalyzer.cs @ 15906

Last change on this file since 15906 was 15906, checked in by bburlacu, 4 years ago

#1772: Refactoring and speed optimization of diversification operators

File size: 17.6 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Linq;
25using System.Threading.Tasks;
26using HeuristicLab.Common;
27using HeuristicLab.Core;
28using HeuristicLab.Data;
29using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
30using HeuristicLab.EvolutionTracking;
31using HeuristicLab.Optimization;
32using HeuristicLab.Parameters;
33using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
34
35namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
36  [Item("SymbolicDataAnalysisSchemaFrequencyAnalyzer", "An analyzer which counts schema frequencies in the population.")]
37  [StorableClass]
38  public class SymbolicDataAnalysisSchemaFrequencyAnalyzer : EvolutionTrackingAnalyzer<ISymbolicExpressionTree> {
39    private const string MinimumSchemaLengthParameterName = "MinimumSchemaLength";
40    private const string StrictSchemaMatchingParameterName = "StrictSchemaMatching";
41    private const string ProblemDataParameterName = "ProblemData";
42    private const string InterpreterParameterName = "SymbolicExpressionTreeInterpreter";
43    private const string ExecuteInParallelParameterName = "ExecuteInParallel";
44    private const string MaximumDegreeOfParallelismParameterName = "MaximumDegreeOfParallelism";
45    private const string SchemaDefinitionParameterName = "SchemaDefinition";
46
47    private static readonly Dictionary<string, string> ShortNames = new Dictionary<string, string> {
48      { "Addition", "+" },
49      { "Subtraction", "-" },
50      { "Multiplication", "*" },
51      { "Division", "/" },
52      { "Exponential", "exp" },
53      { "Logarithm", "log" }
54    };
55
56    [Storable]
57    private readonly SymbolicExpressionTreePhenotypicSimilarityCalculator phenotypicSimilarityCalculator;
58
59    [Storable]
60    private readonly SymbolicExpressionTreeBottomUpSimilarityCalculator genotypicSimilarityCalculator;
61
62    [Storable]
63    private readonly ISymbolicExpressionTreeNodeEqualityComparer comparer;
64    private QueryMatch qm;
65
66
67    public IConstrainedValueParameter<StringValue> SchemaDefinitionParameter {
68      get { return (IConstrainedValueParameter<StringValue>)Parameters[SchemaDefinitionParameterName]; }
69    }
70    public IFixedValueParameter<BoolValue> ExecuteInParallelParameter {
71      get { return (IFixedValueParameter<BoolValue>)Parameters[ExecuteInParallelParameterName]; }
72    }
73
74    public IFixedValueParameter<IntValue> MaximumDegreeOfParallelismParameter {
75      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumDegreeOfParallelismParameterName]; }
76    }
77
78    public IFixedValueParameter<IntValue> MinimumSchemaLengthParameter {
79      get { return (IFixedValueParameter<IntValue>)Parameters[MinimumSchemaLengthParameterName]; }
80    }
81
82    public IFixedValueParameter<BoolValue> StrictSchemaMatchingParameter {
83      get { return (IFixedValueParameter<BoolValue>)Parameters[StrictSchemaMatchingParameterName]; }
84    }
85
86    public ILookupParameter<IRegressionProblemData> ProblemDataParameter {
87      get { return (ILookupParameter<IRegressionProblemData>)Parameters[ProblemDataParameterName]; }
88    }
89
90    public ILookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter> InterpreterParameter {
91      get { return (ILookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter>)Parameters[InterpreterParameterName]; }
92    }
93
94    public bool ExecuteInParallel {
95      get { return ExecuteInParallelParameter.Value.Value; }
96      set { ExecuteInParallelParameter.Value.Value = value; }
97    }
98
99    public int MaximumDegreeOfParallelism {
100      get { return MaximumDegreeOfParallelismParameter.Value.Value; }
101      set { MaximumDegreeOfParallelismParameter.Value.Value = value; }
102    }
103
104    public int MinimumSchemaLength {
105      get { return MinimumSchemaLengthParameter.Value.Value; }
106      set { MinimumSchemaLengthParameter.Value.Value = value; }
107    }
108
109    public bool StrictSchemaMatching {
110      get { return StrictSchemaMatchingParameter.Value.Value; }
111      set { StrictSchemaMatchingParameter.Value.Value = value; }
112    }
113
114    public SymbolicDataAnalysisSchemaFrequencyAnalyzer() {
115      comparer = new SymbolicExpressionTreeNodeEqualityComparer {
116        MatchConstantValues = false,
117        MatchVariableNames = true,
118        MatchVariableWeights = false
119      };
120      qm = new QueryMatch(comparer) { MatchParents = true };
121      phenotypicSimilarityCalculator = new SymbolicExpressionTreePhenotypicSimilarityCalculator();
122      genotypicSimilarityCalculator = new SymbolicExpressionTreeBottomUpSimilarityCalculator { SolutionVariableName = "SymbolicExpressionTree" };
123      Parameters.Add(new FixedValueParameter<IntValue>(MinimumSchemaLengthParameterName, new IntValue(10)));
124      Parameters.Add(new FixedValueParameter<IntValue>(MaximumDegreeOfParallelismParameterName, new IntValue(4)));
125      Parameters.Add(new FixedValueParameter<BoolValue>(StrictSchemaMatchingParameterName, new BoolValue(true)));
126      Parameters.Add(new FixedValueParameter<BoolValue>(ExecuteInParallelParameterName, new BoolValue(true)));
127      Parameters.Add(new LookupParameter<IRegressionProblemData>(ProblemDataParameterName));
128      Parameters.Add(new LookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter>(InterpreterParameterName));
129
130      var schemaDefinitions = new ItemSet<StringValue>(new[] {
131         new StringValue("="),
132         new StringValue("#"),
133         new StringValue("=,#")
134      });
135      var schemaDefinitionParameter = new ConstrainedValueParameter<StringValue>(SchemaDefinitionParameterName, schemaDefinitions);
136      schemaDefinitionParameter.Value = schemaDefinitions.First();
137      Parameters.Add(schemaDefinitionParameter);
138    }
139
140    protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(SymbolicDataAnalysisSchemaFrequencyAnalyzer original,
141      Cloner cloner) : base(original, cloner) {
142      comparer = cloner.Clone(original.comparer);
143      phenotypicSimilarityCalculator = cloner.Clone(original.phenotypicSimilarityCalculator);
144      genotypicSimilarityCalculator = cloner.Clone(original.genotypicSimilarityCalculator);
145      MinimumSchemaLength = original.MinimumSchemaLength;
146      StrictSchemaMatching = original.StrictSchemaMatching;
147      qm = new QueryMatch(comparer) { MatchParents = true };
148    }
149
150    public override IDeepCloneable Clone(Cloner cloner) {
151      return new SymbolicDataAnalysisSchemaFrequencyAnalyzer(this, cloner);
152    }
153
154    [StorableConstructor]
155    protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(bool deserializing) : base(deserializing) { }
156
157    [StorableHook(HookType.AfterDeserialization)]
158    private void AfterDeserialization() {
159      qm = new QueryMatch(comparer) { MatchParents = true };
160    }
161
162    public override IOperation Apply() {
163      if (PopulationGraph == null || Generation.Value == 0 ||
164          (Generation.Value > 1 && Generation.Value % UpdateInterval.Value != 0))
165        return base.Apply();
166
167      comparer.MatchVariableWeights = comparer.MatchConstantValues = StrictSchemaMatching;
168      phenotypicSimilarityCalculator.Interpreter = InterpreterParameter.ActualValue;
169      phenotypicSimilarityCalculator.ProblemData = ProblemDataParameter.ActualValue;
170      var generation = Generation.Value;
171
172      // use all offspring produced by crossover (including those in the intermediate rank)
173      var vertices = PopulationGraph.Vertices.Where(x => x.InDegree == 2 && x.Rank > generation - 1).OrderByDescending(x => x.Quality).ToList();
174
175      ResultCollection resultCollection;
176      if (Results.ContainsKey("Schema Frequencies")) {
177        resultCollection = (ResultCollection)Results["Schema Frequencies"].Value;
178      } else {
179        resultCollection = new ResultCollection();
180        var result = new Result("Schema Frequencies", resultCollection);
181        Results.Add(result);
182      }
183
184      var mostFrequentPerGeneration = new List<Tuple<string, double[]>>();
185
186      // match the obtained schemas against the whole population
187      var population = PopulationGraph.Vertices.Where(x => x.Rank.IsAlmost(generation)).ToList();
188      var trees = population.Select(x => x.Data).ToList();
189      var qualities = population.Select(x => x.Quality).ToList();
190      // cache similarity measures to speed up calculations
191      var genSimMatrix = new double[trees.Count, trees.Count];
192      var phenSimMatrix = new double[trees.Count, trees.Count];
193
194      for (int i = 0; i < trees.Count - 1; ++i) {
195        for (int j = i + 1; j < trees.Count; ++j) {
196          genSimMatrix[i, j] = double.NaN;
197          phenSimMatrix[i, j] = double.NaN;
198        }
199      }
200
201      List<ISymbolicExpressionTree> schemas;
202      var def = SchemaDefinitionParameter.Value.Value;
203      switch (def) {
204        case "=":
205          schemas = SchemaCreator.GenerateAnyNodeSchemas(vertices, MinimumSchemaLength, 0, StrictSchemaMatching).ToList();
206          break;
207        case "#":
208          schemas = SchemaCreator.GenerateAnySubtreeSchemas(vertices, MinimumSchemaLength, 0, StrictSchemaMatching).ToList();
209          break;
210        case "=,#":
211          schemas = SchemaCreator.GenerateCombinedSchemas(vertices, MinimumSchemaLength, 0, StrictSchemaMatching).ToList();
212          break;
213        default:
214          return base.Apply();
215      }
216      var schemaStrings = schemas.Select(x => x.Root.GetSubtree(0).GetSubtree(0).FormatToString(StrictSchemaMatching)).ToList();
217      int[][] matchingIndices = new int[schemas.Count][];
218
219      var tNodes = trees.Select(x => QueryMatch.InitializePostOrder(x.Root.GetSubtree(0).GetSubtree(0))).ToList();
220      if (ExecuteInParallel) {
221        Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => {
222          var schema = schemas[i];
223          var sNodes = QueryMatch.InitializePostOrder(schema.Root.GetSubtree(0).GetSubtree(0));
224          matchingIndices[i] = Enumerable.Range(0, trees.Count).Where(idx => qm.Match(tNodes[idx], sNodes)).ToArray();
225        });
226      } else {
227        for (int i = 0; i < schemas.Count; ++i) {
228          var schema = schemas[i];
229          var sNodes = QueryMatch.InitializePostOrder(schema.Root.GetSubtree(0).GetSubtree(0));
230          matchingIndices[i] = Enumerable.Range(0, trees.Count).Where(idx => qm.Match(tNodes[idx], sNodes)).ToArray();
231        }
232      }
233
234      var schemaStatistics = new List<Tuple<string, double[]>>();
235      var avgPopQuality = qualities.Average();
236
237      if (ExecuteInParallel) {
238        var locker = new object();
239        Parallel.For(0, schemas.Count, new ParallelOptions {
240          MaxDegreeOfParallelism = MaximumDegreeOfParallelism
241        }, i => {
242          var indices = matchingIndices[i];
243          if (indices.Length > 1) {
244            var avgSchemaQuality = indices.Average(x => qualities[x]);
245            var avgLength = indices.Average(x => trees[x].Length);
246            var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity);
247            var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity);
248            var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality };
249            var t = new Tuple<string, double[]>(schemaStrings[i], array);
250            lock (locker) {
251              schemaStatistics.Add(t);
252            }
253          }
254        });
255      } else {
256        for (int i = 0; i < schemas.Count; ++i) {
257          var indices = matchingIndices[i];
258          if (indices.Length > 1) {
259            var avgSchemaQuality = indices.Average(x => qualities[x]);
260            var avgLength = indices.Average(x => trees[x].Length);
261            var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity);
262            var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity);
263            var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality };
264            var t = new Tuple<string, double[]>(schemaStrings[i], array);
265            schemaStatistics.Add(t);
266          }
267        }
268      }
269
270      if (!schemaStatistics.Any()) return base.Apply(); // shouldn't ever happen
271      var columnNames = new[] { "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" };
272      var mostFrequent = new DoubleMatrix(schemaStatistics.Count, schemaStatistics[0].Item2.Length) {
273        SortableView = true
274      };
275      schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); });
276      mostFrequentPerGeneration.Add(Tuple.Create(schemaStatistics[0].Item1, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray()));
277      mostFrequent.RowNames = schemaStatistics.Select(x => x.Item1);
278      mostFrequent.ColumnNames = columnNames;
279      for (int i = 0; i < schemaStatistics.Count; ++i) {
280        var values = schemaStatistics[i].Item2;
281        for (int j = 0; j < values.Length; ++j) {
282          mostFrequent[i, j] = values[j];
283        }
284      }
285      resultCollection.Add(new Result("Generation " + generation + " Most Frequent Schemas", mostFrequent));
286
287      columnNames = new[] { "Generation", "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" };
288      // sort according to quality, then count
289      schemaStatistics.Sort((a, b) => { if (a.Item2[1].Equals(b.Item2[1])) return b.Item2[0].CompareTo(a.Item2[0]); return b.Item2[1].CompareTo(a.Item2[1]); });
290      DoubleMatrix bestSchemasMatrix;
291      if (!resultCollection.ContainsKey("Best Schemas")) {
292        bestSchemasMatrix = new DoubleMatrix(1, 7);
293        bestSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 };
294        bestSchemasMatrix.ColumnNames = columnNames;
295        var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray();
296        for (int i = 0; i < values.Length; ++i)
297          bestSchemasMatrix[0, i] = values[i];
298        resultCollection.Add(new Result("Best Schemas", bestSchemasMatrix));
299      } else {
300        bestSchemasMatrix = (DoubleMatrix)resultCollection["Best Schemas"].Value;
301        resultCollection["Best Schemas"].Value = AddRow(bestSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1);
302      }
303
304      // sort according to count, then quality
305      schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); });
306      DoubleMatrix frequentSchemasMatrix;
307      if (!resultCollection.ContainsKey("Most Frequent Schemas")) {
308        frequentSchemasMatrix = new DoubleMatrix(1, 7);
309        frequentSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 };
310        frequentSchemasMatrix.ColumnNames = columnNames;
311        var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray();
312        for (int i = 0; i < values.Length; ++i)
313          frequentSchemasMatrix[0, i] = values[i];
314        resultCollection.Add(new Result("Most Frequent Schemas", frequentSchemasMatrix));
315      } else {
316        frequentSchemasMatrix = (DoubleMatrix)resultCollection["Most Frequent Schemas"].Value;
317        resultCollection["Most Frequent Schemas"].Value = AddRow(frequentSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1);
318      }
319      return base.Apply();
320    }
321
322    private static DoubleMatrix AddRow(DoubleMatrix m, double[] row, string rowName) {
323      if (row.Length != m.Columns)
324        throw new Exception("Row value count must match matrix column count.");
325      var x = new DoubleMatrix(m.Rows + 1, m.Columns);
326      x.RowNames = m.RowNames.Concat(new[] { rowName });
327      x.ColumnNames = m.ColumnNames;
328      for (int i = 0; i < m.Rows; ++i)
329        for (int j = 0; j < m.Columns; ++j)
330          x[i, j] = m[i, j];
331      for (int j = 0; j < m.Columns; ++j)
332        x[m.Rows, j] = row[j];
333      return x;
334    }
335
336    private static double AverageSimilarity(int[] indices, List<ISymbolicExpressionTree> trees, double[,] similarityMatrix, Func<ISymbolicExpressionTree, ISymbolicExpressionTree, double> similarityFunction) {
337      var agg = 0d;
338      int len = indices.Length;
339      var count = len * (len - 1) / 2d;
340      for (int i = 0; i < indices.Length - 1; ++i) {
341        var a = indices[i];
342        for (int j = i + 1; j < indices.Length; ++j) {
343          var b = indices[j];
344          if (double.IsNaN(similarityMatrix[a, b]))
345            similarityMatrix[a, b] = similarityFunction(trees[a], trees[b]);
346          agg += similarityMatrix[a, b];
347        }
348      }
349      return agg / count;
350    }
351  }
352}
Note: See TracBrowser for help on using the repository browser.