Free cookie consent management tool by TermsFeed Policy Generator

Changeset 13624


Ignore:
Timestamp:
02/18/16 19:33:54 (9 years ago)
Author:
bburlacu
Message:

#1772: Improve the SchemaFrequencyAnalyzer and add calculation of schema genotype and phenotype diversities.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/HeuristicLab.EvolutionTracking/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Tracking/Analyzers/SymbolicDataAnalysisSchemaFrequencyAnalyzer.cs

    r13481 r13624  
    2020#endregion
    2121
     22using System;
     23using System.Collections.Generic;
    2224using System.Linq;
     25using System.Text;
     26using System.Threading.Tasks;
    2327using HeuristicLab.Common;
    2428using HeuristicLab.Core;
     
    3539  public class SymbolicDataAnalysisSchemaFrequencyAnalyzer : EvolutionTrackingAnalyzer<ISymbolicExpressionTree> {
    3640    private const string MinimumSchemaLengthParameterName = "MinimumSchemaLength";
     41    private const string StrictSchemaMatchingParameterName = "StrictSchemaMatching";
     42    private const string ProblemDataParameterName = "ProblemData";
     43    private const string InterpreterParameterName = "SymbolicExpressionTreeInterpreter";
     44    private const string ExecuteInParallelParameterName = "ExecuteInParallel";
     45    private const string MaximumDegreeOfParallelismParameterName = "MaximumDegreeOfParallelism";
     46
     47    private static readonly Dictionary<string, string> ShortNames = new Dictionary<string, string> {
     48    { "Addition", "+" },
     49    { "Subtraction", "-" },
     50    { "Multiplication", "*" },
     51    { "Division", "/" },
     52    { "Exponential", "exp" },
     53    { "Logarithm", "log" }
     54  };
    3755
    3856    [Storable]
     
    4664    private QueryMatch qm;
    4765
     66    public IFixedValueParameter<BoolValue> ExecuteInParallelParameter {
     67      get { return (IFixedValueParameter<BoolValue>)Parameters[ExecuteInParallelParameterName]; }
     68    }
     69
     70    public IFixedValueParameter<IntValue> MaximumDegreeOfParallelismParameter {
     71      get { return (IFixedValueParameter<IntValue>)Parameters[MaximumDegreeOfParallelismParameterName]; }
     72    }
     73
    4874    public IFixedValueParameter<IntValue> MinimumSchemaLengthParameter {
    4975      get { return (IFixedValueParameter<IntValue>)Parameters[MinimumSchemaLengthParameterName]; }
     76    }
     77
     78    public IFixedValueParameter<BoolValue> StrictSchemaMatchingParameter {
     79      get { return (IFixedValueParameter<BoolValue>)Parameters[StrictSchemaMatchingParameterName]; }
     80    }
     81
     82    public ILookupParameter<IRegressionProblemData> ProblemDataParameter {
     83      get { return (ILookupParameter<IRegressionProblemData>)Parameters[ProblemDataParameterName]; }
     84    }
     85
     86    public ILookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter> InterpreterParameter {
     87      get { return (ILookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter>)Parameters[InterpreterParameterName]; }
     88    }
     89
     90    public bool ExecuteInParallel {
     91      get { return ExecuteInParallelParameter.Value.Value; }
     92      set { ExecuteInParallelParameter.Value.Value = value; }
     93    }
     94
     95    public int MaximumDegreeOfParallelism {
     96      get { return MaximumDegreeOfParallelismParameter.Value.Value; }
     97      set { MaximumDegreeOfParallelismParameter.Value.Value = value; }
     98    }
     99
     100    public int MinimumSchemaLength {
     101      get { return MinimumSchemaLengthParameter.Value.Value; }
     102      set { MinimumSchemaLengthParameter.Value.Value = value; }
     103    }
     104
     105    public bool StrictSchemaMatching {
     106      get { return StrictSchemaMatchingParameter.Value.Value; }
     107      set { StrictSchemaMatchingParameter.Value.Value = value; }
    50108    }
    51109
     
    60118      genotypicSimilarityCalculator = new SymbolicExpressionTreeBottomUpSimilarityCalculator { SolutionVariableName = "SymbolicExpressionTree" };
    61119      Parameters.Add(new FixedValueParameter<IntValue>(MinimumSchemaLengthParameterName, new IntValue(10)));
     120      Parameters.Add(new FixedValueParameter<IntValue>(MaximumDegreeOfParallelismParameterName, new IntValue(4)));
     121      Parameters.Add(new FixedValueParameter<BoolValue>(StrictSchemaMatchingParameterName, new BoolValue(true)));
     122      Parameters.Add(new FixedValueParameter<BoolValue>(ExecuteInParallelParameterName, new BoolValue(true)));
     123      Parameters.Add(new LookupParameter<IRegressionProblemData>(ProblemDataParameterName));
     124      Parameters.Add(new LookupParameter<ISymbolicDataAnalysisExpressionTreeInterpreter>(InterpreterParameterName));
    62125    }
    63126
    64127    protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(SymbolicDataAnalysisSchemaFrequencyAnalyzer original,
    65128      Cloner cloner) : base(original, cloner) {
    66       comparer = original.comparer;
    67       phenotypicSimilarityCalculator = original.phenotypicSimilarityCalculator;
    68       genotypicSimilarityCalculator = original.genotypicSimilarityCalculator;
     129      comparer = cloner.Clone(original.comparer);
     130      phenotypicSimilarityCalculator = cloner.Clone(original.phenotypicSimilarityCalculator);
     131      genotypicSimilarityCalculator = cloner.Clone(original.genotypicSimilarityCalculator);
     132      MinimumSchemaLength = original.MinimumSchemaLength;
     133      StrictSchemaMatching = original.StrictSchemaMatching;
    69134      qm = new QueryMatch(comparer) { MatchParents = true };
    70135    }
     
    76141    [StorableConstructor]
    77142    protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(bool deserializing) : base(deserializing) { }
    78 
    79143
    80144    [StorableHook(HookType.AfterDeserialization)]
     
    84148
    85149    public override IOperation Apply() {
    86       int updateInterval = UpdateIntervalParameter.Value.Value;
    87       IntValue updateCounter = UpdateCounterParameter.ActualValue;
    88       // if counter does not yet exist then initialize it with update interval
    89       // to make sure the solutions are analyzed on the first application of this operator
    90       if (updateCounter == null) {
    91         updateCounter = new IntValue(updateInterval);
    92         UpdateCounterParameter.ActualValue = updateCounter;
    93       }
    94       //analyze solutions only every 'updateInterval' times
    95       if (updateCounter.Value != updateInterval) {
    96         updateCounter.Value++;
     150      if (PopulationGraph == null || Generation.Value == 0 ||
     151          (Generation.Value > 1 && Generation.Value % UpdateInterval.Value != 0))
    97152        return base.Apply();
    98       }
    99       updateCounter.Value = 1;
    100 
    101       if (PopulationGraph == null || Generation.Value == 0)
    102         return base.Apply();
    103 
    104       var minimumSchemaLength = MinimumSchemaLengthParameter.Value.Value;
    105       var vertices = PopulationGraph.GetByRank(Generation.Value).Cast<IGenealogyGraphNode<ISymbolicExpressionTree>>().ToList();
    106       var formatter = new SymbolicExpressionTreeStringFormatter { Indent = false, AppendNewLines = false };
    107 
    108       var schemas = SchemaCreator.GenerateSchemas(vertices, minimumSchemaLength).ToList();
    109       var trees = vertices.Select(x => x.Data).ToList();
    110       var qualities = vertices.Select(x => x.Quality).ToList();
    111       var scopes = ExecutionContext.Scope.SubScopes; // the scopes of all the individuals, needed because the tree evaluated values are stored in the scope and we use them for the phenotypic similarity
    112 
    113       var matrix = new DoubleMatrix(schemas.Count, 5) {
    114         RowNames = schemas.Select(x => formatter.Format(x.Root.GetSubtree(0).GetSubtree(0))),
    115         ColumnNames = new[] { "Avg. Length", "Avg. Quality", "Relative Frequency", "Avg. Phen. Sim.", "Avg. Gen. Sim" },
    116         SortableView = true
    117       };
    118 
    119       for (int i = 0; i < schemas.Count; ++i) {
    120         var schema = schemas[i];
    121         int count = 0;
    122         double avgLength = 0;
    123         double avgQuality = 0;
    124         var matchingScopes = new ScopeList();
    125         for (int j = 0; j < trees.Count; ++j) {
    126           var tree = trees[j];
    127           if (qm.Match(tree, schema)) {
    128             count++;
    129             avgLength += tree.Length;
    130             avgQuality += qualities[j];
    131             matchingScopes.Add(scopes[j]);
     153
     154      comparer.MatchVariableWeights = comparer.MatchConstantValues = StrictSchemaMatching;
     155      phenotypicSimilarityCalculator.Interpreter = InterpreterParameter.ActualValue;
     156      phenotypicSimilarityCalculator.ProblemData = ProblemDataParameter.ActualValue;
     157      var generation = Generation.Value;
     158
     159      var population = PopulationGraph.GetByRank(generation).Cast<IGenealogyGraphNode<ISymbolicExpressionTree>>().ToList();
     160      var vertices = population.Where(x => x.InDegree == 2).OrderByDescending(x => x.Quality).ToList();
     161      ResultCollection resultCollection;
     162      if (Results.ContainsKey("Schema Frequencies")) {
     163        resultCollection = (ResultCollection)Results["Schema Frequencies"].Value;
     164      } else {
     165        resultCollection = new ResultCollection();
     166        var result = new Result("Schema Frequencies", resultCollection);
     167        Results.Add(result);
     168      }
     169
     170      var mostFrequentPerGeneration = new List<Tuple<string, double[]>>();
     171
     172      var trees = population.Select(x => x.Data).ToList();
     173      var qualities = population.Select(x => x.Quality).ToList();
     174      var genSimMatrix = new double[trees.Count, trees.Count];
     175      var phenSimMatrix = new double[trees.Count, trees.Count];
     176
     177      for (int i = 0; i < trees.Count - 1; ++i) {
     178        for (int j = i + 1; j < trees.Count; ++j) {
     179          genSimMatrix[i, j] = double.NaN;
     180          phenSimMatrix[i, j] = double.NaN;
     181        }
     182      }
     183      var schemas = SchemaCreator.GenerateSchemas(vertices, MinimumSchemaLength, StrictSchemaMatching).ToList();
     184      var schemaStrings = schemas.Select(x => SubtreeToString(x.Root.GetSubtree(0).GetSubtree(0), StrictSchemaMatching)).ToList();
     185      int[][] matchingIndices;
     186      if (ExecuteInParallel) {
     187        matchingIndices = new int[schemas.Count][];
     188        Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => {
     189          var schema = schemas[i];
     190          matchingIndices[i] = Enumerable.Range(0, trees.Count).Where(v => qm.Match(trees[v].Root, schema.Root)).ToArray();
     191        });
     192      } else {
     193        matchingIndices = schemas.Select(x => Enumerable.Range(0, trees.Count).Where(v => qm.Match(trees[v].Root, x.Root)).ToArray()).ToArray();
     194      }
     195
     196      var schemaStatistics = new List<Tuple<string, double[]>>();
     197      var avgPopQuality = qualities.Average();
     198
     199      if (ExecuteInParallel) {
     200        var locker = new object();
     201        Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => {
     202          var indices = matchingIndices[i];
     203          if (indices.Length > 1) {
     204            var avgSchemaQuality = indices.Average(x => qualities[x]);
     205            var avgLength = indices.Average(x => trees[x].Length);
     206            var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity);
     207            var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity);
     208            var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality };
     209            var t = new Tuple<string, double[]>(schemaStrings[i], array);
     210            lock (locker) {
     211              schemaStatistics.Add(t);
     212            }
    132213          }
    133         }
    134         avgLength /= count;
    135         avgQuality /= count;
    136         double relativeFreq = (double)count / trees.Count;
    137         double avgPhenotypicSimilarity = SchemaEvaluator.CalculateSimilarity(matchingScopes, phenotypicSimilarityCalculator, true, 4);
    138         double avgGenotypicSimilarity = SchemaEvaluator.CalculateSimilarity(matchingScopes, genotypicSimilarityCalculator, true, 4);
    139 
    140         matrix[i, 0] = avgLength;
    141         matrix[i, 1] = avgQuality;
    142         matrix[i, 2] = relativeFreq;
    143         matrix[i, 3] = avgPhenotypicSimilarity;
    144         matrix[i, 4] = avgGenotypicSimilarity;
    145       }
    146 
    147       if (Results.ContainsKey("Schema Frequencies")) {
    148         var result = Results["Schema Frequencies"];
    149         result.Value = matrix;
    150       } else {
    151         var result = new Result("Schema Frequencies", matrix);
    152         Results.Add(result);
    153       }
    154 
     214        });
     215      } else {
     216        for (int i = 0; i < schemas.Count; ++i) {
     217          var indices = matchingIndices[i];
     218          if (indices.Length > 1) {
     219            var avgSchemaQuality = indices.Average(x => qualities[x]);
     220            var avgLength = indices.Average(x => trees[x].Length);
     221            var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity);
     222            var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity);
     223            var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality };
     224            var t = new Tuple<string, double[]>(schemaStrings[i], array);
     225            schemaStatistics.Add(t);
     226          }
     227        }
     228      }
     229
     230      if (!schemaStatistics.Any()) return base.Apply(); // shouldn't ever happen
     231      var columnNames = new[] { "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" };
     232      var mostFrequent = new DoubleMatrix(schemaStatistics.Count, schemaStatistics[0].Item2.Length);
     233      mostFrequent.SortableView = true;
     234      schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); });
     235      mostFrequentPerGeneration.Add(new Tuple<string, double[]>(schemaStatistics[0].Item1, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray()));
     236      mostFrequent.RowNames = schemaStatistics.Select(x => x.Item1);
     237      mostFrequent.ColumnNames = columnNames;
     238      for (int i = 0; i < schemaStatistics.Count; ++i) {
     239        var values = schemaStatistics[i].Item2;
     240        for (int j = 0; j < values.Length; ++j) {
     241          mostFrequent[i, j] = values[j];
     242        }
     243      }
     244      resultCollection.Add(new Result("Generation " + generation + " Most Frequent Schemas", mostFrequent));
     245
     246      columnNames = new[] { "Generation", "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" };
     247      // sort according to quality, then count
     248      schemaStatistics.Sort((a, b) => { if (a.Item2[1].Equals(b.Item2[1])) return b.Item2[0].CompareTo(a.Item2[0]); return b.Item2[1].CompareTo(a.Item2[1]); });
     249      DoubleMatrix bestSchemasMatrix;
     250      if (!resultCollection.ContainsKey("Best Schemas")) {
     251        bestSchemasMatrix = new DoubleMatrix(1, 7);
     252        bestSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 };
     253        bestSchemasMatrix.ColumnNames = columnNames;
     254        var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray();
     255        for (int i = 0; i < values.Length; ++i)
     256          bestSchemasMatrix[0, i] = values[i];
     257        resultCollection.Add(new Result("Best Schemas", bestSchemasMatrix));
     258      } else {
     259        bestSchemasMatrix = (DoubleMatrix)resultCollection["Best Schemas"].Value;
     260        resultCollection["Best Schemas"].Value = AddRow(bestSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1);
     261      }
     262
     263      // sort according to count, then quality
     264      schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); });
     265      DoubleMatrix frequentSchemasMatrix;
     266      if (!resultCollection.ContainsKey("Most Frequent Schemas")) {
     267        frequentSchemasMatrix = new DoubleMatrix(1, 7);
     268        frequentSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 };
     269        frequentSchemasMatrix.ColumnNames = columnNames;
     270        var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray();
     271        for (int i = 0; i < values.Length; ++i)
     272          frequentSchemasMatrix[0, i] = values[i];
     273        resultCollection.Add(new Result("Most Frequent Schemas", frequentSchemasMatrix));
     274      } else {
     275        frequentSchemasMatrix = (DoubleMatrix)resultCollection["Most Frequent Schemas"].Value;
     276        resultCollection["Most Frequent Schemas"].Value = AddRow(frequentSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1);
     277      }
    155278      return base.Apply();
     279    }
     280
     281    private static DoubleMatrix AddRow(DoubleMatrix m, double[] row, string rowName) {
     282      if (row.Length != m.Columns)
     283        throw new Exception("Row value count must match matrix column count.");
     284      var x = new DoubleMatrix(m.Rows + 1, m.Columns);
     285      x.RowNames = m.RowNames.Concat(new[] { rowName });
     286      x.ColumnNames = m.ColumnNames;
     287      for (int i = 0; i < m.Rows; ++i)
     288        for (int j = 0; j < m.Columns; ++j)
     289          x[i, j] = m[i, j];
     290      for (int j = 0; j < m.Columns; ++j)
     291        x[m.Rows, j] = row[j];
     292      return x;
     293    }
     294
     295    private static double AverageSimilarity(int[] indices, List<ISymbolicExpressionTree> trees, double[,] similarityMatrix, Func<ISymbolicExpressionTree, ISymbolicExpressionTree, double> similarityFunction) {
     296      var agg = 0d;
     297      int len = indices.Length;
     298      var count = len * (len - 1) / 2d;
     299      for (int i = 0; i < indices.Length - 1; ++i) {
     300        var a = indices[i];
     301        for (int j = i + 1; j < indices.Length; ++j) {
     302          var b = indices[j];
     303          if (double.IsNaN(similarityMatrix[a, b])) similarityMatrix[a, b] = similarityFunction(trees[a], trees[b]);
     304          agg += similarityMatrix[a, b];
     305        }
     306      }
     307      return agg / count;
     308    }
     309
     310    private static string SubtreeToString(ISymbolicExpressionTreeNode node, bool strict = false) {
     311      StringBuilder strBuilder = new StringBuilder();
     312      // internal nodes or leaf nodes?
     313      if (node is AnySubtree)
     314        return "# ";
     315
     316      if (node.SubtreeCount > 0) {
     317        strBuilder.Append("(");
     318        // symbol on same line as '('
     319        string label = string.Empty;
     320        if (node is AnyNode)
     321          label = "=";
     322        else {
     323          var name = node.Symbol.Name;
     324          label = ShortNames.ContainsKey(name) ? ShortNames[name] : name;
     325        }
     326        strBuilder.Append(label + " ");
     327        // each subtree expression on a new line
     328        // and closing ')' also on new line
     329        foreach (var subtree in node.Subtrees) {
     330          strBuilder.Append(SubtreeToString(subtree, strict));
     331        }
     332        strBuilder.Append(") ");
     333      } else {
     334        // symbol in the same line with as '(' and ')'
     335        var v = node as VariableTreeNode;
     336        var c = node as ConstantTreeNode;
     337        var w = node as AnyNode; // wildcard
     338        string label = string.Empty;
     339        if (w != null)
     340          label = "=";
     341        else if (v != null)
     342          label = strict ? string.Format("{0:0.00}_{1}", v.Weight, v.VariableName) : string.Format("{0}", v.VariableName);
     343        else if (c != null)
     344          label = strict ? string.Format("{0:0.00}", c.Value) : "C";
     345        strBuilder.Append(label);
     346        if (node.Parent != null && node != node.Parent.Subtrees.Last())
     347          strBuilder.Append(" ");
     348      }
     349      return strBuilder.ToString();
    156350    }
    157351  }
Note: See TracChangeset for help on using the changeset viewer.