#region License Information /* HeuristicLab * Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL) * * This file is part of HeuristicLab. * * HeuristicLab is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * HeuristicLab is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with HeuristicLab. If not, see . */ #endregion using System; using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; using HEAL.Attic; using HeuristicLab.Common; using HeuristicLab.Core; using HeuristicLab.Data; using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding; using HeuristicLab.EvolutionTracking; using HeuristicLab.Optimization; using HeuristicLab.Parameters; namespace HeuristicLab.Problems.DataAnalysis.Symbolic { [Item("SymbolicDataAnalysisSchemaFrequencyAnalyzer", "An analyzer which counts schema frequencies in the population.")] [StorableType("61ECE492-B4C2-4130-A7AE-FF280407D089")] public class SymbolicDataAnalysisSchemaFrequencyAnalyzer : EvolutionTrackingAnalyzer { private const string MinimumSchemaLengthParameterName = "MinimumSchemaLength"; private const string StrictSchemaMatchingParameterName = "StrictSchemaMatching"; private const string ProblemDataParameterName = "ProblemData"; private const string InterpreterParameterName = "SymbolicExpressionTreeInterpreter"; private const string ExecuteInParallelParameterName = "ExecuteInParallel"; private const string MaximumDegreeOfParallelismParameterName = "MaximumDegreeOfParallelism"; private const string SchemaDefinitionParameterName = "SchemaDefinition"; private static readonly Dictionary ShortNames = new Dictionary { { "Addition", "+" }, { "Subtraction", "-" }, { "Multiplication", "*" }, { "Division", "/" }, { "Exponential", "exp" }, { "Logarithm", "log" } }; [Storable] private readonly SymbolicExpressionTreePhenotypicSimilarityCalculator phenotypicSimilarityCalculator; [Storable] private readonly SymbolicExpressionTreeBottomUpSimilarityCalculator genotypicSimilarityCalculator; [Storable] private readonly ISymbolicExpressionTreeNodeEqualityComparer comparer; private QueryMatch qm; public IConstrainedValueParameter SchemaDefinitionParameter { get { return (IConstrainedValueParameter)Parameters[SchemaDefinitionParameterName]; } } public IFixedValueParameter ExecuteInParallelParameter { get { return (IFixedValueParameter)Parameters[ExecuteInParallelParameterName]; } } public IFixedValueParameter MaximumDegreeOfParallelismParameter { get { return (IFixedValueParameter)Parameters[MaximumDegreeOfParallelismParameterName]; } } public IFixedValueParameter MinimumSchemaLengthParameter { get { return (IFixedValueParameter)Parameters[MinimumSchemaLengthParameterName]; } } public IFixedValueParameter StrictSchemaMatchingParameter { get { return (IFixedValueParameter)Parameters[StrictSchemaMatchingParameterName]; } } public ILookupParameter ProblemDataParameter { get { return (ILookupParameter)Parameters[ProblemDataParameterName]; } } public ILookupParameter InterpreterParameter { get { return (ILookupParameter)Parameters[InterpreterParameterName]; } } public bool ExecuteInParallel { get { return ExecuteInParallelParameter.Value.Value; } set { ExecuteInParallelParameter.Value.Value = value; } } public int MaximumDegreeOfParallelism { get { return MaximumDegreeOfParallelismParameter.Value.Value; } set { MaximumDegreeOfParallelismParameter.Value.Value = value; } } public int MinimumSchemaLength { get { return MinimumSchemaLengthParameter.Value.Value; } set { MinimumSchemaLengthParameter.Value.Value = value; } } public bool StrictSchemaMatching { get { return StrictSchemaMatchingParameter.Value.Value; } set { StrictSchemaMatchingParameter.Value.Value = value; } } public SymbolicDataAnalysisSchemaFrequencyAnalyzer() { comparer = new SymbolicExpressionTreeNodeEqualityComparer { MatchConstantValues = false, MatchVariableNames = true, MatchVariableWeights = false }; qm = new QueryMatch(comparer) { MatchParents = true }; phenotypicSimilarityCalculator = new SymbolicExpressionTreePhenotypicSimilarityCalculator(); genotypicSimilarityCalculator = new SymbolicExpressionTreeBottomUpSimilarityCalculator { SolutionVariableName = "SymbolicExpressionTree" }; Parameters.Add(new FixedValueParameter(MinimumSchemaLengthParameterName, new IntValue(10))); Parameters.Add(new FixedValueParameter(MaximumDegreeOfParallelismParameterName, new IntValue(4))); Parameters.Add(new FixedValueParameter(StrictSchemaMatchingParameterName, new BoolValue(true))); Parameters.Add(new FixedValueParameter(ExecuteInParallelParameterName, new BoolValue(true))); Parameters.Add(new LookupParameter(ProblemDataParameterName)); Parameters.Add(new LookupParameter(InterpreterParameterName)); var schemaDefinitions = new ItemSet(new[] { new StringValue("="), new StringValue("#"), new StringValue("=,#") }); var schemaDefinitionParameter = new ConstrainedValueParameter(SchemaDefinitionParameterName, schemaDefinitions); schemaDefinitionParameter.Value = schemaDefinitions.First(); Parameters.Add(schemaDefinitionParameter); } protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(SymbolicDataAnalysisSchemaFrequencyAnalyzer original, Cloner cloner) : base(original, cloner) { comparer = cloner.Clone(original.comparer); phenotypicSimilarityCalculator = cloner.Clone(original.phenotypicSimilarityCalculator); genotypicSimilarityCalculator = cloner.Clone(original.genotypicSimilarityCalculator); MinimumSchemaLength = original.MinimumSchemaLength; StrictSchemaMatching = original.StrictSchemaMatching; qm = new QueryMatch(comparer) { MatchParents = true }; } public override IDeepCloneable Clone(Cloner cloner) { return new SymbolicDataAnalysisSchemaFrequencyAnalyzer(this, cloner); } [StorableConstructor] protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(StorableConstructorFlag _) : base(_) { } [StorableHook(HookType.AfterDeserialization)] private void AfterDeserialization() { qm = new QueryMatch(comparer) { MatchParents = true }; } public override IOperation Apply() { if (PopulationGraph == null || Generation.Value == 0 || (Generation.Value > 1 && Generation.Value % UpdateInterval.Value != 0)) return base.Apply(); comparer.MatchVariableWeights = comparer.MatchConstantValues = StrictSchemaMatching; phenotypicSimilarityCalculator.Interpreter = InterpreterParameter.ActualValue; phenotypicSimilarityCalculator.ProblemData = ProblemDataParameter.ActualValue; var generation = Generation.Value; // use all offspring produced by crossover (including those in the intermediate rank) var vertices = PopulationGraph.Vertices.Where(x => x.InDegree == 2 && x.Rank > generation - 1).OrderByDescending(x => x.Quality).ToList(); ResultCollection resultCollection; if (Results.ContainsKey("Schema Frequencies")) { resultCollection = (ResultCollection)Results["Schema Frequencies"].Value; } else { resultCollection = new ResultCollection(); var result = new Result("Schema Frequencies", resultCollection); Results.Add(result); } var mostFrequentPerGeneration = new List>(); // match the obtained schemas against the whole population var population = PopulationGraph.Vertices.Where(x => x.Rank.IsAlmost(generation)).ToList(); var trees = population.Select(x => x.Data).ToList(); var qualities = population.Select(x => x.Quality).ToList(); // cache similarity measures to speed up calculations var genSimMatrix = new double[trees.Count, trees.Count]; var phenSimMatrix = new double[trees.Count, trees.Count]; for (int i = 0; i < trees.Count - 1; ++i) { for (int j = i + 1; j < trees.Count; ++j) { genSimMatrix[i, j] = double.NaN; phenSimMatrix[i, j] = double.NaN; } } List schemas; var def = SchemaDefinitionParameter.Value.Value; switch (def) { case "=": schemas = SchemaCreator.GenerateAnyNodeSchemas(vertices, MinimumSchemaLength, 0, StrictSchemaMatching).ToList(); break; case "#": schemas = SchemaCreator.GenerateAnySubtreeSchemas(vertices, MinimumSchemaLength, 0, StrictSchemaMatching).ToList(); break; case "=,#": schemas = SchemaCreator.GenerateCombinedSchemas(vertices, MinimumSchemaLength, 0, StrictSchemaMatching).ToList(); break; default: return base.Apply(); } var schemaStrings = schemas.Select(x => x.Root.GetSubtree(0).GetSubtree(0).FormatToString(StrictSchemaMatching)).ToList(); int[][] matchingIndices = new int[schemas.Count][]; var tNodes = trees.Select(x => QueryMatch.InitializePostOrder(x.Root.GetSubtree(0).GetSubtree(0))).ToList(); if (ExecuteInParallel) { Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => { var schema = schemas[i]; var sNodes = QueryMatch.InitializePostOrder(schema.Root.GetSubtree(0).GetSubtree(0)); matchingIndices[i] = Enumerable.Range(0, trees.Count).Where(idx => qm.Match(tNodes[idx], sNodes)).ToArray(); }); } else { for (int i = 0; i < schemas.Count; ++i) { var schema = schemas[i]; var sNodes = QueryMatch.InitializePostOrder(schema.Root.GetSubtree(0).GetSubtree(0)); matchingIndices[i] = Enumerable.Range(0, trees.Count).Where(idx => qm.Match(tNodes[idx], sNodes)).ToArray(); } } var schemaStatistics = new List>(); var avgPopQuality = qualities.Average(); if (ExecuteInParallel) { var locker = new object(); Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => { var indices = matchingIndices[i]; if (indices.Length > 1) { var avgSchemaQuality = indices.Average(x => qualities[x]); var avgLength = indices.Average(x => trees[x].Length); var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity); var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity); var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality }; var t = new Tuple(schemaStrings[i], array); lock (locker) { schemaStatistics.Add(t); } } }); } else { for (int i = 0; i < schemas.Count; ++i) { var indices = matchingIndices[i]; if (indices.Length > 1) { var avgSchemaQuality = indices.Average(x => qualities[x]); var avgLength = indices.Average(x => trees[x].Length); var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity); var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity); var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality }; var t = new Tuple(schemaStrings[i], array); schemaStatistics.Add(t); } } } if (!schemaStatistics.Any()) return base.Apply(); // shouldn't ever happen var columnNames = new[] { "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" }; var mostFrequent = new DoubleMatrix(schemaStatistics.Count, schemaStatistics[0].Item2.Length) { SortableView = true }; schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); }); mostFrequentPerGeneration.Add(Tuple.Create(schemaStatistics[0].Item1, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray())); mostFrequent.RowNames = schemaStatistics.Select(x => x.Item1); mostFrequent.ColumnNames = columnNames; for (int i = 0; i < schemaStatistics.Count; ++i) { var values = schemaStatistics[i].Item2; for (int j = 0; j < values.Length; ++j) { mostFrequent[i, j] = values[j]; } } resultCollection.Add(new Result("Generation " + generation + " Most Frequent Schemas", mostFrequent)); columnNames = new[] { "Generation", "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" }; // sort according to quality, then count schemaStatistics.Sort((a, b) => { if (a.Item2[1].Equals(b.Item2[1])) return b.Item2[0].CompareTo(a.Item2[0]); return b.Item2[1].CompareTo(a.Item2[1]); }); DoubleMatrix bestSchemasMatrix; if (!resultCollection.ContainsKey("Best Schemas")) { bestSchemasMatrix = new DoubleMatrix(1, 7); bestSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 }; bestSchemasMatrix.ColumnNames = columnNames; var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(); for (int i = 0; i < values.Length; ++i) bestSchemasMatrix[0, i] = values[i]; resultCollection.Add(new Result("Best Schemas", bestSchemasMatrix)); } else { bestSchemasMatrix = (DoubleMatrix)resultCollection["Best Schemas"].Value; resultCollection["Best Schemas"].Value = AddRow(bestSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1); } // sort according to count, then quality schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); }); DoubleMatrix frequentSchemasMatrix; if (!resultCollection.ContainsKey("Most Frequent Schemas")) { frequentSchemasMatrix = new DoubleMatrix(1, 7); frequentSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 }; frequentSchemasMatrix.ColumnNames = columnNames; var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(); for (int i = 0; i < values.Length; ++i) frequentSchemasMatrix[0, i] = values[i]; resultCollection.Add(new Result("Most Frequent Schemas", frequentSchemasMatrix)); } else { frequentSchemasMatrix = (DoubleMatrix)resultCollection["Most Frequent Schemas"].Value; resultCollection["Most Frequent Schemas"].Value = AddRow(frequentSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1); } return base.Apply(); } private static DoubleMatrix AddRow(DoubleMatrix m, double[] row, string rowName) { if (row.Length != m.Columns) throw new Exception("Row value count must match matrix column count."); var x = new DoubleMatrix(m.Rows + 1, m.Columns); x.RowNames = m.RowNames.Concat(new[] { rowName }); x.ColumnNames = m.ColumnNames; for (int i = 0; i < m.Rows; ++i) for (int j = 0; j < m.Columns; ++j) x[i, j] = m[i, j]; for (int j = 0; j < m.Columns; ++j) x[m.Rows, j] = row[j]; return x; } private static double AverageSimilarity(int[] indices, List trees, double[,] similarityMatrix, Func similarityFunction) { var agg = 0d; int len = indices.Length; var count = len * (len - 1) / 2d; for (int i = 0; i < indices.Length - 1; ++i) { var a = indices[i]; for (int j = i + 1; j < indices.Length; ++j) { var b = indices[j]; if (double.IsNaN(similarityMatrix[a, b])) similarityMatrix[a, b] = similarityFunction(trees[a], trees[b]); agg += similarityMatrix[a, b]; } } return agg / count; } } }