#region License Information
/* HeuristicLab
* Copyright (C) 2002-2015 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
*
* This file is part of HeuristicLab.
*
* HeuristicLab is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* HeuristicLab is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with HeuristicLab. If not, see .
*/
#endregion
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using HeuristicLab.Common;
using HeuristicLab.Core;
using HeuristicLab.Data;
using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
using HeuristicLab.EvolutionTracking;
using HeuristicLab.Optimization;
using HeuristicLab.Parameters;
using HeuristicLab.Persistence.Default.CompositeSerializers.Storable;
namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
[Item("SymbolicDataAnalysisSchemaFrequencyAnalyzer", "An analyzer which counts schema frequencies in the population.")]
[StorableClass]
public class SymbolicDataAnalysisSchemaFrequencyAnalyzer : EvolutionTrackingAnalyzer {
private const string MinimumSchemaLengthParameterName = "MinimumSchemaLength";
private const string StrictSchemaMatchingParameterName = "StrictSchemaMatching";
private const string ProblemDataParameterName = "ProblemData";
private const string InterpreterParameterName = "SymbolicExpressionTreeInterpreter";
private const string ExecuteInParallelParameterName = "ExecuteInParallel";
private const string MaximumDegreeOfParallelismParameterName = "MaximumDegreeOfParallelism";
private static readonly Dictionary ShortNames = new Dictionary {
{ "Addition", "+" },
{ "Subtraction", "-" },
{ "Multiplication", "*" },
{ "Division", "/" },
{ "Exponential", "exp" },
{ "Logarithm", "log" }
};
[Storable]
private readonly SymbolicExpressionTreePhenotypicSimilarityCalculator phenotypicSimilarityCalculator;
[Storable]
private readonly SymbolicExpressionTreeBottomUpSimilarityCalculator genotypicSimilarityCalculator;
[Storable]
private readonly ISymbolicExpressionTreeNodeEqualityComparer comparer;
private QueryMatch qm;
public IFixedValueParameter ExecuteInParallelParameter {
get { return (IFixedValueParameter)Parameters[ExecuteInParallelParameterName]; }
}
public IFixedValueParameter MaximumDegreeOfParallelismParameter {
get { return (IFixedValueParameter)Parameters[MaximumDegreeOfParallelismParameterName]; }
}
public IFixedValueParameter MinimumSchemaLengthParameter {
get { return (IFixedValueParameter)Parameters[MinimumSchemaLengthParameterName]; }
}
public IFixedValueParameter StrictSchemaMatchingParameter {
get { return (IFixedValueParameter)Parameters[StrictSchemaMatchingParameterName]; }
}
public ILookupParameter ProblemDataParameter {
get { return (ILookupParameter)Parameters[ProblemDataParameterName]; }
}
public ILookupParameter InterpreterParameter {
get { return (ILookupParameter)Parameters[InterpreterParameterName]; }
}
public bool ExecuteInParallel {
get { return ExecuteInParallelParameter.Value.Value; }
set { ExecuteInParallelParameter.Value.Value = value; }
}
public int MaximumDegreeOfParallelism {
get { return MaximumDegreeOfParallelismParameter.Value.Value; }
set { MaximumDegreeOfParallelismParameter.Value.Value = value; }
}
public int MinimumSchemaLength {
get { return MinimumSchemaLengthParameter.Value.Value; }
set { MinimumSchemaLengthParameter.Value.Value = value; }
}
public bool StrictSchemaMatching {
get { return StrictSchemaMatchingParameter.Value.Value; }
set { StrictSchemaMatchingParameter.Value.Value = value; }
}
public SymbolicDataAnalysisSchemaFrequencyAnalyzer() {
comparer = new SymbolicExpressionTreeNodeEqualityComparer {
MatchConstantValues = false,
MatchVariableNames = true,
MatchVariableWeights = false
};
qm = new QueryMatch(comparer) { MatchParents = true };
phenotypicSimilarityCalculator = new SymbolicExpressionTreePhenotypicSimilarityCalculator();
genotypicSimilarityCalculator = new SymbolicExpressionTreeBottomUpSimilarityCalculator { SolutionVariableName = "SymbolicExpressionTree" };
Parameters.Add(new FixedValueParameter(MinimumSchemaLengthParameterName, new IntValue(10)));
Parameters.Add(new FixedValueParameter(MaximumDegreeOfParallelismParameterName, new IntValue(4)));
Parameters.Add(new FixedValueParameter(StrictSchemaMatchingParameterName, new BoolValue(true)));
Parameters.Add(new FixedValueParameter(ExecuteInParallelParameterName, new BoolValue(true)));
Parameters.Add(new LookupParameter(ProblemDataParameterName));
Parameters.Add(new LookupParameter(InterpreterParameterName));
}
protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(SymbolicDataAnalysisSchemaFrequencyAnalyzer original,
Cloner cloner) : base(original, cloner) {
comparer = cloner.Clone(original.comparer);
phenotypicSimilarityCalculator = cloner.Clone(original.phenotypicSimilarityCalculator);
genotypicSimilarityCalculator = cloner.Clone(original.genotypicSimilarityCalculator);
MinimumSchemaLength = original.MinimumSchemaLength;
StrictSchemaMatching = original.StrictSchemaMatching;
qm = new QueryMatch(comparer) { MatchParents = true };
}
public override IDeepCloneable Clone(Cloner cloner) {
return new SymbolicDataAnalysisSchemaFrequencyAnalyzer(this, cloner);
}
[StorableConstructor]
protected SymbolicDataAnalysisSchemaFrequencyAnalyzer(bool deserializing) : base(deserializing) { }
[StorableHook(HookType.AfterDeserialization)]
private void AfterDeserialization() {
qm = new QueryMatch(comparer) { MatchParents = true };
}
public override IOperation Apply() {
if (PopulationGraph == null || Generation.Value == 0 ||
(Generation.Value > 1 && Generation.Value % UpdateInterval.Value != 0))
return base.Apply();
comparer.MatchVariableWeights = comparer.MatchConstantValues = StrictSchemaMatching;
phenotypicSimilarityCalculator.Interpreter = InterpreterParameter.ActualValue;
phenotypicSimilarityCalculator.ProblemData = ProblemDataParameter.ActualValue;
var generation = Generation.Value;
// use all offspring produced by crossover (including those in the intermediate rank)
var vertices = PopulationGraph.Vertices.Where(x => x.InDegree == 2 && x.Rank > generation - 1).OrderByDescending(x => x.Quality).ToList();
ResultCollection resultCollection;
if (Results.ContainsKey("Schema Frequencies")) {
resultCollection = (ResultCollection)Results["Schema Frequencies"].Value;
} else {
resultCollection = new ResultCollection();
var result = new Result("Schema Frequencies", resultCollection);
Results.Add(result);
}
var mostFrequentPerGeneration = new List>();
// match the obtained schemas against the whole population
var population = PopulationGraph.Vertices.Where(x => x.Rank.IsAlmost(generation)).ToList();
var trees = population.Select(x => x.Data).ToList();
var qualities = population.Select(x => x.Quality).ToList();
// cache similarity measures to speed up calculations
var genSimMatrix = new double[trees.Count, trees.Count];
var phenSimMatrix = new double[trees.Count, trees.Count];
for (int i = 0; i < trees.Count - 1; ++i) {
for (int j = i + 1; j < trees.Count; ++j) {
genSimMatrix[i, j] = double.NaN;
phenSimMatrix[i, j] = double.NaN;
}
}
var schemas = SchemaCreator.GenerateSchemas(vertices, MinimumSchemaLength, 0, StrictSchemaMatching).ToList();
var schemaStrings = schemas.Select(x => x.Root.GetSubtree(0).GetSubtree(0).FormatToString(StrictSchemaMatching)).ToList();
int[][] matchingIndices;
if (ExecuteInParallel) {
matchingIndices = new int[schemas.Count][];
Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => {
var schema = schemas[i];
matchingIndices[i] = Enumerable.Range(0, trees.Count).Where(v => qm.Match(trees[v], schema)).ToArray();
});
} else {
matchingIndices = schemas.Select(x => Enumerable.Range(0, trees.Count).Where(v => qm.Match(trees[v], x)).ToArray()).ToArray();
}
var schemaStatistics = new List>();
var avgPopQuality = qualities.Average();
if (ExecuteInParallel) {
var locker = new object();
Parallel.For(0, schemas.Count, new ParallelOptions { MaxDegreeOfParallelism = MaximumDegreeOfParallelism }, i => {
var indices = matchingIndices[i];
if (indices.Length > 1) {
var avgSchemaQuality = indices.Average(x => qualities[x]);
var avgLength = indices.Average(x => trees[x].Length);
var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity);
var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity);
var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality };
var t = new Tuple(schemaStrings[i], array);
lock (locker) {
schemaStatistics.Add(t);
}
}
});
} else {
for (int i = 0; i < schemas.Count; ++i) {
var indices = matchingIndices[i];
if (indices.Length > 1) {
var avgSchemaQuality = indices.Average(x => qualities[x]);
var avgLength = indices.Average(x => trees[x].Length);
var avgGenSim = AverageSimilarity(indices, trees, genSimMatrix, genotypicSimilarityCalculator.CalculateSimilarity);
var avgPhenSim = AverageSimilarity(indices, trees, phenSimMatrix, phenotypicSimilarityCalculator.CalculateSimilarity);
var array = new[] { indices.Length, avgSchemaQuality, avgLength, avgGenSim, avgPhenSim, avgPopQuality };
var t = new Tuple(schemaStrings[i], array);
schemaStatistics.Add(t);
}
}
}
if (!schemaStatistics.Any()) return base.Apply(); // shouldn't ever happen
var columnNames = new[] { "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" };
var mostFrequent = new DoubleMatrix(schemaStatistics.Count, schemaStatistics[0].Item2.Length) {
SortableView = true
};
schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); });
mostFrequentPerGeneration.Add(Tuple.Create(schemaStatistics[0].Item1, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray()));
mostFrequent.RowNames = schemaStatistics.Select(x => x.Item1);
mostFrequent.ColumnNames = columnNames;
for (int i = 0; i < schemaStatistics.Count; ++i) {
var values = schemaStatistics[i].Item2;
for (int j = 0; j < values.Length; ++j) {
mostFrequent[i, j] = values[j];
}
}
resultCollection.Add(new Result("Generation " + generation + " Most Frequent Schemas", mostFrequent));
columnNames = new[] { "Generation", "Count", "Avg Quality", "Avg Length", "Avg Genotype Similarity", "Avg Phenotype Similarity", "Avg Population Quality" };
// sort according to quality, then count
schemaStatistics.Sort((a, b) => { if (a.Item2[1].Equals(b.Item2[1])) return b.Item2[0].CompareTo(a.Item2[0]); return b.Item2[1].CompareTo(a.Item2[1]); });
DoubleMatrix bestSchemasMatrix;
if (!resultCollection.ContainsKey("Best Schemas")) {
bestSchemasMatrix = new DoubleMatrix(1, 7);
bestSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 };
bestSchemasMatrix.ColumnNames = columnNames;
var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray();
for (int i = 0; i < values.Length; ++i)
bestSchemasMatrix[0, i] = values[i];
resultCollection.Add(new Result("Best Schemas", bestSchemasMatrix));
} else {
bestSchemasMatrix = (DoubleMatrix)resultCollection["Best Schemas"].Value;
resultCollection["Best Schemas"].Value = AddRow(bestSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1);
}
// sort according to count, then quality
schemaStatistics.Sort((a, b) => { if (a.Item2[0].Equals(b.Item2[0])) return b.Item2[1].CompareTo(a.Item2[1]); return b.Item2[0].CompareTo(a.Item2[0]); });
DoubleMatrix frequentSchemasMatrix;
if (!resultCollection.ContainsKey("Most Frequent Schemas")) {
frequentSchemasMatrix = new DoubleMatrix(1, 7);
frequentSchemasMatrix.RowNames = new[] { schemaStatistics[0].Item1 };
frequentSchemasMatrix.ColumnNames = columnNames;
var values = new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray();
for (int i = 0; i < values.Length; ++i)
frequentSchemasMatrix[0, i] = values[i];
resultCollection.Add(new Result("Most Frequent Schemas", frequentSchemasMatrix));
} else {
frequentSchemasMatrix = (DoubleMatrix)resultCollection["Most Frequent Schemas"].Value;
resultCollection["Most Frequent Schemas"].Value = AddRow(frequentSchemasMatrix, new[] { (double)generation }.Concat(schemaStatistics[0].Item2).ToArray(), schemaStatistics[0].Item1);
}
return base.Apply();
}
private static DoubleMatrix AddRow(DoubleMatrix m, double[] row, string rowName) {
if (row.Length != m.Columns)
throw new Exception("Row value count must match matrix column count.");
var x = new DoubleMatrix(m.Rows + 1, m.Columns);
x.RowNames = m.RowNames.Concat(new[] { rowName });
x.ColumnNames = m.ColumnNames;
for (int i = 0; i < m.Rows; ++i)
for (int j = 0; j < m.Columns; ++j)
x[i, j] = m[i, j];
for (int j = 0; j < m.Columns; ++j)
x[m.Rows, j] = row[j];
return x;
}
private static double AverageSimilarity(int[] indices, List trees, double[,] similarityMatrix, Func similarityFunction) {
var agg = 0d;
int len = indices.Length;
var count = len * (len - 1) / 2d;
for (int i = 0; i < indices.Length - 1; ++i) {
var a = indices[i];
for (int j = i + 1; j < indices.Length; ++j) {
var b = indices[j];
if (double.IsNaN(similarityMatrix[a, b]))
similarityMatrix[a, b] = similarityFunction(trees[a], trees[b]);
agg += similarityMatrix[a, b];
}
}
return agg / count;
}
}
}