Changeset 16193


Ignore:
Timestamp:
09/28/18 15:22:58 (12 months ago)
Author:
bburlacu
Message:

#2886: Implement new hasher (faster & less collision prone) and update unit tests

Location:
branches/2886_SymRegGrammarEnumeration
Files:
2 added
2 edited

Legend:

Unmodified
Added
Removed
  • branches/2886_SymRegGrammarEnumeration/HeuristicLab.Algorithms.DataAnalysis.SymRegGrammarEnumeration/HeuristicLab.Algorithms.DataAnalysis.SymRegGrammarEnumeration.csproj

    r16053 r16193  
    128128    <Compile Include="Hashing\Hasher.cs" />
    129129    <Compile Include="GrammarEnumeration\SearchDataStructure.cs" />
     130    <Compile Include="Hashing\HashExtensions.cs" />
     131    <Compile Include="Hashing\HashUtil.cs" />
    130132    <Compile Include="ProblemInstances\AircraftMaximumLift.cs" />
    131133    <Compile Include="ProblemInstances\FluidDynamics.cs" />
  • branches/2886_SymRegGrammarEnumeration/Test/TreeHashingTest.cs

    r16159 r16193  
    11using System;
    22using System.Collections.Generic;
     3using System.Diagnostics;
    34using System.Linq;
     5using System.Security.Cryptography;
    46using HeuristicLab.Algorithms.DataAnalysis.SymRegGrammarEnumeration;
    57using Microsoft.VisualStudio.TestTools.UnitTesting;
     8using HierarchicalFormatter = HeuristicLab.Encodings.SymbolicExpressionTreeEncoding.SymbolicExpressionTreeHierarchicalFormatter;
    69
    710namespace Test {
     
    1518    private TerminalSymbol c;
    1619
     20    Func<Grammar, SymbolList, int> ComputeHash = (grammar, sentence) => grammar.ComputeHash(null, sentence);
     21
    1722    [TestInitialize]
    1823    public void InitTest() {
     
    3136      SymbolList s2 = new SymbolList(new[] { varA, varB, grammar.Addition, varC, grammar.Addition });
    3237
    33       int hash1 = grammar.Hasher.CalcHashCode(s1);
    34       int hash2 = grammar.Hasher.CalcHashCode(s2);
     38      int hash1 = ComputeHash(grammar, s1);
     39      int hash2 = ComputeHash(grammar, s2);
    3540
    3641      Assert.AreEqual(hash1, hash2);
     
    4348      SymbolList s2 = new SymbolList(new[] { varB, varB, grammar.Addition, varB, grammar.Addition });
    4449
    45       int hash1 = grammar.Hasher.CalcHashCode(s1);
    46       int hash2 = grammar.Hasher.CalcHashCode(s2);
     50      int hash1 = ComputeHash(grammar, s1);
     51      int hash2 = ComputeHash(grammar, s2);
    4752
    4853      Assert.AreNotEqual(hash1, hash2);
     
    5560      SymbolList s2 = new SymbolList(new[] { varB, varA, grammar.Addition });
    5661
    57       int hash1 = grammar.Hasher.CalcHashCode(s1);
    58       int hash2 = grammar.Hasher.CalcHashCode(s2);
     62      int hash1 = ComputeHash(grammar, s1);
     63      int hash2 = ComputeHash(grammar, s2);
    5964
    6065      Assert.AreEqual(hash1, hash2);
     
    6772      SymbolList s2 = new SymbolList(new[] { varA, varB, varA, grammar.Addition, grammar.Addition });
    6873
    69       int hash1 = grammar.Hasher.CalcHashCode(s1);
    70       int hash2 = grammar.Hasher.CalcHashCode(s2);
     74      int hash1 = ComputeHash(grammar, s1);
     75      int hash2 = ComputeHash(grammar, s2);
    7176
    7277      Assert.AreEqual(hash1, hash2);
     
    7984      SymbolList s2 = new SymbolList(new[] { varA });
    8085
    81       int hash1 = grammar.Hasher.CalcHashCode(s1);
    82       int hash2 = grammar.Hasher.CalcHashCode(s2);
     86      int hash1 = ComputeHash(grammar, s1);
     87      int hash2 = ComputeHash(grammar, s2);
    8388
    8489      Assert.AreEqual(hash1, hash2);
     
    9196      SymbolList s2 = new SymbolList(new[] { varA, varA, varA, grammar.Multiplication, grammar.Addition });
    9297
    93       int hash1 = grammar.Hasher.CalcHashCode(s1);
    94       int hash2 = grammar.Hasher.CalcHashCode(s2);
     98      int hash1 = ComputeHash(grammar, s1);
     99      int hash2 = ComputeHash(grammar, s2);
    95100
    96101      Assert.AreNotEqual(hash1, hash2);
     
    103108      SymbolList s2 = new SymbolList(new Symbol[] { varA, grammar.Expr, grammar.Addition });
    104109
    105       int hash1 = grammar.Hasher.CalcHashCode(s1);
    106       int hash2 = grammar.Hasher.CalcHashCode(s2);
     110      int hash1 = ComputeHash(grammar, s1);
     111      int hash2 = ComputeHash(grammar, s2);
    107112
    108113      Assert.AreEqual(hash1, hash2);
     
    117122      SymbolList s2 = new SymbolList(new Symbol[] { varA, varB, grammar.Multiplication });
    118123
    119       int hash1 = grammar.Hasher.CalcHashCode(s1);
    120       int hash2 = grammar.Hasher.CalcHashCode(s2);
     124      int hash1 = ComputeHash(grammar, s1);
     125      int hash2 = ComputeHash(grammar, s2);
    121126
    122127      Assert.AreEqual(hash1, hash2);
     
    129134      SymbolList s2 = new SymbolList(new Symbol[] { varA, varA, varA, grammar.Multiplication, grammar.Addition, grammar.Sin, varA, grammar.Inv, varA, grammar.Sin, varA, grammar.Multiplication, grammar.Multiplication, grammar.Addition });
    130135
    131       int hash1 = grammar.Hasher.CalcHashCode(s1);
    132       int hash2 = grammar.Hasher.CalcHashCode(s2);
     136      int hash1 = ComputeHash(grammar, s1);
     137      int hash2 = ComputeHash(grammar, s2);
     138
     139      Console.WriteLine(s1);
     140      Console.WriteLine(PrintTree(s1));
     141      Console.WriteLine(grammar.Simplify(null, s1));
     142      Console.WriteLine(hash1);
     143      Console.WriteLine();
     144      Console.WriteLine(s2);
     145      Console.WriteLine(PrintTree(s2));
     146      Console.WriteLine(grammar.Simplify(null, s2));
     147      Console.WriteLine(hash2);
     148
    133149
    134150      Assert.AreEqual(hash1, hash2);
     
    142158      SymbolList s2 = new SymbolList(new Symbol[] { c, varA, grammar.Multiplication, c, varA, grammar.Multiplication, grammar.Addition, c, grammar.Addition });
    143159
    144       int hash1 = grammar.Hasher.CalcHashCode(s1);
    145       int hash2 = grammar.Hasher.CalcHashCode(s2);
     160      int hash1 = ComputeHash(grammar, s1);
     161      int hash2 = ComputeHash(grammar, s2);
    146162
    147163      Assert.AreEqual(hash1, hash2);
     
    153169      //const int nvars = 1;
    154170      //var variables = Enumerable.Range(1, nvars).Select(x => $"x{x}").ToArray();
     171      var variables = new[] { "a", "b" };
     172      var grammar = new Grammar(variables, Enum.GetValues(typeof(GrammarRule)).Cast<GrammarRule>());
     173
     174      Func<SymbolList, int> hash = s => grammar.Hasher.CalcHashCode(s);
     175      int length = 100;
     176      int depth = 20;
     177
     178      //List<SymbolList> sentences = EnumerateGrammarBreadth(grammar, length: length, hashPhrases: false).ToList();
     179      //Console.WriteLine("Breadth: {0} generated, {1} distinct sentences", sentences.Count, sentences.GroupBy(hash).Count());
     180
     181      //var sentences = EnumerateGrammarBreadth(grammar, length: length, hashPhrases: true).ToList();
     182      //Console.WriteLine("Breadth (hashed): {0} generated, {1} distinct sentences", sentences.Count, sentences.GroupBy(hash).Count());
     183
     184      var sentences = EnumerateGrammarDepth(grammar, length: length, depth: depth, hashPhrases: false).ToList();
     185      Console.WriteLine("Depth: {0} generated, {1} distinct sentences", sentences.Count, sentences.GroupBy(hash).Count());
     186
     187      sentences = EnumerateGrammarDepth(grammar, length: length, depth: depth, hashPhrases: true).ToList();
     188      Console.WriteLine("Depth (hashed): {0} generated, {1} distinct sentences", sentences.Count, sentences.GroupBy(hash).Count());
     189    }
     190
     191    [TestMethod]
     192    [TestCategory("TreeHashing")]
     193    public void HashExtensionsTest() {
     194      var variables = new[] { "x", "y", "z" };
     195      var rules = Enum.GetValues(typeof(GrammarRule)).Cast<GrammarRule>();
     196      var grammar = new Grammar(variables, rules);
     197      var add = grammar.Addition;
     198      var mul = grammar.Multiplication;
     199      var exp = grammar.Exp;
     200      var log = grammar.Log;
     201      var inv = grammar.Inv;
     202      var sin = grammar.Sin;
     203
     204      var c = grammar.Const;
     205      var x = grammar.VarTerminals.Single(v => v.StringRepresentation == "x");
     206      var y = grammar.VarTerminals.Single(v => v.StringRepresentation == "y");
     207      var z = grammar.VarTerminals.Single(v => v.StringRepresentation == "z");
     208
     209
     210      var ha = SHA512.Create();
     211
     212      var sentences = new[] {
     213        new SymbolList(c, c, x, mul, c, add, inv, mul, c, add),
     214        new SymbolList(c, c, x, mul, c, add, log, mul, c, add),
     215        new SymbolList(x, x, add),
     216        new SymbolList(x, x, add, x, add),
     217        new SymbolList(x, x, add, y, add),
     218        new SymbolList(x, x, add, x, add, x, add),
     219        new SymbolList(x, x, add, y, add, x, add),
     220        new SymbolList(x, y, mul, x, y, mul, add),
     221        new SymbolList(x, y, mul, y, x, mul, add),
     222        new SymbolList(x, y, mul, z, y, mul, add),
     223        new SymbolList(x, x, add, y, mul),
     224        new SymbolList(x, x, add, y, mul, y, mul),
     225        new SymbolList(x, x, inv, mul),
     226        new SymbolList(x, inv, x, inv, mul, x, mul, x, mul),
     227        new SymbolList(c, x, x, x, x, x, x, mul, mul, mul, mul, mul, mul, c, x, mul, c, add, add),
     228        new SymbolList(c, x, x, x, x, x, x, mul, mul, mul, mul, mul, mul, c, x, mul, c, add, add),
     229        new SymbolList(c, x, mul, c, x, x, x, x, x, x, mul, mul, mul, mul, mul, mul, c, add, add),
     230        new SymbolList(c, x, x, x, x, mul, mul, mul, mul, c, x, mul, c, x, mul, c, add, add, add),
     231        new SymbolList(c, x, mul, c, x, x, x, x, mul, mul, mul, mul, c, x, mul, c, add, add, add)
     232    };
     233
     234      foreach (var sentence in sentences) {
     235        var simplified = grammar.Simplify(ha, sentence);
     236        Console.WriteLine($"{sentence} -> {simplified} {grammar.Hasher.CalcHashCode(sentence)} {grammar.ComputeHash(ha, sentence)}");
     237        Console.WriteLine();
     238      }
     239    }
     240
     241    string PrintTree(SymbolList s) {
     242      var t = grammar.ParseSymbolicExpressionTree(s);
     243      return HierarchicalFormatter.Format(t.Root.GetSubtree(0).GetSubtree(0));
     244    }
     245
     246    [TestMethod]
     247    [TestCategory("TreeHashing")]
     248    public void HashCollisionsTest() {
    155249      var variables = new[] { "x", "y", "z", "w" };
    156       var grammar = new Grammar(variables, Enum.GetValues(typeof(GrammarRule)).Cast<GrammarRule>());
    157 
    158       Func<SymbolList, int> hash = s => grammar.Hasher.CalcHashCode(s);
    159 
    160       List<SymbolList> sentences = EnumerateGrammarBreadth(grammar, length: 20, hashPhrases: false).ToList();
    161       Console.WriteLine("Breadth: {0} generated, {1} distinct sentences", sentences.Count, sentences.GroupBy(hash).Count());
    162 
    163       sentences = EnumerateGrammarBreadth(grammar, length: 20, hashPhrases: true).ToList();
    164       Console.WriteLine("Breadth (hashed): {0} generated, {1} distinct sentences", sentences.Count, sentences.GroupBy(hash).Count());
    165 
    166       sentences = EnumerateGrammarDepth(grammar, length: 20, hashPhrases: false).ToList();
    167       Console.WriteLine("Depth: {0} generated, {1} distinct sentences", sentences.Count, sentences.GroupBy(hash).Count());
    168 
    169       sentences = EnumerateGrammarDepth(grammar, length: 20, hashPhrases: true).ToList();
    170       Console.WriteLine("Depth (hashed): {0} generated, {1} distinct sentences", sentences.Count, sentences.GroupBy(hash).Count());
    171     }
    172 
     250      //var rules = Enum.GetValues(typeof(GrammarRule)).Cast<GrammarRule>().Except(new[] { GrammarRule.InverseTerm, GrammarRule.Exponentiation, GrammarRule.Logarithm, GrammarRule.Sine });
     251      var rules = Enum.GetValues(typeof(GrammarRule)).Cast<GrammarRule>();
     252      var grammar = new Grammar(variables, rules);
     253
     254      int maxLength = 20, maxDepth = int.MaxValue;
     255      var sentences = EnumerateGrammarDepth(grammar, length: maxLength, depth: maxDepth, hashPhrases: false).ToList();
     256      var count = sentences.Count;
     257
     258      var sw = new Stopwatch();
     259      sw.Start();
     260      var hashes = sentences.Select(grammar.Hasher.CalcHashCode).ToList();
     261      sw.Stop();
     262      Console.WriteLine($"Old hash: {sentences.Count} ({hashes.Distinct().Count()}) hashed in {sw.ElapsedMilliseconds / 1000.0} seconds ({1000d * sentences.Count / sw.ElapsedMilliseconds} hashes/s)");
     263
     264      var ha = SHA512.Create();
     265      sw.Restart();
     266      var hashes_new = sentences.Select(x => grammar.ComputeHash(ha, x)).ToList();
     267      sw.Stop();
     268      Console.WriteLine($"New hash: {sentences.Count} ({hashes_new.Distinct().Count()}) hashed in {sw.ElapsedMilliseconds / 1000.0} seconds ({1000.0 * sentences.Count / sw.ElapsedMilliseconds} hashes/s)");
     269
     270      var distinct = Enumerable.Range(0, count).GroupBy(x => hashes_new[x]).Select(g => g.OrderBy(x => x).First()).ToList();
     271      var collisions = distinct.ToLookup(x => hashes[x], x => Tuple.Create(hashes_new[x], sentences[x]));
     272
     273      foreach (var pair in collisions) {
     274        if (pair.Count() > 1) {
     275          Console.WriteLine(pair.Key);
     276          foreach (var t in pair) {
     277            Console.WriteLine($"\t{t}");
     278            Console.WriteLine($"\t{grammar.ToInfixString(t.Item2)}");
     279            var simplified = grammar.Simplify(ha, t.Item2);
     280            Console.WriteLine($"\t{simplified}");
     281            Console.Write($"\t");
     282            PrintTree(t.Item2);
     283            Console.WriteLine();
     284          }
     285        }
     286      }
     287    }
     288
     289    [TestMethod]
     290    [TestCategory("TreeHashing")]
     291    public void HashPerformance() {
     292      var nvars = 4;
     293      var variables = Enumerable.Range(0, nvars).Select(x => $"X{x}").ToArray();
     294      //var rules = Enum.GetValues(typeof(GrammarRule)).Cast<GrammarRule>().Except(new[] { GrammarRule.InverseTerm, GrammarRule.Exponentiation, GrammarRule.Logarithm, GrammarRule.Sine });
     295      var rules = Enum.GetValues(typeof(GrammarRule)).Cast<GrammarRule>();
     296      var grammar = new Grammar(variables, rules);
     297
     298      int maxLength = 20, maxDepth = int.MaxValue;
     299      var sentences = EnumerateGrammarDepth(grammar, length: maxLength, depth: maxDepth, hashPhrases: false).ToList();
     300      var count = sentences.Count;
     301
     302      var ha = SHA512.Create();
     303
     304      var sw = new Stopwatch();
     305      sw.Start();
     306      var hashes = sentences.Select(x => grammar.ComputeHash(ha, x)).ToList();
     307      sw.Stop();
     308
     309      Console.WriteLine($"New: {sentences.Count} ({hashes.Distinct().Count()}) hashed in {sw.ElapsedMilliseconds / 1000.0} seconds ({1000d * sentences.Count / sw.ElapsedMilliseconds} hashes/s)");
     310
     311      sw.Restart();
     312      var hashes_old = sentences.Select(x => grammar.Hasher.CalcHashCode(x)).ToList();
     313      sw.Stop();
     314
     315      Console.WriteLine($"Old: {sentences.Count} ({hashes_old.Distinct().Count()}) hashed in {sw.ElapsedMilliseconds / 1000.0} seconds ({1000d * sentences.Count / sw.ElapsedMilliseconds} hashes/s)");
     316    }
     317
     318    #region enumerate the grammar
    173319    private static IEnumerable<SymbolList> EnumerateGrammarBreadth(Grammar grammar, int length, bool hashPhrases = true) {
    174320      var phrases = new Queue<SymbolList>();
     
    201347    }
    202348
    203     private static IEnumerable<SymbolList> EnumerateGrammarDepth(Grammar grammar, int length, bool hashPhrases = true) {
    204       return Expand(new SymbolList(grammar.StartSymbol), grammar, length, hashPhrases ? new HashSet<int>() : null);
    205     }
    206 
    207     private static IEnumerable<SymbolList> Expand(SymbolList phrase, Grammar grammar, int maxLength, HashSet<int> visited) {
    208       if (phrase.Count > maxLength) {
     349    private static IEnumerable<SymbolList> EnumerateGrammarDepth(Grammar grammar, int length, int depth, bool hashPhrases = true) {
     350      return Expand(new SymbolList(grammar.StartSymbol), grammar, length, 0, depth, hashPhrases ? new HashSet<int>() : null);
     351    }
     352
     353    private static IEnumerable<SymbolList> Expand(SymbolList phrase, Grammar grammar, int maxLength, int depth, int maxDepth, HashSet<int> visited) {
     354      if (maxDepth < depth || maxLength < phrase.Count) {
    209355        yield break;
    210356      }
     
    215361      }
    216362
    217       if (visited != null && !visited.Add(grammar.Hasher.CalcHashCode(phrase))) {
     363      if (visited != null && !visited.Add(HashExtensions.ComputeHash(grammar, null, phrase))) {
    218364        yield break;
    219365      }
     
    222368      var productions = grammar.Productions[phrase[i]];
    223369
    224       foreach (var s in productions.SelectMany(p => Expand(phrase.DerivePhrase(i, p), grammar, maxLength, visited)))
     370      foreach (var s in productions.SelectMany(p => Expand(phrase.DerivePhrase(i, p), grammar, maxLength, depth + 1, maxDepth, visited)))
    225371        yield return s;
    226372    }
     373    #endregion
    227374  }
    228375}
Note: See TracChangeset for help on using the changeset viewer.