Changeset 17503 for branches/2968_infix_string_formatter/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Hashing
- Timestamp:
- 04/10/20 13:10:26 (5 years ago)
- Location:
- branches/2968_infix_string_formatter/HeuristicLab.Problems.DataAnalysis.Symbolic
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/2968_infix_string_formatter/HeuristicLab.Problems.DataAnalysis.Symbolic
- Property svn:mergeinfo changed
-
branches/2968_infix_string_formatter/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Hashing/HashExtensions.cs
r16305 r17503 1 1 #region License Information 2 2 /* HeuristicLab 3 * Copyright (C) 2002-2018Heuristic and Evolutionary Algorithms Laboratory (HEAL)3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL) 4 4 * 5 5 * This file is part of HeuristicLab. … … 21 21 22 22 using System; 23 using System. Collections.Generic;23 using System.Linq; 24 24 25 25 namespace HeuristicLab.Problems.DataAnalysis.Symbolic { 26 26 public static class SymbolicExpressionHashExtensions { 27 /// <summary> 28 /// Holds data that is necessary to handle tree nodes in hashing / simplification. 29 /// </summary> 30 /// <typeparam name="T">The tree node type</typeparam> 27 31 public sealed class HashNode<T> : IComparable<HashNode<T>>, IEquatable<HashNode<T>> where T : class { 28 32 public T Data; … … 38 42 public SimplifyAction Simplify; 39 43 40 public IComparer<T> Comparer;41 42 44 public bool IsLeaf => Arity == 0; 43 45 44 public HashNode(IComparer<T> comparer) {45 Comparer = comparer;46 }47 48 private HashNode() { }49 50 46 public int CompareTo(HashNode<T> other) { 51 var res = Comparer.Compare(Data, other.Data);47 var res = HashValue.CompareTo(other.HashValue); 52 48 return res == 0 ? CalculatedHashValue.CompareTo(other.CalculatedHashValue) : res; 53 49 } … … 103 99 104 100 public static HashNode<T>[] Simplify<T>(this HashNode<T>[] nodes, Func<byte[], ulong> hashFunction) where T : class { 105 var reduced = nodes.UpdateNodeSizes().Reduce().Sort(hashFunction); 106 107 for (int i = 0; i < reduced.Length; ++i) { 108 var node = reduced[i]; 109 if (node.IsLeaf) { 110 continue; 111 } 112 node.Simplify?.Invoke(ref reduced, i); 113 } 114 // detect if anything was simplified 115 var count = 0; 116 foreach (var node in reduced) { 117 if (!node.Enabled) { ++count; } 118 } 119 if (count == 0) { 120 return reduced; 121 } 122 123 var simplified = new HashNode<T>[reduced.Length - count]; 124 int j = 0; 125 foreach (var node in reduced) { 126 if (node.Enabled) { 127 simplified[j++] = node; 128 } 129 } 130 return simplified.UpdateNodeSizes().Reduce().Sort(hashFunction); 101 bool simplified = false; 102 nodes = nodes.UpdateNodeSizes().Reduce().Sort(hashFunction); 103 do { 104 if (simplified) { 105 simplified = false; 106 nodes = nodes.Where(x => x.Enabled).ToArray().UpdateNodeSizes().Reduce().Sort(hashFunction); 107 } 108 109 for (int i = 0; i < nodes.Length; ++i) { 110 var node = nodes[i]; 111 if (node.IsLeaf) { 112 continue; 113 } 114 node.Simplify?.Invoke(ref nodes, i); 115 for (int j = i - node.Size; j <= i; ++j) { 116 // detect if anything was simplified 117 if (!nodes[j].Enabled) { 118 simplified = true; 119 break; 120 } 121 } 122 } 123 } while (simplified); 124 return nodes.UpdateNodeSizes().Sort(hashFunction); 131 125 } 132 126 … … 173 167 174 168 /// <summary> 175 /// Get a function node's child indices t169 /// Get a function node's child indices 176 170 /// </summary> 177 171 /// <typeparam name="T">The data type encapsulated by a hash node</typeparam> 178 /// <param name="nodes">An array of hash nodes with up-to-date node sizes </param>172 /// <param name="nodes">An array of hash nodes with up-to-date node sizes (see UpdateNodeSizes)</param> 179 173 /// <param name="i">The index in the array of hash nodes of the node whose children we want to iterate</param> 180 174 /// <returns>An array containing child indices</returns> … … 191 185 } 192 186 187 /// <summary> 188 /// Determines size of each branch and sets the results for each node. 189 /// </summary> 190 /// <typeparam name="T">The data type encapsulated by a hash node</typeparam> 191 /// <param name="nodes">An array of hash nodes in postfix order.</param> 192 /// <returns>The array with updated node sizes. The array is not copied.</returns> 193 193 public static HashNode<T>[] UpdateNodeSizes<T>(this HashNode<T>[] nodes) where T : class { 194 194 for (int i = 0; i < nodes.Length; ++i) { … … 199 199 } 200 200 node.Size = node.Arity; 201 201 // visit all children and sum up their size (assumes postfix order). 202 202 for (int j = i - 1, k = 0; k < node.Arity; j -= 1 + nodes[j].Size, ++k) { 203 203 node.Size += nodes[j].Size; … … 207 207 } 208 208 209 private static HashNode<T>[] Reduce<T>(this HashNode<T>[] nodes) where T : class { 209 // disables duplicate branches and removes the disabled nodes 210 public static HashNode<T>[] Reduce<T>(this HashNode<T>[] nodes) where T : class { 210 211 int count = 0; 211 212 for (int i = 0; i < nodes.Length; ++i) { -
branches/2968_infix_string_formatter/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Hashing/HashUtil.cs
r16272 r17503 1 1 #region License Information 2 2 /* HeuristicLab 3 * Copyright (C) 2002-2018Heuristic and Evolutionary Algorithms Laboratory (HEAL)3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL) 4 4 * 5 5 * This file is part of HeuristicLab. -
branches/2968_infix_string_formatter/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Hashing/SymbolicExpressionTreeHash.cs
r16305 r17503 1 1 #region License Information 2 2 /* HeuristicLab 3 * Copyright (C) 2002-2018Heuristic and Evolutionary Algorithms Laboratory (HEAL)3 * Copyright (C) Heuristic and Evolutionary Algorithms Laboratory (HEAL) 4 4 * 5 5 * This file is part of HeuristicLab. … … 21 21 22 22 using System; 23 using System.Collections.Generic; 23 24 using System.Linq; 24 25 using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding; … … 37 38 private static readonly Constant constant = new Constant(); 38 39 39 private static readonly ISymbolicExpressionTreeNodeComparer comparer = new SymbolicExpressionTreeNodeComparer(); 40 41 public static ulong ComputeHash(this ISymbolicExpressionTree tree) { 42 return ComputeHash(tree.Root.GetSubtree(0).GetSubtree(0)); 43 } 44 45 public static double ComputeSimilarity(ISymbolicExpressionTree t1, ISymbolicExpressionTree t2, bool simplify = false) { 46 return ComputeSimilarity(t1.Root.GetSubtree(0).GetSubtree(0), t2.Root.GetSubtree(0).GetSubtree(0), simplify); 47 } 48 49 public static double ComputeSimilarity(ISymbolicExpressionTreeNode t1, ISymbolicExpressionTreeNode t2, bool simplify = false) { 50 HashNode<ISymbolicExpressionTreeNode>[] lhs; 51 HashNode<ISymbolicExpressionTreeNode>[] rhs; 52 53 ulong hashFunction(byte[] input) => HashUtil.DJBHash(input); 54 55 if (simplify) { 56 lhs = t1.MakeNodes().Simplify(hashFunction); 57 rhs = t2.MakeNodes().Simplify(hashFunction); 58 } else { 59 lhs = t1.MakeNodes().Sort(hashFunction); // sort calculates hash values 60 rhs = t2.MakeNodes().Sort(hashFunction); 61 } 62 63 var lh = lhs.Select(x => x.CalculatedHashValue).ToArray(); 64 var rh = rhs.Select(x => x.CalculatedHashValue).ToArray(); 65 66 Array.Sort(lh); 67 Array.Sort(rh); 68 69 return ComputeSimilarity(lh, rh); 70 } 71 72 // this will only work if lh and rh are sorted 73 private static double ComputeSimilarity(ulong[] lh, ulong[] rh) { 74 double count = 0; 75 for (int i = 0, j = 0; i < lh.Length && j < rh.Length;) { 76 var h1 = lh[i]; 77 var h2 = rh[j]; 78 if (h1 == h2) { 79 ++count; 80 ++i; 81 ++j; 82 } else if (h1 < h2) { 83 ++i; 84 } else if (h1 > h2) { 85 ++j; 86 } 87 } 88 89 return 2d * count / (lh.Length + rh.Length); 90 } 91 92 public static double ComputeAverageSimilarity(ISymbolicExpressionTree[] trees, bool simplify = false, bool strict = false) { 93 var total = (double)trees.Length * (trees.Length - 1) / 2; 94 double avg = 0; 95 var hashes = new ulong[trees.Length][]; 96 // build hash arrays 97 for (int i = 0; i < trees.Length; ++i) { 98 var nodes = trees[i].MakeNodes(strict); 99 hashes[i] = (simplify ? nodes.Simplify(HashUtil.DJBHash) : nodes.Sort(HashUtil.DJBHash)).Select(x => x.CalculatedHashValue).ToArray(); 100 Array.Sort(hashes[i]); 101 } 102 // compute similarity matrix 103 for (int i = 0; i < trees.Length - 1; ++i) { 104 for (int j = i + 1; j < trees.Length; ++j) { 105 avg += ComputeSimilarity(hashes[i], hashes[j]); 106 } 107 } 108 return avg / total; 109 } 110 111 public static double[,] ComputeSimilarityMatrix(ISymbolicExpressionTree[] trees, bool simplify = false, bool strict = false) { 112 var sim = new double[trees.Length, trees.Length]; 113 var hashes = new ulong[trees.Length][]; 114 // build hash arrays 115 for (int i = 0; i < trees.Length; ++i) { 116 var nodes = trees[i].MakeNodes(strict); 117 hashes[i] = (simplify ? nodes.Simplify(HashUtil.DJBHash) : nodes.Sort(HashUtil.DJBHash)).Select(x => x.CalculatedHashValue).ToArray(); 118 Array.Sort(hashes[i]); 119 } 120 // compute similarity matrix 121 for (int i = 0; i < trees.Length - 1; ++i) { 122 for (int j = i + 1; j < trees.Length; ++j) { 123 sim[i, j] = sim[j, i] = ComputeSimilarity(hashes[i], hashes[j]); 124 } 125 } 126 return sim; 127 } 128 129 public static ulong ComputeHash(this ISymbolicExpressionTreeNode treeNode, bool strict = false) { 130 ulong hashFunction(byte[] input) => HashUtil.JSHash(input); 131 var hashNodes = treeNode.MakeNodes(strict); 132 var simplified = hashNodes.Simplify(hashFunction); 133 return simplified.Last().CalculatedHashValue; 40 private static ISymbolicExpressionTreeNode ActualRoot(this ISymbolicExpressionTree tree) => tree.Root.GetSubtree(0).GetSubtree(0); 41 public static ulong HashFunction(byte[] input) => HashUtil.DJBHash(input); 42 43 #region tree hashing 44 public static ulong[] Hash(this ISymbolicExpressionTree tree, bool simplify = false, bool strict = false) { 45 return tree.ActualRoot().Hash(simplify, strict); 46 } 47 48 public static ulong[] Hash(this ISymbolicExpressionTreeNode node, bool simplify = false, bool strict = false) { 49 var hashNodes = simplify ? node.MakeNodes(strict).Simplify(HashFunction) : node.MakeNodes(strict).Sort(HashFunction); 50 var hashes = new ulong[hashNodes.Length]; 51 for (int i = 0; i < hashes.Length; ++i) { 52 hashes[i] = hashNodes[i].CalculatedHashValue; 53 } 54 return hashes; 55 } 56 57 public static ulong ComputeHash(this ISymbolicExpressionTree tree, bool simplify = false, bool strict = false) { 58 return ComputeHash(tree.ActualRoot(), simplify, strict); 59 } 60 61 public static ulong ComputeHash(this ISymbolicExpressionTreeNode treeNode, bool simplify = false, bool strict = false) { 62 return treeNode.Hash(simplify, strict).Last(); 134 63 } 135 64 … … 143 72 } 144 73 var hash = (ulong)name.GetHashCode(); 145 var hashNode = new HashNode<ISymbolicExpressionTreeNode> (comparer){74 var hashNode = new HashNode<ISymbolicExpressionTreeNode> { 146 75 Data = node, 147 76 Arity = node.SubtreeCount, … … 167 96 168 97 public static HashNode<ISymbolicExpressionTreeNode>[] MakeNodes(this ISymbolicExpressionTree tree, bool strict = false) { 169 return MakeNodes(tree. Root.GetSubtree(0).GetSubtree(0), strict);98 return MakeNodes(tree.ActualRoot(), strict); 170 99 } 171 100 … … 173 102 return node.IterateNodesPostfix().Select(x => x.ToHashNode(strict)).ToArray().UpdateNodeSizes(); 174 103 } 104 #endregion 105 106 #region tree similarity 107 public static double ComputeSimilarity(ISymbolicExpressionTree t1, ISymbolicExpressionTree t2, bool simplify = false, bool strict = false) { 108 return ComputeSimilarity(t1.ActualRoot(), t2.ActualRoot(), simplify, strict); 109 } 110 111 public static double ComputeSimilarity(ISymbolicExpressionTreeNode t1, ISymbolicExpressionTreeNode t2, bool simplify = false, bool strict = false) { 112 var lh = t1.Hash(simplify, strict); 113 var rh = t2.Hash(simplify, strict); 114 115 Array.Sort(lh); 116 Array.Sort(rh); 117 118 return ComputeSimilarity(lh, rh); 119 } 120 121 // requires lhs and rhs to be sorted 122 public static int IntersectCount(this ulong[] lh, ulong[] rh) { 123 int count = 0; 124 for (int i = 0, j = 0; i < lh.Length && j < rh.Length;) { 125 var h1 = lh[i]; 126 var h2 = rh[j]; 127 if (h1 == h2) { 128 ++count; 129 ++i; 130 ++j; 131 } else if (h1 < h2) { 132 ++i; 133 } else if (h1 > h2) { 134 ++j; 135 } 136 } 137 return count; 138 } 139 140 public static IEnumerable<ulong> Intersect(this ulong[] lh, ulong[] rh) { 141 for (int i = 0, j = 0; i < lh.Length && j < rh.Length;) { 142 var h1 = lh[i]; 143 var h2 = rh[j]; 144 if (h1 == h2) { 145 yield return h1; 146 ++i; 147 ++j; 148 } else if (h1 < h2) { 149 ++i; 150 } else if (h1 > h2) { 151 ++j; 152 } 153 } 154 } 155 156 // this will only work if lh and rh are sorted 157 public static double ComputeSimilarity(ulong[] lh, ulong[] rh) { 158 return 2d * IntersectCount(lh, rh) / (lh.Length + rh.Length); 159 } 160 161 public static double ComputeAverageSimilarity(IList<ISymbolicExpressionTree> trees, bool simplify = false, bool strict = false) { 162 var total = trees.Count * (trees.Count - 1) / 2; 163 double avg = 0; 164 var hashes = new ulong[trees.Count][]; 165 // build hash arrays 166 for (int i = 0; i < trees.Count; ++i) { 167 var nodes = trees[i].MakeNodes(strict); 168 hashes[i] = (simplify ? nodes.Simplify(HashFunction) : nodes.Sort(HashFunction)).Select(x => x.CalculatedHashValue).ToArray(); 169 Array.Sort(hashes[i]); 170 } 171 // compute similarity matrix 172 for (int i = 0; i < trees.Count - 1; ++i) { 173 for (int j = i + 1; j < trees.Count; ++j) { 174 avg += ComputeSimilarity(hashes[i], hashes[j]); 175 } 176 } 177 return avg / total; 178 } 179 180 public static double[,] ComputeSimilarityMatrix(IList<ulong[]> hashes) { 181 // compute similarity matrix 182 var n = hashes.Count; 183 var sim = new double[n, n]; 184 for (int i = 0; i < n - 1; ++i) { 185 for (int j = i + 1; j < n; ++j) { 186 sim[i, j] = sim[j, i] = ComputeSimilarity(hashes[i], hashes[j]); 187 } 188 } 189 return sim; 190 } 191 192 public static double[,] ComputeSimilarityMatrix(IList<ISymbolicExpressionTree> trees, bool simplify = false, bool strict = false) { 193 var hashes = new ulong[trees.Count][]; 194 // build hash arrays 195 for (int i = 0; i < trees.Count; ++i) { 196 var nodes = trees[i].MakeNodes(strict); 197 hashes[i] = (simplify ? nodes.Simplify(HashFunction) : nodes.Sort(HashFunction)).Select(x => x.CalculatedHashValue).ToArray(); 198 Array.Sort(hashes[i]); 199 } 200 return ComputeSimilarityMatrix(hashes); 201 } 202 #endregion 175 203 176 204 #region parse a nodes array back into a tree … … 218 246 #region tree simplification 219 247 // these simplification methods rely on the assumption that child nodes of the current node have already been simplified 220 // (in other words simplification should be applied in a bottom-up fashion) 248 // (in other words simplification should be applied in a bottom-up fashion) 221 249 public static ISymbolicExpressionTree Simplify(ISymbolicExpressionTree tree) { 222 ulong hashFunction(byte[] bytes) => HashUtil.JSHash(bytes); 223 var root = tree.Root.GetSubtree(0).GetSubtree(0); 224 var nodes = root.MakeNodes(); 225 var simplified = nodes.Simplify(hashFunction); 226 return simplified.ToTree(); 250 return tree.MakeNodes().Simplify(HashFunction).ToTree(); 227 251 } 228 252 … … 248 272 } 249 273 250 // simplify multiplications by reducing constants and div terms 274 // simplify multiplications by reducing constants and div terms 251 275 public static void SimplifyMultiplication(ref HashNode<ISymbolicExpressionTreeNode>[] nodes, int i) { 252 276 var node = nodes[i]; … … 261 285 262 286 var symbol = child.Data.Symbol; 263 if (symbol is Constant) { 287 if (child.Data is ConstantTreeNode firstConst) { 288 // fold sibling constant nodes into the first constant 264 289 for (int k = j + 1; k < children.Length; ++k) { 265 var d = children[k];266 if ( nodes[d].Data.Symbol is Constant) {267 nodes[d].Enabled = false;290 var sibling = nodes[children[k]]; 291 if (sibling.Data is ConstantTreeNode otherConst) { 292 sibling.Enabled = false; 268 293 node.Arity--; 294 firstConst.Value *= otherConst.Value; 295 } else { 296 break; 297 } 298 } 299 } else if (child.Data is VariableTreeNode variable) { 300 // fold sibling constant nodes into the variable weight 301 for (int k = j + 1; k < children.Length; ++k) { 302 var sibling = nodes[children[k]]; 303 if (sibling.Data is ConstantTreeNode constantNode) { 304 sibling.Enabled = false; 305 node.Arity--; 306 variable.Weight *= constantNode.Value; 269 307 } else { 270 308 break; … … 272 310 } 273 311 } else if (symbol is Division) { 274 var div = nodes[c]; 275 var denominator = 276 div.Arity == 1 ? 277 nodes[c - 1] : // 1 / x is expressed as div(x) (with a single child) 278 nodes[c - nodes[c - 1].Size - 2]; // assume division always has arity 1 or 2 279 280 foreach (var d in children) { 281 if (nodes[d].Enabled && nodes[d] == denominator) { 282 nodes[c].Enabled = nodes[d].Enabled = denominator.Enabled = false; 312 // 1/x is expressed as div(x) (with a single child) 313 // we assume division always has arity 1 or 2 314 var d = child.Arity == 1 ? c - 1 : c - nodes[c - 1].Size - 2; 315 var denominator = nodes[d]; 316 317 // iterate children of node i to see if any of them matches the denominator of div node c 318 for (int k = 0; k < children.Length; ++k) { 319 var sibling = nodes[children[k]]; 320 if (sibling.Enabled && sibling == denominator) { 321 nodes.SetEnabled(children[j], false); // disable the div subtree 322 nodes.SetEnabled(children[k], false); // disable the sibling matching the denominator 323 283 324 node.Arity -= 2; // matching child + division node 284 325 break;
Note: See TracChangeset
for help on using the changeset viewer.