using System; using System.Collections.Concurrent; using System.Collections.Generic; using System.Collections.Specialized; using System.Diagnostics; using System.Linq; using System.Net; using System.Security; using System.Security.AccessControl; using System.Text; using HeuristicLab.Common; using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding; namespace HeuristicLab.Problems.GrammaticalOptimization { public class SymbolicRegressionPoly10Problem : ISymbolicExpressionTreeProblem { // private const string grammarString = @" // G(E): // E -> V | V+E | V-E | V*E | (E) // V -> a .. j // "; //private const string grammarString = @" //G(E): //E -> a | b | c | d | e | f | g | h | i | j | a+E | b+E | c+E | d+E | e+E | f+E | g+E | h+E | i+E | j+E | a*E | b*E | c*E | d*E | e*E | f*E | g*E | h*E | i*E | j*E //"; private const string grammarString = @" G(E): E -> a | b | c | d | e | f | g | h | i | j | a+E | b+E | c+E | d+E | e+E | f+E | g+E | h+E | i+E | j+E | a*E | b*E | c*E | d*E | e*E | f*E | g*E | h*E | i*E | j*E "; // for tree-based GP in HL we need a different grammar for the same language // E = expr, S = sum, P = product private const string hlGrammarString = @" G(E): E -> S | P | a | b | c | d | e | f | g | h | i | j S -> EE | EEE P -> EE | EEE "; // mininmal tree for the optimal expr (40 nodes) // E S // E S // E P // E a E b // E P // E c E d // E P // E e E f // E S // E P // E a E g E i // E P // E c E f E j public IGrammar TreeBasedGPGrammar { get; private set; } private readonly IGrammar grammar; private readonly int N; private readonly double[][] x; private readonly double[] y; public string Name { get { return "SymbolicRegressionPoly10"; } } public SymbolicRegressionPoly10Problem() { this.grammar = new Grammar(grammarString); this.TreeBasedGPGrammar = new Grammar(hlGrammarString); this.N = 500; this.x = new double[N][]; this.y = new double[N]; GenerateData(); } private void GenerateData() { // generate data with fixed seed to make sure that data is always the same var rand = new System.Random(31415); for (int i = 0; i < N; i++) { x[i] = new double[10]; for (int j = 0; j < 10; j++) { x[i][j] = rand.NextDouble() * 2 - 1; } // poly-10 no noise /* a*b + c*d + e*f + a*g*i + c*f*j */ y[i] = x[i][0] * x[i][1] + x[i][2] * x[i][3] + x[i][4] * x[i][5] + x[i][0] * x[i][6] * x[i][8] + x[i][2] * x[i][5] * x[i][9]; } } public double BestKnownQuality(int maxLen) { // for now only an upper bound is returned, ideally we have an R² of 1.0 // the optimal R² can only be reached for sentences of at least 23 symbols return 1.0; } public IGrammar Grammar { get { return grammar; } } public double Evaluate(string sentence) { var interpreter = new ExpressionInterpreter(); return HeuristicLab.Common.Extensions.RSq(y, Enumerable.Range(0, N).Select(i => interpreter.Interpret(sentence, x[i])).ToArray()); } // most-recently-used caching (with limited capacity) for canonical representations MostRecentlyUsedCache canonicalPhraseCache = new MostRecentlyUsedCache(100000); // right now only + and * is supported public string CanonicalRepresentation(string phrase) { string canonicalPhrase; if (!canonicalPhraseCache.TryGetValue(phrase, out canonicalPhrase)) { var terms = phrase.Split('+'); var canonicalTerms = new SortedSet(); // only the last term might contain a NT-symbol. make sure this term is added at the end for (int i = 0; i < terms.Length - 1; i++) { canonicalTerms.Add(CanonicalTerm(terms[i])); } var sb = new StringBuilder(phrase.Length); foreach (var t in canonicalTerms) sb.Append(t).Append('+'); sb.Append(CanonicalTerm(terms[terms.Length - 1])); canonicalPhrase = sb.ToString(); canonicalPhraseCache.Add(phrase, canonicalPhrase); } return canonicalPhrase; } // cache the canonical form of terms for performance reasons private Dictionary canonicalTermDictionary = new Dictionary(); private string CanonicalTerm(string term) { string canonicalTerm; if (!canonicalTermDictionary.TryGetValue(term, out canonicalTerm)) { // add var chars = term.ToCharArray(); Array.Sort(chars); var sb = new StringBuilder(chars.Length); // we want to have the up-case characters last for (int i = chars.Length - 1; i > 0; i--) { if (chars[i] != '*') { sb.Append(chars[i]); if (chars[i - 1] != '*') sb.Append('*'); } } if (chars[0] != '*') sb.Append(chars[0]); // last term canonicalTerm = sb.ToString(); canonicalTermDictionary.Add(term, canonicalTerm); } return canonicalTerm; } private double[] varIds = new double[] { }; // splits the phrase into terms and creates (sparse) term-occurrance features public IEnumerable GetFeatures(string phrase) { //if (phrase.EndsWith("E")) phrase = phrase.TrimEnd('*', '+', 'E'); //yield return new Feature("$$$", 1.0); // const //var canonicalTerms = new HashSet(); //foreach (string t in phrase.Split('+')) { // canonicalTerms.Add(CanonicalTerm(t)); //} //return canonicalTerms.Select(entry => new Feature(entry, 1.0)); //.Concat(new Feature[] { new Feature(CanonicalRepresentation(phrase), 1.0) }); if (phrase.EndsWith("E")) phrase = phrase.TrimEnd('*', '+', 'E'); //var len = 5; //var start = Math.Max(0, phrase.Length - len); //var end = Math.Min(phrase.Length, start + len); //string f = phrase.Substring(start, end - start); //yield return new Feature(f, 1.0); // var terms = phrase.Split('+'); foreach (var t in terms.Distinct()) yield return new Feature(t, 1.0); for (int i = 0; i < terms.Length; i++) { for (int j = i + 1; j < terms.Length; j++) { yield return new Feature(terms[i] + " " + terms[j], 1.0); } } // var substrings = new HashSet(); // for (int start = 0; start <= phrase.Length - 2; start += 2) { // var s = phrase.Substring(start, 3); // substrings.Add(s); // } // // var list = new List(substrings); // // for (int i = 0; i < list.Count; i++) { // yield return new Feature(list[i], 1.0); // //for (int j = i+1; j < list.Count; j++) { // // yield return new Feature(list[i] + " " + list[j], 1.0); // //} // } // // for (int len = 1; len <= phrase.Length; len += 2) { // var start = Math.Max(0, phrase.Length - len); // var end = Math.Min(phrase.Length, start + len); // string f = phrase.Substring(start, end - start); // yield return new Feature(f, 1.0); // // } //var partialInterpreter = new PartialExpressionInterpreter(); //var vars = new double[] { 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, }; //var s = partialInterpreter.Interpret(phrase, vars); ////if (s.Any()) //// return new Feature[] { new Feature(s.Pop().ToString(), 1.0), }; ////else //// return new Feature[] { new Feature("$", 1.0), }; //return s.Select(f => new Feature(f.ToString(), 1.0)); } public string ConvertTreeToSentence(ISymbolicExpressionTree tree) { var sb = new StringBuilder(); TreeToSentence(tree.Root.GetSubtree(0).GetSubtree(0), sb); return sb.ToString(); } private void TreeToSentence(ISymbolicExpressionTreeNode treeNode, StringBuilder sb) { if (treeNode.SubtreeCount == 0) { // terminal sb.Append(treeNode.Symbol.Name); } else { string op = string.Empty; switch (treeNode.Symbol.Name) { case "S": op = "+"; break; case "P": op = "*"; break; default: { Debug.Assert(treeNode.SubtreeCount == 1); break; } } // nonterminal if (op == "+") sb.Append("("); TreeToSentence(treeNode.Subtrees.First(), sb); foreach (var subTree in treeNode.Subtrees.Skip(1)) { sb.Append(op); TreeToSentence(subTree, sb); } if (op == "+") sb.Append(")"); } } } }