Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Problems.GrammaticalOptimization/Problems/SymbolicRegressionPoly10Problem.cs @ 12533

Last change on this file since 12533 was 12391, checked in by gkronber, 10 years ago

#2283: added shuffling of terminal symbols to the royal pair problem to make sure that there is no bias from order of terminal symbols.

File size: 8.9 KB
Line 
1using System;
2using System.Collections.Concurrent;
3using System.Collections.Generic;
4using System.Collections.Specialized;
5using System.Diagnostics;
6using System.Linq;
7using System.Net;
8using System.Security;
9using System.Security.AccessControl;
10using System.Text;
11using HeuristicLab.Common;
12using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
13
14namespace HeuristicLab.Problems.GrammaticalOptimization {
15  public class SymbolicRegressionPoly10Problem : ISymbolicExpressionTreeProblem {
16    //    private const string grammarString = @"
17    //    G(E):
18    //    E -> V | V+E | V-E | V*E | (E)
19    //    V -> a .. j
20    //    ";
21    //private const string grammarString = @"
22    //G(E):
23    //E -> a | b | c | d | e | f | g | h | i | j | a+E | b+E | c+E | d+E | e+E | f+E | g+E | h+E | i+E | j+E | a*E | b*E | c*E | d*E | e*E | f*E | g*E | h*E | i*E | j*E
24    //";
25    private const string grammarString = @"
26    G(E):
27    E -> a | b | c | d | e | f | g | h | i | j | a+E | b+E | c+E | d+E | e+E | f+E | g+E | h+E | i+E | j+E | a*E | b*E | c*E | d*E | e*E | f*E | g*E | h*E | i*E | j*E 
28    ";
29
30    // for tree-based GP in HL we need a different grammar for the same language
31    // E = expr, S = sum, P = product
32    private const string hlGrammarString = @"
33    G(E):
34    E -> S | P | a | b | c | d | e | f | g | h | i | j
35    S -> EE | EEE
36    P -> EE | EEE
37    ";
38    // mininmal tree for the optimal expr (40 nodes)
39    // E S
40    //     E S
41    //         E P
42    //           E a E b
43    //         E P
44    //           E c E d
45    //         E P
46    //           E e E f
47    //     E S
48    //         E P
49    //           E a E g E i
50    //         E P
51    //           E c E f E j
52
53    public IGrammar TreeBasedGPGrammar { get; private set; }
54
55    private readonly IGrammar grammar;
56
57    private readonly int N;
58    private readonly double[][] x;
59    private readonly double[] y;
60    public string Name { get { return "SymbolicRegressionPoly10"; } }
61
62    public SymbolicRegressionPoly10Problem() {
63      this.grammar = new Grammar(grammarString);
64      this.TreeBasedGPGrammar = new Grammar(hlGrammarString);
65
66      this.N = 500;
67      this.x = new double[N][];
68      this.y = new double[N];
69
70      GenerateData();
71    }
72
73    private void GenerateData() {
74      // generate data with fixed seed to make sure that data is always the same
75      var rand = new System.Random(31415);
76      for (int i = 0; i < N; i++) {
77        x[i] = new double[10];
78        for (int j = 0; j < 10; j++) {
79          x[i][j] = rand.NextDouble() * 2 - 1;
80        }
81        // poly-10 no noise
82        /* a*b + c*d + e*f + a*g*i + c*f*j */
83        y[i] = x[i][0] * x[i][1] +
84               x[i][2] * x[i][3] +
85               x[i][4] * x[i][5] +
86               x[i][0] * x[i][6] * x[i][8] +
87               x[i][2] * x[i][5] * x[i][9];
88      }
89    }
90
91    public double BestKnownQuality(int maxLen) {
92      // for now only an upper bound is returned, ideally we have an R² of 1.0
93      // the optimal R² can only be reached for sentences of at least 23 symbols
94      return 1.0;
95    }
96
97    public IGrammar Grammar {
98      get { return grammar; }
99    }
100
101    public double Evaluate(string sentence) {
102      var interpreter = new ExpressionInterpreter();
103      return HeuristicLab.Common.Extensions.RSq(y, Enumerable.Range(0, N).Select(i => interpreter.Interpret(sentence, x[i])).ToArray());
104    }
105
106
107    // most-recently-used caching (with limited capacity) for canonical representations
108    MostRecentlyUsedCache<string, string> canonicalPhraseCache = new MostRecentlyUsedCache<string, string>(100000);
109    // right now only + and * is supported
110    public string CanonicalRepresentation(string phrase) {
111      string canonicalPhrase;
112      if (!canonicalPhraseCache.TryGetValue(phrase, out canonicalPhrase)) {
113        var terms = phrase.Split('+');
114        var canonicalTerms = new SortedSet<string>();
115        // only the last term might contain a NT-symbol. make sure this term is added at the end
116        for (int i = 0; i < terms.Length - 1; i++) {
117          canonicalTerms.Add(CanonicalTerm(terms[i]));
118        }
119
120        var sb = new StringBuilder(phrase.Length);
121        foreach (var t in canonicalTerms)
122          sb.Append(t).Append('+');
123
124        sb.Append(CanonicalTerm(terms[terms.Length - 1]));
125        canonicalPhrase = sb.ToString();
126        canonicalPhraseCache.Add(phrase, canonicalPhrase);
127      }
128      return canonicalPhrase;
129    }
130
131    // cache the canonical form of terms for performance reasons
132    private Dictionary<string, string> canonicalTermDictionary = new Dictionary<string, string>();
133    private string CanonicalTerm(string term) {
134      string canonicalTerm;
135      if (!canonicalTermDictionary.TryGetValue(term, out canonicalTerm)) {
136        // add
137        var chars = term.ToCharArray();
138        Array.Sort(chars);
139        var sb = new StringBuilder(chars.Length);
140        // we want to have the up-case characters last
141        for (int i = chars.Length - 1; i > 0; i--) {
142          if (chars[i] != '*') {
143            sb.Append(chars[i]);
144            if (chars[i - 1] != '*') sb.Append('*');
145          }
146        }
147        if (chars[0] != '*') sb.Append(chars[0]); // last term
148        canonicalTerm = sb.ToString();
149        canonicalTermDictionary.Add(term, canonicalTerm);
150      }
151      return canonicalTerm;
152    }
153
154    private double[] varIds = new double[] { };
155
156    // splits the phrase into terms and creates (sparse) term-occurrance features
157    public IEnumerable<Feature> GetFeatures(string phrase) {
158      //if (phrase.EndsWith("E")) phrase = phrase.TrimEnd('*', '+', 'E');
159      //yield return new Feature("$$$", 1.0); // const
160      //var canonicalTerms = new HashSet<string>();
161      //foreach (string t in phrase.Split('+')) {
162      //  canonicalTerms.Add(CanonicalTerm(t));
163      //}
164      //return canonicalTerms.Select(entry => new Feature(entry, 1.0));
165      //.Concat(new Feature[] { new Feature(CanonicalRepresentation(phrase), 1.0) });
166
167
168      if (phrase.EndsWith("E")) phrase = phrase.TrimEnd('*', '+', 'E');
169      //var len = 5;
170      //var start = Math.Max(0, phrase.Length - len);
171      //var end = Math.Min(phrase.Length, start + len);
172      //string f = phrase.Substring(start, end - start);
173      //yield return new Feature(f, 1.0);
174      //
175
176      var terms = phrase.Split('+');
177      foreach (var t in terms.Distinct()) yield return new Feature(t, 1.0);
178
179      for (int i = 0; i < terms.Length; i++) {
180        for (int j = i + 1; j < terms.Length; j++) {
181          yield return new Feature(terms[i] + " " + terms[j], 1.0);
182        }
183      }
184
185      // var substrings = new HashSet<string>();
186      // for (int start = 0; start <= phrase.Length - 2; start += 2) {
187      //   var s = phrase.Substring(start, 3);
188      //   substrings.Add(s);
189      // }
190      //
191      // var list = new List<string>(substrings);
192      //
193      // for (int i = 0; i < list.Count; i++) {
194      //   yield return new Feature(list[i], 1.0);
195      //   //for (int j = i+1; j < list.Count; j++) {
196      //   //  yield return new Feature(list[i] + " " + list[j], 1.0);
197      //   //}
198      // }
199
200      //
201      // for (int len = 1; len <= phrase.Length; len += 2) {
202      //   var start = Math.Max(0, phrase.Length - len);
203      //   var end = Math.Min(phrase.Length, start + len);
204      //   string f = phrase.Substring(start, end - start);
205      //   yield return new Feature(f, 1.0);
206      //
207      // }
208
209      //var partialInterpreter = new PartialExpressionInterpreter();
210      //var vars = new double[] { 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, };
211      //var s = partialInterpreter.Interpret(phrase, vars);
212      ////if (s.Any())
213      ////  return new Feature[] { new Feature(s.Pop().ToString(), 1.0), };
214      ////else
215      ////  return new Feature[] { new Feature("$", 1.0), };
216      //return s.Select(f => new Feature(f.ToString(), 1.0));
217    }
218
219    public string ConvertTreeToSentence(ISymbolicExpressionTree tree) {
220      var sb = new StringBuilder();
221
222      TreeToSentence(tree.Root.GetSubtree(0).GetSubtree(0), sb);
223
224      return sb.ToString();
225    }
226
227    private void TreeToSentence(ISymbolicExpressionTreeNode treeNode, StringBuilder sb) {
228      if (treeNode.SubtreeCount == 0) {
229        // terminal
230        sb.Append(treeNode.Symbol.Name);
231      } else {
232        string op = string.Empty;
233        switch (treeNode.Symbol.Name) {
234          case "S": op = "+"; break;
235          case "P": op = "*"; break;
236          default: {
237              Debug.Assert(treeNode.SubtreeCount == 1);
238              break;
239            }
240        }
241        // nonterminal
242        if (op == "+") sb.Append("(");
243        TreeToSentence(treeNode.Subtrees.First(), sb);
244        foreach (var subTree in treeNode.Subtrees.Skip(1)) {
245          sb.Append(op);
246          TreeToSentence(subTree, sb);
247        }
248        if (op == "+") sb.Append(")");
249      }
250    }
251  }
252}
Note: See TracBrowser for help on using the repository browser.