1 | using System;
|
---|
2 | using System.Collections.Concurrent;
|
---|
3 | using System.Collections.Generic;
|
---|
4 | using System.Collections.Specialized;
|
---|
5 | using System.Diagnostics;
|
---|
6 | using System.Linq;
|
---|
7 | using System.Net;
|
---|
8 | using System.Security;
|
---|
9 | using System.Security.AccessControl;
|
---|
10 | using System.Text;
|
---|
11 | using HeuristicLab.Common;
|
---|
12 | using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
|
---|
13 |
|
---|
14 | namespace HeuristicLab.Problems.GrammaticalOptimization
|
---|
15 | {
|
---|
16 | public class SymbolicRegressionPoly10Problem : ISymbolicExpressionTreeProblem
|
---|
17 | {
|
---|
18 | // private const string grammarString = @"
|
---|
19 | // G(E):
|
---|
20 | // E -> V | V+E | V-E | V*E | (E)
|
---|
21 | // V -> a .. j
|
---|
22 | // ";
|
---|
23 | private const string grammarString = @"
|
---|
24 | G(E):
|
---|
25 | E -> a | b | c | d | e | f | g | h | i | j | a+E | b+E | c+E | d+E | e+E | f+E | g+E | h+E | i+E | j+E | a*E | b*E | c*E | d*E | e*E | f*E | g*E | h*E | i*E | j*E
|
---|
26 | ";
|
---|
27 |
|
---|
28 | // for tree-based GP in HL we need a different grammar for the same language
|
---|
29 | // E = expr, S = sum, P = product
|
---|
30 | private const string hlGrammarString = @"
|
---|
31 | G(E):
|
---|
32 | E -> S | P | a | b | c | d | e | f | g | h | i | j
|
---|
33 | S -> EE | EEE
|
---|
34 | P -> EE | EEE
|
---|
35 | ";
|
---|
36 | // mininmal tree for the optimal expr (40 nodes)
|
---|
37 | // E S
|
---|
38 | // E S
|
---|
39 | // E P
|
---|
40 | // E a E b
|
---|
41 | // E P
|
---|
42 | // E c E d
|
---|
43 | // E P
|
---|
44 | // E e E f
|
---|
45 | // E S
|
---|
46 | // E P
|
---|
47 | // E a E g E i
|
---|
48 | // E P
|
---|
49 | // E c E f E j
|
---|
50 |
|
---|
51 | public IGrammar TreeBasedGPGrammar { get; private set; }
|
---|
52 |
|
---|
53 | private readonly IGrammar grammar;
|
---|
54 |
|
---|
55 | private readonly int N;
|
---|
56 | private readonly double[][] x;
|
---|
57 | private readonly double[] y;
|
---|
58 | public string Name { get { return "SymbolicRegressionPoly10"; } }
|
---|
59 |
|
---|
60 | public SymbolicRegressionPoly10Problem()
|
---|
61 | {
|
---|
62 | this.grammar = new Grammar(grammarString);
|
---|
63 | this.TreeBasedGPGrammar = new Grammar(hlGrammarString);
|
---|
64 |
|
---|
65 | this.N = 500;
|
---|
66 | this.x = new double[N][];
|
---|
67 | this.y = new double[N];
|
---|
68 |
|
---|
69 | GenerateData();
|
---|
70 | }
|
---|
71 |
|
---|
72 | private void GenerateData()
|
---|
73 | {
|
---|
74 | // generate data with fixed seed to make sure that data is always the same
|
---|
75 | var rand = new Random(31415);
|
---|
76 | for (int i = 0; i < N; i++)
|
---|
77 | {
|
---|
78 | x[i] = new double[10];
|
---|
79 | for (int j = 0; j < 10; j++)
|
---|
80 | {
|
---|
81 | x[i][j] = rand.NextDouble() * 2 - 1;
|
---|
82 | }
|
---|
83 | // poly-10 no noise
|
---|
84 | /* a*b + c*d + e*f + a*g*i + c*f*j */
|
---|
85 | y[i] = x[i][0] * x[i][1] +
|
---|
86 | x[i][2] * x[i][3] +
|
---|
87 | x[i][4] * x[i][5] +
|
---|
88 | x[i][0] * x[i][6] * x[i][8] +
|
---|
89 | x[i][2] * x[i][5] * x[i][9];
|
---|
90 | }
|
---|
91 | }
|
---|
92 |
|
---|
93 | public double BestKnownQuality(int maxLen)
|
---|
94 | {
|
---|
95 | // for now only an upper bound is returned, ideally we have an R² of 1.0
|
---|
96 | // the optimal R² can only be reached for sentences of at least 23 symbols
|
---|
97 | return 1.0;
|
---|
98 | }
|
---|
99 |
|
---|
100 | public IGrammar Grammar
|
---|
101 | {
|
---|
102 | get { return grammar; }
|
---|
103 | }
|
---|
104 |
|
---|
105 | public double Evaluate(string sentence)
|
---|
106 | {
|
---|
107 | var interpreter = new ExpressionInterpreter();
|
---|
108 | return HeuristicLab.Common.Extensions.RSq(y, Enumerable.Range(0, N).Select(i => interpreter.Interpret(sentence, x[i])).ToArray());
|
---|
109 | }
|
---|
110 |
|
---|
111 |
|
---|
112 | // most-recently-used caching (with limited capacity) for canonical representations
|
---|
113 | MostRecentlyUsedCache<string, string> canonicalPhraseCache = new MostRecentlyUsedCache<string, string>(100000);
|
---|
114 | // right now only + and * is supported
|
---|
115 | public string CanonicalRepresentation(string phrase)
|
---|
116 | {
|
---|
117 | string canonicalPhrase;
|
---|
118 | if (!canonicalPhraseCache.TryGetValue(phrase, out canonicalPhrase))
|
---|
119 | {
|
---|
120 | var terms = phrase.Split('+');
|
---|
121 | var canonicalTerms = new SortedSet<string>();
|
---|
122 | // only the last term might contain a NT-symbol. make sure this term is added at the end
|
---|
123 | for (int i = 0; i < terms.Length - 1; i++)
|
---|
124 | {
|
---|
125 | canonicalTerms.Add(CanonicalTerm(terms[i]));
|
---|
126 | }
|
---|
127 |
|
---|
128 | var sb = new StringBuilder(phrase.Length);
|
---|
129 | foreach (var t in canonicalTerms)
|
---|
130 | sb.Append(t).Append('+');
|
---|
131 |
|
---|
132 | sb.Append(CanonicalTerm(terms[terms.Length - 1]));
|
---|
133 | canonicalPhrase = sb.ToString();
|
---|
134 | canonicalPhraseCache.Add(phrase, canonicalPhrase);
|
---|
135 | }
|
---|
136 | return canonicalPhrase;
|
---|
137 | }
|
---|
138 |
|
---|
139 | // cache the canonical form of terms for performance reasons
|
---|
140 | private Dictionary<string, string> canonicalTermDictionary = new Dictionary<string, string>();
|
---|
141 | private string CanonicalTerm(string term)
|
---|
142 | {
|
---|
143 | string canonicalTerm;
|
---|
144 | if (!canonicalTermDictionary.TryGetValue(term, out canonicalTerm))
|
---|
145 | {
|
---|
146 | // add
|
---|
147 | var chars = term.ToCharArray();
|
---|
148 | Array.Sort(chars);
|
---|
149 | var sb = new StringBuilder(chars.Length);
|
---|
150 | // we want to have the up-case characters last
|
---|
151 | for (int i = chars.Length - 1; i > 0; i--)
|
---|
152 | {
|
---|
153 | if (chars[i] != '*')
|
---|
154 | {
|
---|
155 | sb.Append(chars[i]);
|
---|
156 | if (chars[i - 1] != '*') sb.Append('*');
|
---|
157 | }
|
---|
158 | }
|
---|
159 | if (chars[0] != '*') sb.Append(chars[0]); // last term
|
---|
160 | canonicalTerm = sb.ToString();
|
---|
161 | canonicalTermDictionary.Add(term, canonicalTerm);
|
---|
162 | }
|
---|
163 | return canonicalTerm;
|
---|
164 | }
|
---|
165 |
|
---|
166 | // splits the phrase into terms and creates (sparse) term-occurrance features
|
---|
167 | public IEnumerable<Feature> GetFeatures(string phrase)
|
---|
168 | {
|
---|
169 | var canonicalTerms = new HashSet<string>();
|
---|
170 | foreach (string t in phrase.Split('+'))
|
---|
171 | {
|
---|
172 | canonicalTerms.Add(CanonicalTerm(t));
|
---|
173 | }
|
---|
174 | return canonicalTerms.Select(entry => new Feature(entry, 1.0))
|
---|
175 | .Concat(new Feature[] { new Feature(CanonicalRepresentation(phrase), 1.0) });
|
---|
176 | }
|
---|
177 |
|
---|
178 | public string ConvertTreeToSentence(ISymbolicExpressionTree tree)
|
---|
179 | {
|
---|
180 | var sb = new StringBuilder();
|
---|
181 |
|
---|
182 | TreeToSentence(tree.Root.GetSubtree(0).GetSubtree(0), sb);
|
---|
183 |
|
---|
184 | return sb.ToString();
|
---|
185 | }
|
---|
186 |
|
---|
187 | private void TreeToSentence(ISymbolicExpressionTreeNode treeNode, StringBuilder sb)
|
---|
188 | {
|
---|
189 | if (treeNode.SubtreeCount == 0)
|
---|
190 | {
|
---|
191 | // terminal
|
---|
192 | sb.Append(treeNode.Symbol.Name);
|
---|
193 | }
|
---|
194 | else
|
---|
195 | {
|
---|
196 | string op = string.Empty;
|
---|
197 | switch (treeNode.Symbol.Name)
|
---|
198 | {
|
---|
199 | case "S": op = "+"; break;
|
---|
200 | case "P": op = "*"; break;
|
---|
201 | default:
|
---|
202 | {
|
---|
203 | Debug.Assert(treeNode.SubtreeCount == 1);
|
---|
204 | break;
|
---|
205 | }
|
---|
206 | }
|
---|
207 | // nonterminal
|
---|
208 | if (op == "+") sb.Append("(");
|
---|
209 | TreeToSentence(treeNode.Subtrees.First(), sb);
|
---|
210 | foreach (var subTree in treeNode.Subtrees.Skip(1))
|
---|
211 | {
|
---|
212 | sb.Append(op);
|
---|
213 | TreeToSentence(subTree, sb);
|
---|
214 | }
|
---|
215 | if (op == "+") sb.Append(")");
|
---|
216 | }
|
---|
217 | }
|
---|
218 |
|
---|
219 | public void GenerateProblemSolutions(int maxLen)
|
---|
220 | {
|
---|
221 | }
|
---|
222 |
|
---|
223 | private readonly List<string> canonicaSolutionTerms3 = new List<string> { "b*a", "d*c", "f*e" };
|
---|
224 | private readonly List<string> canonicaSolutionTerms5 = new List<string> { "i*g*a", "j*f*c" };
|
---|
225 |
|
---|
226 | public bool IsParentOfProblemSolution(string sentence, int maxLen)
|
---|
227 | {
|
---|
228 | /* a*b + c*d + e*f + a*g*i + c*f*j */
|
---|
229 |
|
---|
230 | bool containsE = sentence.EndsWith("E");
|
---|
231 |
|
---|
232 | if (!containsE && sentence.Length != 23 || sentence.Length > 23)
|
---|
233 | {
|
---|
234 | return false;
|
---|
235 | }
|
---|
236 |
|
---|
237 | // now: contains E or has solution length..
|
---|
238 | string[] terms = sentence.Split('+');
|
---|
239 |
|
---|
240 | bool[] terms3 = new bool[3];
|
---|
241 | bool[] terms5 = new bool[2];
|
---|
242 |
|
---|
243 | // run through addends without E
|
---|
244 | for (int i = 0; i < terms.Length - 1; i++)
|
---|
245 | {
|
---|
246 | string term = terms[i];
|
---|
247 | if (term.Length == 3)
|
---|
248 | {
|
---|
249 | string canonicalTerm = CanonicalTerm(term);
|
---|
250 | int index = canonicaSolutionTerms3.IndexOf(canonicalTerm);
|
---|
251 | if (index < 0 || terms3[index])
|
---|
252 | {
|
---|
253 | // term is not part of solution or twice in solution
|
---|
254 | return false;
|
---|
255 | }
|
---|
256 | terms3[index] = true;
|
---|
257 | }
|
---|
258 | else if (term.Length == 5)
|
---|
259 | {
|
---|
260 | string canonicalTerm = CanonicalTerm(term);
|
---|
261 | int index = canonicaSolutionTerms5.IndexOf(canonicalTerm);
|
---|
262 | if (index < 0 || terms5[index])
|
---|
263 | {
|
---|
264 | // term is not part of solution or twice in solution
|
---|
265 | return false;
|
---|
266 | }
|
---|
267 | terms5[index] = true;
|
---|
268 | }
|
---|
269 | else
|
---|
270 | {
|
---|
271 | return false;
|
---|
272 | }
|
---|
273 | }
|
---|
274 |
|
---|
275 | string lastTerm = terms[terms.Length - 1];
|
---|
276 | if (containsE)
|
---|
277 | {
|
---|
278 | if (lastTerm.Length == 1)
|
---|
279 | {
|
---|
280 | // is E
|
---|
281 | return true;
|
---|
282 | }
|
---|
283 | string withoutE = lastTerm.Substring(0, lastTerm.Length - 2);
|
---|
284 | if (lastTerm.Length == 3)
|
---|
285 | {
|
---|
286 | for (int i = 0; i < 3; i++)
|
---|
287 | {
|
---|
288 | if (!terms3[i] && canonicaSolutionTerms3[i].Contains(withoutE))
|
---|
289 | {
|
---|
290 | return true;
|
---|
291 | }
|
---|
292 | }
|
---|
293 |
|
---|
294 | }
|
---|
295 | else if (lastTerm.Length == 5)
|
---|
296 | {
|
---|
297 | for (int i = 0; i < 2; i++)
|
---|
298 | {
|
---|
299 | if (!terms5[i] && canonicaSolutionTerms5[i].Contains(withoutE))
|
---|
300 | {
|
---|
301 | return true;
|
---|
302 | }
|
---|
303 | }
|
---|
304 | }
|
---|
305 | }
|
---|
306 | else
|
---|
307 | {
|
---|
308 | // check if it is already correct solution
|
---|
309 | if (lastTerm.Length == 3)
|
---|
310 | {
|
---|
311 | for (int i = 0; i < 3; i++)
|
---|
312 | {
|
---|
313 | if (!terms3[i] && canonicaSolutionTerms3[i] == lastTerm)
|
---|
314 | {
|
---|
315 | return true;
|
---|
316 | }
|
---|
317 | }
|
---|
318 |
|
---|
319 | }
|
---|
320 | else if (lastTerm.Length == 5)
|
---|
321 | {
|
---|
322 | for (int i = 0; i < 2; i++)
|
---|
323 | {
|
---|
324 | if (!terms5[i] && canonicaSolutionTerms5[i] == lastTerm)
|
---|
325 | {
|
---|
326 | return true;
|
---|
327 | }
|
---|
328 | }
|
---|
329 | }
|
---|
330 | }
|
---|
331 |
|
---|
332 | return false;
|
---|
333 | }
|
---|
334 | }
|
---|
335 | }
|
---|