[11659] | 1 | using System;
|
---|
[11799] | 2 | using System.Collections.Concurrent;
|
---|
[11659] | 3 | using System.Collections.Generic;
|
---|
[11799] | 4 | using System.Collections.Specialized;
|
---|
[11846] | 5 | using System.Diagnostics;
|
---|
[11659] | 6 | using System.Linq;
|
---|
[11770] | 7 | using System.Net;
|
---|
[11659] | 8 | using System.Security;
|
---|
| 9 | using System.Security.AccessControl;
|
---|
| 10 | using System.Text;
|
---|
[11727] | 11 | using HeuristicLab.Common;
|
---|
[11846] | 12 | using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
|
---|
[11659] | 13 |
|
---|
| 14 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
[11846] | 15 | public class SymbolicRegressionPoly10Problem : ISymbolicExpressionTreeProblem {
|
---|
[11727] | 16 | // private const string grammarString = @"
|
---|
| 17 | // G(E):
|
---|
| 18 | // E -> V | V+E | V-E | V*E | (E)
|
---|
| 19 | // V -> a .. j
|
---|
| 20 | // ";
|
---|
[12290] | 21 | //private const string grammarString = @"
|
---|
| 22 | //G(E):
|
---|
| 23 | //E -> a | b | c | d | e | f | g | h | i | j | a+E | b+E | c+E | d+E | e+E | f+E | g+E | h+E | i+E | j+E | a*E | b*E | c*E | d*E | e*E | f*E | g*E | h*E | i*E | j*E
|
---|
| 24 | //";
|
---|
[11659] | 25 | private const string grammarString = @"
|
---|
[11727] | 26 | G(E):
|
---|
[12290] | 27 | E -> a | b | c | d | e | f | g | h | i | j | a+E | b+E | c+E | d+E | e+E | f+E | g+E | h+E | i+E | j+E | a*E | b*E | c*E | d*E | e*E | f*E | g*E | h*E | i*E | j*E
|
---|
[11727] | 28 | ";
|
---|
[11659] | 29 |
|
---|
[11846] | 30 | // for tree-based GP in HL we need a different grammar for the same language
|
---|
| 31 | // E = expr, S = sum, P = product
|
---|
| 32 | private const string hlGrammarString = @"
|
---|
| 33 | G(E):
|
---|
| 34 | E -> S | P | a | b | c | d | e | f | g | h | i | j
|
---|
| 35 | S -> EE | EEE
|
---|
| 36 | P -> EE | EEE
|
---|
| 37 | ";
|
---|
| 38 | // mininmal tree for the optimal expr (40 nodes)
|
---|
| 39 | // E S
|
---|
| 40 | // E S
|
---|
| 41 | // E P
|
---|
| 42 | // E a E b
|
---|
| 43 | // E P
|
---|
| 44 | // E c E d
|
---|
| 45 | // E P
|
---|
| 46 | // E e E f
|
---|
| 47 | // E S
|
---|
| 48 | // E P
|
---|
| 49 | // E a E g E i
|
---|
| 50 | // E P
|
---|
| 51 | // E c E f E j
|
---|
[11708] | 52 |
|
---|
[11857] | 53 | public IGrammar TreeBasedGPGrammar { get; private set; }
|
---|
[11846] | 54 |
|
---|
[11659] | 55 | private readonly IGrammar grammar;
|
---|
| 56 |
|
---|
| 57 | private readonly int N;
|
---|
| 58 | private readonly double[][] x;
|
---|
| 59 | private readonly double[] y;
|
---|
[12099] | 60 | public string Name { get { return "SymbolicRegressionPoly10"; } }
|
---|
[11659] | 61 |
|
---|
| 62 | public SymbolicRegressionPoly10Problem() {
|
---|
| 63 | this.grammar = new Grammar(grammarString);
|
---|
[11857] | 64 | this.TreeBasedGPGrammar = new Grammar(hlGrammarString);
|
---|
[11659] | 65 |
|
---|
| 66 | this.N = 500;
|
---|
| 67 | this.x = new double[N][];
|
---|
| 68 | this.y = new double[N];
|
---|
| 69 |
|
---|
| 70 | GenerateData();
|
---|
| 71 | }
|
---|
| 72 |
|
---|
| 73 | private void GenerateData() {
|
---|
| 74 | // generate data with fixed seed to make sure that data is always the same
|
---|
[12391] | 75 | var rand = new System.Random(31415);
|
---|
[11659] | 76 | for (int i = 0; i < N; i++) {
|
---|
| 77 | x[i] = new double[10];
|
---|
| 78 | for (int j = 0; j < 10; j++) {
|
---|
| 79 | x[i][j] = rand.NextDouble() * 2 - 1;
|
---|
| 80 | }
|
---|
| 81 | // poly-10 no noise
|
---|
[11730] | 82 | /* a*b + c*d + e*f + a*g*i + c*f*j */
|
---|
[11659] | 83 | y[i] = x[i][0] * x[i][1] +
|
---|
| 84 | x[i][2] * x[i][3] +
|
---|
| 85 | x[i][4] * x[i][5] +
|
---|
| 86 | x[i][0] * x[i][6] * x[i][8] +
|
---|
| 87 | x[i][2] * x[i][5] * x[i][9];
|
---|
| 88 | }
|
---|
| 89 | }
|
---|
| 90 |
|
---|
[11732] | 91 | public double BestKnownQuality(int maxLen) {
|
---|
[11659] | 92 | // for now only an upper bound is returned, ideally we have an R² of 1.0
|
---|
| 93 | // the optimal R² can only be reached for sentences of at least 23 symbols
|
---|
| 94 | return 1.0;
|
---|
| 95 | }
|
---|
| 96 |
|
---|
| 97 | public IGrammar Grammar {
|
---|
| 98 | get { return grammar; }
|
---|
| 99 | }
|
---|
| 100 |
|
---|
| 101 | public double Evaluate(string sentence) {
|
---|
[11742] | 102 | var interpreter = new ExpressionInterpreter();
|
---|
[11732] | 103 | return HeuristicLab.Common.Extensions.RSq(y, Enumerable.Range(0, N).Select(i => interpreter.Interpret(sentence, x[i])).ToArray());
|
---|
[11659] | 104 | }
|
---|
| 105 |
|
---|
| 106 |
|
---|
[11799] | 107 | // most-recently-used caching (with limited capacity) for canonical representations
|
---|
| 108 | MostRecentlyUsedCache<string, string> canonicalPhraseCache = new MostRecentlyUsedCache<string, string>(100000);
|
---|
[11727] | 109 | // right now only + and * is supported
|
---|
[11745] | 110 | public string CanonicalRepresentation(string phrase) {
|
---|
[11799] | 111 | string canonicalPhrase;
|
---|
| 112 | if (!canonicalPhraseCache.TryGetValue(phrase, out canonicalPhrase)) {
|
---|
| 113 | var terms = phrase.Split('+');
|
---|
| 114 | var canonicalTerms = new SortedSet<string>();
|
---|
| 115 | // only the last term might contain a NT-symbol. make sure this term is added at the end
|
---|
| 116 | for (int i = 0; i < terms.Length - 1; i++) {
|
---|
| 117 | canonicalTerms.Add(CanonicalTerm(terms[i]));
|
---|
| 118 | }
|
---|
[11745] | 119 |
|
---|
[11799] | 120 | var sb = new StringBuilder(phrase.Length);
|
---|
| 121 | foreach (var t in canonicalTerms)
|
---|
| 122 | sb.Append(t).Append('+');
|
---|
| 123 |
|
---|
| 124 | sb.Append(CanonicalTerm(terms[terms.Length - 1]));
|
---|
| 125 | canonicalPhrase = sb.ToString();
|
---|
| 126 | canonicalPhraseCache.Add(phrase, canonicalPhrase);
|
---|
| 127 | }
|
---|
| 128 | return canonicalPhrase;
|
---|
[11727] | 129 | }
|
---|
[11745] | 130 |
|
---|
[11799] | 131 | // cache the canonical form of terms for performance reasons
|
---|
| 132 | private Dictionary<string, string> canonicalTermDictionary = new Dictionary<string, string>();
|
---|
[11745] | 133 | private string CanonicalTerm(string term) {
|
---|
[11799] | 134 | string canonicalTerm;
|
---|
| 135 | if (!canonicalTermDictionary.TryGetValue(term, out canonicalTerm)) {
|
---|
| 136 | // add
|
---|
| 137 | var chars = term.ToCharArray();
|
---|
| 138 | Array.Sort(chars);
|
---|
| 139 | var sb = new StringBuilder(chars.Length);
|
---|
| 140 | // we want to have the up-case characters last
|
---|
[11806] | 141 | for (int i = chars.Length - 1; i > 0; i--) {
|
---|
| 142 | if (chars[i] != '*') {
|
---|
| 143 | sb.Append(chars[i]);
|
---|
| 144 | if (chars[i - 1] != '*') sb.Append('*');
|
---|
| 145 | }
|
---|
[11799] | 146 | }
|
---|
[11806] | 147 | if (chars[0] != '*') sb.Append(chars[0]); // last term
|
---|
[11799] | 148 | canonicalTerm = sb.ToString();
|
---|
| 149 | canonicalTermDictionary.Add(term, canonicalTerm);
|
---|
| 150 | }
|
---|
| 151 | return canonicalTerm;
|
---|
[11745] | 152 | }
|
---|
[11832] | 153 |
|
---|
[12290] | 154 | private double[] varIds = new double[] { };
|
---|
| 155 |
|
---|
[11832] | 156 | // splits the phrase into terms and creates (sparse) term-occurrance features
|
---|
| 157 | public IEnumerable<Feature> GetFeatures(string phrase) {
|
---|
[12298] | 158 | //if (phrase.EndsWith("E")) phrase = phrase.TrimEnd('*', '+', 'E');
|
---|
| 159 | //yield return new Feature("$$$", 1.0); // const
|
---|
| 160 | //var canonicalTerms = new HashSet<string>();
|
---|
| 161 | //foreach (string t in phrase.Split('+')) {
|
---|
| 162 | // canonicalTerms.Add(CanonicalTerm(t));
|
---|
| 163 | //}
|
---|
| 164 | //return canonicalTerms.Select(entry => new Feature(entry, 1.0));
|
---|
| 165 | //.Concat(new Feature[] { new Feature(CanonicalRepresentation(phrase), 1.0) });
|
---|
| 166 |
|
---|
| 167 |
|
---|
| 168 | if (phrase.EndsWith("E")) phrase = phrase.TrimEnd('*', '+', 'E');
|
---|
| 169 | //var len = 5;
|
---|
| 170 | //var start = Math.Max(0, phrase.Length - len);
|
---|
| 171 | //var end = Math.Min(phrase.Length, start + len);
|
---|
| 172 | //string f = phrase.Substring(start, end - start);
|
---|
| 173 | //yield return new Feature(f, 1.0);
|
---|
| 174 | //
|
---|
| 175 |
|
---|
| 176 | var terms = phrase.Split('+');
|
---|
| 177 | foreach (var t in terms.Distinct()) yield return new Feature(t, 1.0);
|
---|
| 178 |
|
---|
| 179 | for (int i = 0; i < terms.Length; i++) {
|
---|
| 180 | for (int j = i + 1; j < terms.Length; j++) {
|
---|
| 181 | yield return new Feature(terms[i] + " " + terms[j], 1.0);
|
---|
| 182 | }
|
---|
| 183 | }
|
---|
| 184 |
|
---|
| 185 | // var substrings = new HashSet<string>();
|
---|
| 186 | // for (int start = 0; start <= phrase.Length - 2; start += 2) {
|
---|
| 187 | // var s = phrase.Substring(start, 3);
|
---|
| 188 | // substrings.Add(s);
|
---|
[12290] | 189 | // }
|
---|
[12298] | 190 | //
|
---|
| 191 | // var list = new List<string>(substrings);
|
---|
| 192 | //
|
---|
| 193 | // for (int i = 0; i < list.Count; i++) {
|
---|
| 194 | // yield return new Feature(list[i], 1.0);
|
---|
| 195 | // //for (int j = i+1; j < list.Count; j++) {
|
---|
| 196 | // // yield return new Feature(list[i] + " " + list[j], 1.0);
|
---|
| 197 | // //}
|
---|
| 198 | // }
|
---|
[12290] | 199 |
|
---|
[12298] | 200 | //
|
---|
| 201 | // for (int len = 1; len <= phrase.Length; len += 2) {
|
---|
| 202 | // var start = Math.Max(0, phrase.Length - len);
|
---|
| 203 | // var end = Math.Min(phrase.Length, start + len);
|
---|
| 204 | // string f = phrase.Substring(start, end - start);
|
---|
| 205 | // yield return new Feature(f, 1.0);
|
---|
| 206 | //
|
---|
| 207 | // }
|
---|
[12294] | 208 |
|
---|
[12298] | 209 | //var partialInterpreter = new PartialExpressionInterpreter();
|
---|
| 210 | //var vars = new double[] { 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, };
|
---|
| 211 | //var s = partialInterpreter.Interpret(phrase, vars);
|
---|
| 212 | ////if (s.Any())
|
---|
| 213 | //// return new Feature[] { new Feature(s.Pop().ToString(), 1.0), };
|
---|
| 214 | ////else
|
---|
| 215 | //// return new Feature[] { new Feature("$", 1.0), };
|
---|
| 216 | //return s.Select(f => new Feature(f.ToString(), 1.0));
|
---|
[11832] | 217 | }
|
---|
| 218 |
|
---|
[11846] | 219 | public string ConvertTreeToSentence(ISymbolicExpressionTree tree) {
|
---|
| 220 | var sb = new StringBuilder();
|
---|
| 221 |
|
---|
| 222 | TreeToSentence(tree.Root.GetSubtree(0).GetSubtree(0), sb);
|
---|
| 223 |
|
---|
| 224 | return sb.ToString();
|
---|
| 225 | }
|
---|
| 226 |
|
---|
| 227 | private void TreeToSentence(ISymbolicExpressionTreeNode treeNode, StringBuilder sb) {
|
---|
| 228 | if (treeNode.SubtreeCount == 0) {
|
---|
| 229 | // terminal
|
---|
| 230 | sb.Append(treeNode.Symbol.Name);
|
---|
| 231 | } else {
|
---|
| 232 | string op = string.Empty;
|
---|
| 233 | switch (treeNode.Symbol.Name) {
|
---|
| 234 | case "S": op = "+"; break;
|
---|
| 235 | case "P": op = "*"; break;
|
---|
| 236 | default: {
|
---|
| 237 | Debug.Assert(treeNode.SubtreeCount == 1);
|
---|
| 238 | break;
|
---|
| 239 | }
|
---|
| 240 | }
|
---|
| 241 | // nonterminal
|
---|
| 242 | if (op == "+") sb.Append("(");
|
---|
| 243 | TreeToSentence(treeNode.Subtrees.First(), sb);
|
---|
| 244 | foreach (var subTree in treeNode.Subtrees.Skip(1)) {
|
---|
| 245 | sb.Append(op);
|
---|
| 246 | TreeToSentence(subTree, sb);
|
---|
| 247 | }
|
---|
| 248 | if (op == "+") sb.Append(")");
|
---|
| 249 | }
|
---|
| 250 | }
|
---|
[11659] | 251 | }
|
---|
| 252 | }
|
---|