1 | using System;
|
---|
2 | using System.Collections.Concurrent;
|
---|
3 | using System.Collections.Generic;
|
---|
4 | using System.Collections.Specialized;
|
---|
5 | using System.Linq;
|
---|
6 | using System.Net;
|
---|
7 | using System.Security;
|
---|
8 | using System.Security.AccessControl;
|
---|
9 | using System.Text;
|
---|
10 | using HeuristicLab.Common;
|
---|
11 |
|
---|
12 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
13 | public class SymbolicRegressionPoly10Problem : IProblem {
|
---|
14 | // private const string grammarString = @"
|
---|
15 | // G(E):
|
---|
16 | // E -> V | V+E | V-E | V*E | (E)
|
---|
17 | // V -> a .. j
|
---|
18 | // ";
|
---|
19 | private const string grammarString = @"
|
---|
20 | G(E):
|
---|
21 | E -> a | b | c | d | e | f | g | h | i | j | a+E | b+E | c+E | d+E | e+E | f+E | g+E | h+E | i+E | j+E | a*E | b*E | c*E | d*E | e*E | f*E | g*E | h*E | i*E | j*E
|
---|
22 | ";
|
---|
23 |
|
---|
24 |
|
---|
25 | private readonly IGrammar grammar;
|
---|
26 |
|
---|
27 | private readonly int N;
|
---|
28 | private readonly double[][] x;
|
---|
29 | private readonly double[] y;
|
---|
30 |
|
---|
31 | public SymbolicRegressionPoly10Problem() {
|
---|
32 | this.grammar = new Grammar(grammarString);
|
---|
33 |
|
---|
34 | this.N = 500;
|
---|
35 | this.x = new double[N][];
|
---|
36 | this.y = new double[N];
|
---|
37 |
|
---|
38 | GenerateData();
|
---|
39 | }
|
---|
40 |
|
---|
41 | private void GenerateData() {
|
---|
42 | // generate data with fixed seed to make sure that data is always the same
|
---|
43 | var rand = new Random(31415);
|
---|
44 | for (int i = 0; i < N; i++) {
|
---|
45 | x[i] = new double[10];
|
---|
46 | for (int j = 0; j < 10; j++) {
|
---|
47 | x[i][j] = rand.NextDouble() * 2 - 1;
|
---|
48 | }
|
---|
49 | // poly-10 no noise
|
---|
50 | /* a*b + c*d + e*f + a*g*i + c*f*j */
|
---|
51 | y[i] = x[i][0] * x[i][1] +
|
---|
52 | x[i][2] * x[i][3] +
|
---|
53 | x[i][4] * x[i][5] +
|
---|
54 | x[i][0] * x[i][6] * x[i][8] +
|
---|
55 | x[i][2] * x[i][5] * x[i][9];
|
---|
56 | }
|
---|
57 | }
|
---|
58 |
|
---|
59 | public double BestKnownQuality(int maxLen) {
|
---|
60 | // for now only an upper bound is returned, ideally we have an R² of 1.0
|
---|
61 | // the optimal R² can only be reached for sentences of at least 23 symbols
|
---|
62 | return 1.0;
|
---|
63 | }
|
---|
64 |
|
---|
65 | public IGrammar Grammar {
|
---|
66 | get { return grammar; }
|
---|
67 | }
|
---|
68 |
|
---|
69 | public double Evaluate(string sentence) {
|
---|
70 | var interpreter = new ExpressionInterpreter();
|
---|
71 | return HeuristicLab.Common.Extensions.RSq(y, Enumerable.Range(0, N).Select(i => interpreter.Interpret(sentence, x[i])).ToArray());
|
---|
72 | }
|
---|
73 |
|
---|
74 |
|
---|
75 |
|
---|
76 | // most-recently-used caching (with limited capacity) for canonical representations
|
---|
77 | MostRecentlyUsedCache<string, string> canonicalPhraseCache = new MostRecentlyUsedCache<string, string>(100000);
|
---|
78 | // right now only + and * is supported
|
---|
79 | public string CanonicalRepresentation(string phrase) {
|
---|
80 | string canonicalPhrase;
|
---|
81 | if (!canonicalPhraseCache.TryGetValue(phrase, out canonicalPhrase)) {
|
---|
82 | var terms = phrase.Split('+');
|
---|
83 | var canonicalTerms = new SortedSet<string>();
|
---|
84 | // only the last term might contain a NT-symbol. make sure this term is added at the end
|
---|
85 | for (int i = 0; i < terms.Length - 1; i++) {
|
---|
86 | canonicalTerms.Add(CanonicalTerm(terms[i]));
|
---|
87 | }
|
---|
88 |
|
---|
89 | var sb = new StringBuilder(phrase.Length);
|
---|
90 | foreach (var t in canonicalTerms)
|
---|
91 | sb.Append(t).Append('+');
|
---|
92 |
|
---|
93 | sb.Append(CanonicalTerm(terms[terms.Length - 1]));
|
---|
94 | canonicalPhrase = sb.ToString();
|
---|
95 | canonicalPhraseCache.Add(phrase, canonicalPhrase);
|
---|
96 | }
|
---|
97 | return canonicalPhrase;
|
---|
98 | }
|
---|
99 |
|
---|
100 | // cache the canonical form of terms for performance reasons
|
---|
101 | private Dictionary<string, string> canonicalTermDictionary = new Dictionary<string, string>();
|
---|
102 | private string CanonicalTerm(string term) {
|
---|
103 | string canonicalTerm;
|
---|
104 | if (!canonicalTermDictionary.TryGetValue(term, out canonicalTerm)) {
|
---|
105 | // add
|
---|
106 | var chars = term.ToCharArray();
|
---|
107 | Array.Sort(chars);
|
---|
108 | var sb = new StringBuilder(chars.Length);
|
---|
109 | // we want to have the up-case characters last
|
---|
110 | for (int i = chars.Length - 1; i > 0; i--) {
|
---|
111 | if (chars[i] != '*') {
|
---|
112 | sb.Append(chars[i]);
|
---|
113 | if (chars[i - 1] != '*') sb.Append('*');
|
---|
114 | }
|
---|
115 | }
|
---|
116 | if (chars[0] != '*') sb.Append(chars[0]); // last term
|
---|
117 | canonicalTerm = sb.ToString();
|
---|
118 | canonicalTermDictionary.Add(term, canonicalTerm);
|
---|
119 | }
|
---|
120 | return canonicalTerm;
|
---|
121 | }
|
---|
122 |
|
---|
123 | // splits the phrase into terms and creates (sparse) term-occurrance features
|
---|
124 | public IEnumerable<Feature> GetFeatures(string phrase) {
|
---|
125 | var canonicalTerms = new HashSet<string>();
|
---|
126 | foreach (string t in phrase.Split('+')) {
|
---|
127 | canonicalTerms.Add(CanonicalTerm(t));
|
---|
128 | }
|
---|
129 | return canonicalTerms.Select(entry => new Feature(entry, 1.0)).Concat(new Feature[] { new Feature(CanonicalRepresentation(phrase), 1.0) });
|
---|
130 | }
|
---|
131 |
|
---|
132 | }
|
---|
133 | }
|
---|