1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Linq;
5 | using System.Text;
6 | using HeuristicLab.Common;
7 | using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
8 |
9 | namespace HeuristicLab.Problems.GrammaticalOptimization {
10 | // must find a set of phrases where the ordering of phrases is irrelevant
11 | // Parameters
12 | // - size of the alphabet
13 | // - phrase length
14 | // - number of phrases in the sequence
15 | // - number of optimal phrases
16 | // - reward for optimal phrases
17 | // - number of decoy (sub-optimal) phrases
18 | // - reward for decoy phrases (must be smaller than reward for optimal phrases)
// - phrasesAsSets: a switch to determine whether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets)
20 |
// This problem should be similar to symbolic regression and should be easier for approaches using a state estimation value and the canonical state.
// When phrases are symbol sets instead of sequences, value-estimation routines (TD) should perform better.
/// <summary>
/// Synthetic problem in which a sentence is split into consecutive, non-overlapping phrases of fixed length
/// and rewarded for every distinct optimal (or decoy) phrase it contains. The order of the phrases within the
/// sentence is irrelevant and repeated phrases are only counted once.
/// </summary>
public class FindPhrasesProblem : ISymbolicExpressionTreeProblem {

  private readonly IGrammar grammar;
  private readonly int numPhrases;
  private readonly int phraseLen;
  private readonly double correctReward;
  private readonly double decoyReward;
  private readonly bool phrasesAsSets;
  private readonly int alphabetSize;
  private readonly int numOptimalPhrases;
  private readonly int numDecoyPhrases;
  // Both sets store phrases in canonical form (see CanonicalPhrase) and use ordinal ordering so that
  // the phrase order (and therefore BestKnownSolution) does not depend on the current culture.
  private readonly SortedSet<string> optimalPhrases;
  private readonly SortedSet<string> decoyPhrases;

  /// <summary>Descriptive name of the problem instance that lists all of its parameters.</summary>
  public string Name { get { return string.Format("FindPhrases({0},{1},{2},{3},{4},{5},{6},{7})", alphabetSize, numPhrases, phraseLen, numOptimalPhrases, numDecoyPhrases, correctReward, decoyReward, phrasesAsSets); } }

  /// <summary>
  /// Creates a new problem instance and randomly draws the optimal and the decoy phrases.
  /// </summary>
  /// <param name="rand">Random number generator used to draw the symbols of the phrases.</param>
  /// <param name="alphabetSize">Number of terminal symbols (1..26); the symbols are 'a', 'b', ...</param>
  /// <param name="numPhrases">Number of phrases in the best known solution (must be positive).</param>
  /// <param name="phraseLen">Number of symbols per phrase (must be at least 1).</param>
  /// <param name="numOptimalPhrases">Number of distinct optimal phrases (must not be smaller than <paramref name="numPhrases"/>).</param>
  /// <param name="numDecoyPhrases">Number of distinct decoy phrases (must not be negative).</param>
  /// <param name="correctReward">Reward for each distinct optimal phrase (must be larger than <paramref name="decoyReward"/>).</param>
  /// <param name="decoyReward">Reward for each distinct decoy phrase.</param>
  /// <param name="phrasesAsSets">If true the order of the symbols within a phrase is irrelevant.</param>
  /// <exception cref="ArgumentException">
  /// Thrown when an argument is outside of its valid range or when the alphabet and the phrase length
  /// do not allow the requested number of distinct optimal and decoy phrases.
  /// </exception>
  public FindPhrasesProblem(System.Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1,
    double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) {
    if (rand == null) throw new ArgumentNullException("rand");
    if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException("alphabetSize must be in the range [1..26].", "alphabetSize");
    if (numPhrases <= 0) throw new ArgumentException("numPhrases must be positive.", "numPhrases");
    if (phraseLen < 1) throw new ArgumentException("phraseLen must be at least 1.", "phraseLen");
    if (numOptimalPhrases < numPhrases) throw new ArgumentException("numOptimalPhrases must not be smaller than numPhrases.", "numOptimalPhrases");
    if (numDecoyPhrases < 0) throw new ArgumentException("numDecoyPhrases must not be negative.", "numDecoyPhrases");
    if (correctReward <= decoyReward) throw new ArgumentException("correctReward must be larger than decoyReward.", "correctReward");
    // Phrases are drawn by rejection sampling below; without this check the sampling loops would never
    // terminate when more distinct phrases are requested than can possibly exist.
    if (!HasEnoughDistinctPhrases(alphabetSize, phraseLen, phrasesAsSets, (long)numOptimalPhrases + numDecoyPhrases))
      throw new ArgumentException("The alphabet size and phrase length do not allow numOptimalPhrases + numDecoyPhrases distinct phrases.");

    this.numPhrases = numPhrases;
    this.phraseLen = phraseLen;
    this.correctReward = correctReward;
    this.decoyReward = decoyReward;
    this.phrasesAsSets = phrasesAsSets;
    this.alphabetSize = alphabetSize;
    this.numOptimalPhrases = numOptimalPhrases;
    this.numDecoyPhrases = numDecoyPhrases;

    var sentenceSymbol = 'S';
    var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray();
    var nonTerminalSymbols = new char[] { sentenceSymbol };

    {
      // create grammar
      // S -> a..z | aS .. zS
      var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString()))
        .Concat(terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t + sentenceSymbol.ToString())));

      this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);
    }
    {
      // create grammar for tree-based GP
      // S -> a..z | SS
      var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString()))
        .Concat(new Tuple<char, string>[] { Tuple.Create(sentenceSymbol, sentenceSymbol.ToString() + sentenceSymbol) });

      this.TreeBasedGPGrammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);
    }

    // generate optimal phrases (SortedSet.Add ignores duplicates, so we simply sample until we have enough)
    optimalPhrases = new SortedSet<string>(StringComparer.Ordinal);
    while (optimalPhrases.Count < numOptimalPhrases) {
      optimalPhrases.Add(CreateRandomPhrase(rand, terminalSymbols));
    }

    // generate decoy phrases (must be distinct from each other and from all optimal phrases)
    decoyPhrases = new SortedSet<string>(StringComparer.Ordinal);
    while (decoyPhrases.Count < numDecoyPhrases) {
      var phrase = CreateRandomPhrase(rand, terminalSymbols);
      if (!optimalPhrases.Contains(phrase)) decoyPhrases.Add(phrase);
    }

    // sanity check: the best known solution must reach the best known quality
    // (compared by difference instead of ratio to stay well-defined when correctReward is zero)
    Debug.Assert(Math.Abs(Evaluate(BestKnownSolution) - BestKnownQuality(phraseLen * numPhrases)) < 1E-9);
  }

  /// <summary>
  /// Best reachable quality for sentences with at most <paramref name="maxLen"/> symbols.
  /// </summary>
  /// <param name="maxLen">Maximal sentence length in symbols.</param>
  /// <returns>The reward for as many optimal phrases as fit into the sentence, limited to the number of phrases of the problem.</returns>
  public double BestKnownQuality(int maxLen) {
    return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division
  }

  /// <summary>Concatenation of the first optimal phrases (in sorted order), one for each phrase of the problem.</summary>
  public string BestKnownSolution {
    get { return string.Join("", optimalPhrases.Take(numPhrases)); }
  }

  /// <summary>Grammar for sentences: S -> a..z | aS .. zS.</summary>
  public IGrammar Grammar {
    get { return grammar; }
  }

  /// <summary>
  /// Calculates the reward for a sentence. The sentence is split into consecutive phrases,
  /// an incomplete phrase at the end of the sentence is ignored.
  /// </summary>
  /// <param name="sentence">Sentence consisting only of terminal symbols.</param>
  /// <returns>The sum of rewards for all distinct optimal and decoy phrases occurring in the sentence.</returns>
  public double Evaluate(string sentence) {
    // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow!
    Debug.Assert(sentence.All(c => grammar.IsTerminal(c)));

    // split the sentence in phrases
    // phrases must not overlap in the sentence, multiple occurrences of a phrase are not counted
    // the order of phrases is not relevant
    var phrases = SplitIntoCanonicalPhrases(sentence);

    // add reward for each correct phrase that occurs in the sentence
    // add reward for each decoy phrase that occurs in the sentence
    var reward = phrases.Count(p => optimalPhrases.Contains(p)) * correctReward
                 + phrases.Count(p => decoyPhrases.Contains(p)) * decoyReward;

    return reward;
  }

  // TODO: cache canonical phrases in most-recently used dictionary for increased performance (see symbolicregressionpoly10problem)
  // Returns the canonical form of a single phrase: symbols are sorted when phrases are treated as sets,
  // otherwise the phrase is returned unchanged.
  private string CanonicalPhrase(string phrase) {
    if (phrasesAsSets) return string.Join("", phrase.OrderBy(ch => (byte)ch));
    else return phrase;
  }

  // Splits the sentence into consecutive phrases of length phraseLen and returns the set of distinct
  // canonical phrases in ordinal order. Trailing symbols that do not form a full phrase are ignored.
  private SortedSet<string> SplitIntoCanonicalPhrases(string sentence) {
    var phrases = new SortedSet<string>(StringComparer.Ordinal);
    var fullPhrases = sentence.Length / phraseLen;
    for (int phraseIdx = 0; phraseIdx < fullPhrases; phraseIdx++) {
      phrases.Add(CanonicalPhrase(sentence.Substring(phraseIdx * phraseLen, phraseLen)));
    }
    return phrases;
  }

  // Draws phraseLen random terminal symbols and returns the resulting phrase in canonical form.
  private string CreateRandomPhrase(System.Random rand, char[] terminalSymbols) {
    var sb = new StringBuilder(phraseLen);
    for (int l = 0; l < phraseLen; l++) {
      sb.Append(terminalSymbols.SelectRandom(rand));
    }
    return CanonicalPhrase(sb.ToString());
  }

  // Checks if at least 'required' distinct canonical phrases exist for the given alphabet size and phrase length.
  // The count is only accumulated until it reaches 'required', so it cannot overflow.
  private static bool HasEnoughDistinctPhrases(int alphabetSize, int phraseLen, bool phrasesAsSets, long required) {
    decimal count = 1; // there is always at least one phrase (alphabetSize >= 1 and phraseLen >= 1)
    if (count >= required) return true;

    if (phrasesAsSets) {
      // number of multisets of size phraseLen over alphabetSize symbols:
      // C(phraseLen + alphabetSize - 1, alphabetSize - 1) = prod_{i = 1..alphabetSize - 1} (phraseLen + i) / i
      // every partial product is itself a binomial coefficient, therefore the division is always exact
      for (int i = 1; i < alphabetSize; i++) {
        count = count * ((decimal)phraseLen + i) / i;
        if (count >= required) return true;
      }
      return false;
    }

    // number of sequences: alphabetSize ^ phraseLen
    if (alphabetSize == 1) return false; // only a single phrase exists, no need to iterate phraseLen times
    for (int i = 0; i < phraseLen; i++) {
      count *= alphabetSize;
      if (count >= required) return true;
    }
    return false;
  }

  /// <summary>
  /// Produces a canonical representation of a (partial) sentence. As the ordering of phrases does not matter
  /// the complete phrases are sorted and duplicates are removed. The incomplete phrase at the end of the
  /// sentence (if any) is kept at the end, because moving it in between complete phrases would shift the
  /// boundaries of all following phrases.
  /// </summary>
  /// <param name="phrase">The (partial) sentence.</param>
  /// <returns>The sorted distinct canonical phrases followed by the canonical form of the remaining symbols.</returns>
  public string CanonicalRepresentation(string phrase) {
    var phrases = SplitIntoCanonicalPhrases(phrase);
    var remainderStart = (phrase.Length / phraseLen) * phraseLen;
    var remainder = CanonicalPhrase(phrase.Substring(remainderStart));

    return string.Join("", phrases) + remainder;
  }

  /// <summary>Returns a single feature that is identified by the phrase itself and has the value 1.0.</summary>
  public IEnumerable<Feature> GetFeatures(string phrase) {
    return new Feature[] { new Feature(phrase, 1.0), };
  }

  /// <summary>Returns a quoted description of the problem instance including its parameters.</summary>
  public override string ToString() {
    return string.Format("\"FindPhrasesProblem {0} {1} {2} {3:F2} {4} {5:F2} {6}\"", numPhrases, phraseLen,
      optimalPhrases.Count, correctReward, decoyPhrases.Count, decoyReward, phrasesAsSets);
  }

  /// <summary>Grammar for tree-based GP: S -> a..z | SS.</summary>
  public IGrammar TreeBasedGPGrammar { get; private set; }

  /// <summary>
  /// Converts a symbolic expression tree into a sentence by concatenating the symbol names
  /// of all nodes in prefix order, skipping nodes of the sentence symbol "S".
  /// </summary>
  /// <param name="tree">The tree to convert.</param>
  /// <returns>The sentence represented by the tree.</returns>
  public string ConvertTreeToSentence(ISymbolicExpressionTree tree) {
    var sb = new StringBuilder();
    // NOTE(review): the first two levels below the root are skipped, presumably the program root and start nodes - confirm
    foreach (var s in tree.Root.GetSubtree(0).GetSubtree(0).IterateNodesPrefix()) {
      if (s.Symbol.Name == "S") continue;
      sb.Append(s.Symbol.Name);
    }
    return sb.ToString();
  }
}
187 | }