1 | using System;
|
---|
2 | using System.Collections.Generic;
|
---|
3 | using System.Data.Odbc;
|
---|
4 | using System.Diagnostics;
|
---|
5 | using System.Linq;
|
---|
6 | using System.Text;
|
---|
7 | using System.Text.RegularExpressions;
|
---|
8 | using HeuristicLab.Common;
|
---|
9 |
|
---|
10 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
11 | // must find a set of phrases where the ordering of phrases is irrelevant
|
---|
12 | // Parameters
|
---|
13 | // - size of the alphabet
|
---|
14 | // - phrase length
|
---|
15 | // - number of phrases in the sequence
|
---|
16 | // - number of optimal phrases
|
---|
17 | // - reward for optimal phrases
|
---|
18 | // - number of decoy (sub-optimal) phrases
|
---|
19 | // - reward for decoy phrases (must be smaller than reward for optimal phrases)
|
---|
20 | // - phrasesAsSets: a switch to determine whether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets)
|
---|
21 |
|
---|
22 | // this problem should be similar to symbolic regression and should be easier for approaches using a state estimation value and the canonical state
|
---|
23 | // when phrases are symbol sets instead of sequences then value-estimation routines should be better (TD)
|
---|
/// <summary>
/// Artificial benchmark problem: a sentence is cut into consecutive, non-overlapping
/// chunks of <c>phraseLen</c> symbols ("phrases") and is rewarded for every distinct
/// optimal phrase and every distinct decoy phrase it contains. The order in which
/// phrases occur in the sentence is irrelevant.
/// </summary>
public class FindPhrasesProblem : IProblem {

  private readonly IGrammar grammar;
  private readonly int numPhrases;
  private readonly int phraseLen;
  private readonly int numOptimalPhrases;
  private readonly int numDecoyPhrases;
  private readonly double correctReward;
  private readonly double decoyReward;
  private readonly bool phrasesAsSets;
  // ordinal comparer: ordering (and therefore BestKnownSolution) must not depend on the current culture
  private readonly SortedSet<string> optimalPhrases;
  private readonly SortedSet<string> decoyPhrases;

  /// <summary>
  /// Creates a problem instance with randomly generated optimal and decoy phrases.
  /// </summary>
  /// <param name="rand">Random source used to draw the phrases.</param>
  /// <param name="alphabetSize">Number of terminal symbols (1..26, mapped to 'a'..'z').</param>
  /// <param name="numPhrases">Number of phrases in a best-known sentence (&gt; 0).</param>
  /// <param name="phraseLen">Number of symbols per phrase (&gt;= 1).</param>
  /// <param name="numOptimalPhrases">Number of distinct optimal phrases (&gt;= numPhrases).</param>
  /// <param name="numDecoyPhrases">Number of distinct decoy phrases (&gt;= 0).</param>
  /// <param name="correctReward">Reward per distinct optimal phrase; must be larger than <paramref name="decoyReward"/>.</param>
  /// <param name="decoyReward">Reward per distinct decoy phrase.</param>
  /// <param name="phrasesAsSets">If true, the order of symbols within a phrase is irrelevant.</param>
  /// <exception cref="ArgumentException">
  /// An argument is out of range, or more distinct phrases are requested than can exist
  /// for the given alphabet size and phrase length.
  /// </exception>
  public FindPhrasesProblem(Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1,
    double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) {
    if (rand == null) throw new ArgumentNullException("rand");
    if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException("alphabetSize must be in the range 1..26.", "alphabetSize");
    if (numPhrases <= 0) throw new ArgumentException("numPhrases must be positive.", "numPhrases");
    if (phraseLen < 1) throw new ArgumentException("phraseLen must be at least 1.", "phraseLen");
    if (numOptimalPhrases < numPhrases) throw new ArgumentException("numOptimalPhrases must not be smaller than numPhrases.", "numOptimalPhrases");
    if (numDecoyPhrases < 0) throw new ArgumentException("numDecoyPhrases must not be negative.", "numDecoyPhrases");
    if (correctReward <= decoyReward) throw new ArgumentException("correctReward must be larger than decoyReward.", "correctReward");

    // the generation loops below only terminate if enough distinct phrases exist
    long requiredPhrases = (long)numOptimalPhrases + numDecoyPhrases;
    if (CountDistinctPhrases(alphabetSize, phraseLen, phrasesAsSets, requiredPhrases) < requiredPhrases)
      throw new ArgumentException("The alphabet size and phrase length do not allow the requested number of distinct optimal and decoy phrases.");

    this.numPhrases = numPhrases;
    this.phraseLen = phraseLen;
    this.numOptimalPhrases = numOptimalPhrases;
    this.numDecoyPhrases = numDecoyPhrases;
    this.correctReward = correctReward;
    this.decoyReward = decoyReward;
    this.phrasesAsSets = phrasesAsSets;

    // create grammar: S -> t | tS for every terminal symbol t
    var sentenceSymbol = 'S';
    var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray();
    var nonTerminalSymbols = new char[] { sentenceSymbol };
    var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString()))
      .Concat(terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString() + sentenceSymbol)));

    this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);

    // generate optimal phrases (the set silently ignores duplicates)
    optimalPhrases = new SortedSet<string>(StringComparer.Ordinal);
    while (optimalPhrases.Count < numOptimalPhrases) {
      optimalPhrases.Add(CreateRandomPhrase(rand, terminalSymbols));
    }

    // generate decoy phrases, which must differ from all optimal phrases
    decoyPhrases = new SortedSet<string>(StringComparer.Ordinal);
    while (decoyPhrases.Count < numDecoyPhrases) {
      var phrase = CreateRandomPhrase(rand, terminalSymbols);
      if (!optimalPhrases.Contains(phrase)) decoyPhrases.Add(phrase);
    }

    // compare the difference instead of the ratio so that correctReward == 0 does not produce NaN
    Debug.Assert(Math.Abs(Evaluate(BestKnownSolution) - BestKnownQuality(phraseLen * numPhrases)) < 1e-9);
  }

  /// <summary>
  /// Best reachable quality for sentences of at most <paramref name="maxLen"/> symbols.
  /// </summary>
  public double BestKnownQuality(int maxLen) {
    return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division
  }

  /// <summary>
  /// A sentence consisting of the first <c>numPhrases</c> optimal phrases (in sorted order).
  /// </summary>
  public string BestKnownSolution {
    get { return string.Join("", optimalPhrases.Take(numPhrases)); }
  }

  public IGrammar Grammar {
    get { return grammar; }
  }

  /// <summary>
  /// Sums the reward of every distinct optimal and decoy phrase contained in the sentence.
  /// Phrases do not overlap, repeated phrases are counted once, and trailing symbols
  /// that do not fill a complete phrase are ignored.
  /// </summary>
  public double Evaluate(string sentence) {
    // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow!
    Debug.Assert(sentence.All(c => grammar.IsTerminal(c)));

    var phrases = SplitIntoCanonicalPhrases(sentence);

    // add reward for each correct phrase that occurs in the sentence
    // add reward for each decoy phrase that occurs in the sentence
    return phrases.Intersect(optimalPhrases).Count() * correctReward
           + phrases.Intersect(decoyPhrases).Count() * decoyReward;
  }

  /// <summary>
  /// Returns a representative for all sentences that are equivalent to
  /// <paramref name="terminalPhrase"/>: complete phrases are canonicalized, de-duplicated
  /// and sorted; the incomplete trailing phrase (if any) stays at the end.
  /// </summary>
  public string CanonicalRepresentation(string terminalPhrase) {
    // as the ordering of phrases does not matter we can reorder the phrases
    // and remove duplicates
    var phrases = SplitIntoCanonicalPhrases(terminalPhrase);

    // The remainder is an unfinished phrase and must not be sorted in between the
    // complete phrases: doing so shifts the phrase boundaries and maps sentences with
    // different complete phrases (e.g. "abc" and "bca" for phraseLen = 2) to the same string.
    int completeLen = (terminalPhrase.Length / phraseLen) * phraseLen;
    string remainder = CanonicalPhrase(terminalPhrase.Substring(completeLen));

    return string.Join("", phrases) + remainder;
  }

  // Canonical form of a single phrase: symbols sorted if phrases are sets, unchanged otherwise.
  private string CanonicalPhrase(string phrase) {
    if (!phrasesAsSets) return phrase;
    return new string(phrase.OrderBy(ch => (byte)ch).ToArray());
  }

  // Cuts the sentence into consecutive chunks of phraseLen symbols and returns the
  // distinct canonical phrases; an incomplete trailing chunk is not included.
  private SortedSet<string> SplitIntoCanonicalPhrases(string sentence) {
    var phrases = new SortedSet<string>(StringComparer.Ordinal);
    int completePhrases = sentence.Length / phraseLen;
    for (int phraseIdx = 0; phraseIdx < completePhrases; phraseIdx++) {
      phrases.Add(CanonicalPhrase(sentence.Substring(phraseIdx * phraseLen, phraseLen)));
    }
    return phrases;
  }

  // Draws phraseLen random terminal symbols and returns the canonical form of the phrase.
  private string CreateRandomPhrase(Random rand, char[] terminalSymbols) {
    var builder = new StringBuilder(phraseLen);
    for (int l = 0; l < phraseLen; l++) {
      builder.Append(terminalSymbols.SelectRandom(rand));
    }
    return CanonicalPhrase(builder.ToString());
  }

  // Number of distinct canonical phrases that exist for the given settings, saturated at cap
  // (the exact number is irrelevant once it reaches cap, and saturation avoids overflow).
  private static long CountDistinctPhrases(int alphabetSize, int phraseLen, bool phrasesAsSets, long cap) {
    long count = 1;
    if (phrasesAsSets) {
      // multisets of size phraseLen over alphabetSize symbols: C(phraseLen + alphabetSize - 1, alphabetSize - 1)
      for (int i = 1; i < alphabetSize; i++) {
        long factor = (long)phraseLen + i;
        if (count > long.MaxValue / factor) return cap;
        count = count * factor / i; // exact, the intermediate value is C(phraseLen + i, i)
        if (count >= cap) return cap;
      }
    } else if (alphabetSize > 1) {
      // sequences: alphabetSize ^ phraseLen
      for (int i = 0; i < phraseLen && count < cap; i++) {
        count *= alphabetSize;
      }
    }
    return Math.Min(count, cap);
  }
}
|
---|
152 | }
|
---|