[11754] | 1 | using System;
|
---|
| 2 | using System.Collections.Generic;
|
---|
[11755] | 3 | using System.Data.Odbc;
|
---|
[11754] | 4 | using System.Diagnostics;
|
---|
| 5 | using System.Linq;
|
---|
| 6 | using System.Text;
|
---|
| 7 | using System.Text.RegularExpressions;
|
---|
| 8 | using HeuristicLab.Common;
|
---|
| 9 |
|
---|
| 10 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
| 11 | // must find a set of phrases where the ordering of phrases is irrelevant
|
---|
| 12 | // Parameters
|
---|
| 13 | // - size of the alphabet
|
---|
| 14 | // - phrase length
|
---|
| 15 | // - number of phrases in the sequence
|
---|
| 16 | // - number of optimal phrases
|
---|
| 17 | // - reward for optimal phrases
|
---|
| 18 | // - number of decoy (sub-optimal) phrases
|
---|
| 19 | // - reward for decoy phrases (must be smaller than reward for optimal phrases)
|
---|
[11755] | 20 | // - phrasesAsSets: a switch to determine whether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets)
|
---|
| 21 |
|
---|
| 22 | // this problem should be similar to symbolic regression and should be easier for approaches using a state estimation value and the canonical state
|
---|
| 23 | // when phrases are symbol sets instead of sequences then value-estimation routines should be better (TD)
|
---|
/// <summary>
/// Artificial benchmark problem: a sentence is cut into consecutive, non-overlapping phrases of
/// length <c>phraseLen</c> and receives a reward for every distinct optimal phrase and every
/// distinct decoy phrase it contains. The order of the phrases within the sentence is irrelevant.
/// </summary>
public class FindPhrasesProblem : IProblem {

  private readonly IGrammar grammar;
  private readonly int numPhrases;
  private readonly int phraseLen;
  private readonly double correctReward;
  private readonly double decoyReward;
  private readonly bool phrasesAsSets;
  // Ordinal ordering keeps BestKnownSolution and CanonicalRepresentation independent of the current culture.
  private readonly SortedSet<string> optimalPhrases;
  private readonly SortedSet<string> decoyPhrases;

  /// <summary>
  /// Creates a problem instance with randomly generated optimal and decoy phrases.
  /// </summary>
  /// <param name="rand">Random number generator used to draw the phrases.</param>
  /// <param name="alphabetSize">Number of terminal symbols ('a', 'b', ...); must be in [1, 26].</param>
  /// <param name="numPhrases">Number of phrases in the best known solution; must be positive.</param>
  /// <param name="phraseLen">Number of symbols per phrase; must be at least 1.</param>
  /// <param name="numOptimalPhrases">Number of distinct optimal phrases; must be at least <paramref name="numPhrases"/>.</param>
  /// <param name="numDecoyPhrases">Number of distinct decoy phrases; must not be negative.</param>
  /// <param name="correctReward">Reward per distinct optimal phrase; must be larger than <paramref name="decoyReward"/>.</param>
  /// <param name="decoyReward">Reward per distinct decoy phrase.</param>
  /// <param name="phrasesAsSets">If true the order of symbols within a phrase is irrelevant.</param>
  /// <exception cref="ArgumentNullException"><paramref name="rand"/> is null.</exception>
  /// <exception cref="ArgumentException">An argument is out of range, or more distinct phrases are requested than exist.</exception>
  public FindPhrasesProblem(Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1,
    double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) {
    if (rand == null) throw new ArgumentNullException("rand");
    if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException("alphabetSize must be in the range [1, 26].", "alphabetSize");
    if (numPhrases <= 0) throw new ArgumentException("numPhrases must be positive.", "numPhrases");
    if (phraseLen < 1) throw new ArgumentException("phraseLen must be at least 1.", "phraseLen");
    if (numOptimalPhrases < numPhrases) throw new ArgumentException("numOptimalPhrases must not be smaller than numPhrases.", "numOptimalPhrases");
    if (numDecoyPhrases < 0) throw new ArgumentException("numDecoyPhrases must not be negative.", "numDecoyPhrases");
    if (correctReward <= decoyReward) throw new ArgumentException("correctReward must be larger than decoyReward.", "correctReward");

    // Optimal and decoy phrases are all pairwise distinct; without this check the generation
    // loops below would spin forever when more phrases are requested than can exist.
    if (!CanGenerateDistinctPhrases(alphabetSize, phraseLen, phrasesAsSets, (long)numOptimalPhrases + numDecoyPhrases))
      throw new ArgumentException("The alphabet and phrase length do not allow the requested number of distinct optimal and decoy phrases.");

    this.numPhrases = numPhrases;
    this.phraseLen = phraseLen;
    this.correctReward = correctReward;
    this.decoyReward = decoyReward;
    this.phrasesAsSets = phrasesAsSets;

    // create grammar: S -> t | tS for every terminal symbol t
    var sentenceSymbol = 'S';
    var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray();
    var nonTerminalSymbols = new char[] { 'S' };
    var rules = terminalSymbols.Select(t => Tuple.Create('S', t.ToString()))
      .Concat(terminalSymbols.Select(t => Tuple.Create('S', t + "S")));

    this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);

    // generate optimal phrases (Add ignores duplicates, so the loop runs until enough distinct phrases exist)
    optimalPhrases = new SortedSet<string>(StringComparer.Ordinal);
    while (optimalPhrases.Count < numOptimalPhrases) {
      optimalPhrases.Add(CreateRandomPhrase(rand, terminalSymbols));
    }

    // generate decoy phrases; a decoy must not coincide with an optimal phrase
    decoyPhrases = new SortedSet<string>(StringComparer.Ordinal);
    while (decoyPhrases.Count < numDecoyPhrases) {
      var phrase = CreateRandomPhrase(rand, terminalSymbols);
      if (!optimalPhrases.Contains(phrase)) decoyPhrases.Add(phrase);
    }

    // compare directly instead of dividing: a ratio would be NaN for correctReward == 0
    Debug.Assert(Evaluate(BestKnownSolution) == BestKnownQuality(phraseLen * numPhrases));
  }

  /// <summary>
  /// Returns the best quality reachable by a sentence of at most <paramref name="maxLen"/> symbols.
  /// </summary>
  public double BestKnownQuality(int maxLen) {
    return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division
  }

  /// <summary>
  /// Concatenation of the first <c>numPhrases</c> optimal phrases (in sorted order).
  /// </summary>
  public string BestKnownSolution {
    get { return string.Join("", optimalPhrases.Take(numPhrases)); }
  }

  public IGrammar Grammar {
    get { return grammar; }
  }

  /// <summary>
  /// Sums the rewards of all distinct optimal and decoy phrases that occur in the sentence.
  /// </summary>
  /// <param name="sentence">Sentence consisting only of terminal symbols.</param>
  public double Evaluate(string sentence) {
    // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow!
    Debug.Assert(sentence.All(c => grammar.IsTerminal(c)));

    // split the sentence in phrases
    // phrases must not overlap in the sentence, multiple occurrences of a phrase are not counted
    // the order of phrases is not relevant; trailing symbols that do not fill a phrase are ignored
    var phrases = SplitIntoCanonicalPhrases(sentence);

    // add reward for each correct phrase that occurs in the sentence
    // add reward for each decoy phrase that occurs in the sentence
    var reward = phrases.Count(p => optimalPhrases.Contains(p)) * correctReward
               + phrases.Count(p => decoyPhrases.Contains(p)) * decoyReward;

    return reward;
  }

  // TODO: cache canonical phrases in most-recently used dictionary for increased performance (see symbolicregressionpoly10problem)
  // Returns the phrase with its symbols sorted when phrases are treated as sets, otherwise the phrase itself.
  private string CanonicalPhrase(string phrase) {
    if (phrasesAsSets) return string.Join("", phrase.OrderBy(ch => (byte)ch));
    else return phrase;
  }

  /// <summary>
  /// Maps a (possibly incomplete) sentence to a representative in which complete phrases are
  /// canonicalized, de-duplicated and sorted.
  /// </summary>
  public string CanonicalRepresentation(string phrase) {
    // as the ordering of phrases does not matter we can reorder the phrases
    // and remove duplicates
    var phrases = SplitIntoCanonicalPhrases(phrase);

    // NOTE(review): the trailing partial phrase is sorted in among the complete phrases instead of
    // being kept at the end; this is the original behaviour and is preserved here - confirm it is intended.
    var remainder = phrase.Substring((phrase.Length / phraseLen) * phraseLen);
    phrases.Add(CanonicalPhrase(remainder));

    return string.Join("", phrases);
  }

  public IEnumerable<Feature> GetFeatures(string phrase) {
    throw new NotImplementedException();
  }

  public override string ToString() {
    return string.Format("\"FindPhrasesProblem {0} {1} {2} {3:F2} {4} {5:F2} {6}\"", numPhrases, phraseLen,
      optimalPhrases.Count, correctReward, decoyPhrases.Count, decoyReward, phrasesAsSets);
  }

  // Cuts text into consecutive blocks of phraseLen symbols (an incomplete trailing block is dropped)
  // and returns the distinct canonical phrases in ordinal order.
  private SortedSet<string> SplitIntoCanonicalPhrases(string text) {
    var phrases = new SortedSet<string>(StringComparer.Ordinal);
    var completePhrases = text.Length / phraseLen;
    for (int phraseIdx = 0; phraseIdx < completePhrases; phraseIdx++) {
      phrases.Add(CanonicalPhrase(text.Substring(phraseIdx * phraseLen, phraseLen)));
    }
    return phrases;
  }

  // Draws phraseLen random terminal symbols and returns the canonical form of the resulting phrase.
  private string CreateRandomPhrase(Random rand, char[] terminalSymbols) {
    var builder = new StringBuilder(phraseLen);
    for (int l = 0; l < phraseLen; l++) {
      builder.Append(terminalSymbols.SelectRandom(rand));
    }
    return CanonicalPhrase(builder.ToString());
  }

  // Checks whether at least 'required' distinct canonical phrases exist: alphabetSize^phraseLen
  // sequences, or C(alphabetSize + phraseLen - 1, phraseLen) multisets when phrases are sets.
  private static bool CanGenerateDistinctPhrases(int alphabetSize, int phraseLen, bool phrasesAsSets, long required) {
    ulong needed = (ulong)required;
    ulong count = 1;
    for (int len = 1; len <= phraseLen; len++) {
      // the count never shrinks, so we can stop early; count < needed <= 2^32 here, which keeps
      // the multiplication below within the range of ulong
      if (count >= needed) return true;
      if (phrasesAsSets) {
        // C(n - 1 + len, len) = C(n - 2 + len, len - 1) * (n - 1 + len) / len (the division is exact)
        count = count * (ulong)(alphabetSize - 1 + (long)len) / (ulong)len;
      } else {
        count *= (ulong)alphabetSize;
      }
    }
    return count >= needed;
  }
}
|
---|
| 159 | }
|
---|