[11747] | 1 | using System;
|
---|
| 2 | using System.Collections.Generic;
|
---|
| 3 | using System.Diagnostics;
|
---|
| 4 | using System.Linq;
|
---|
[12290] | 5 | using System.Runtime.InteropServices;
|
---|
[11747] | 6 | using System.Text;
|
---|
| 7 | using System.Text.RegularExpressions;
|
---|
| 8 | using HeuristicLab.Common;
|
---|
[11865] | 9 | using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
|
---|
[11747] | 10 |
|
---|
| 11 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
[11755] | 12 | // must find one of numCorrectPhrases^sequenceLen sequences (any of numCorrectPhrases phrases is correct at each of the sequenceLen positions) where the quality of a sequence is the length of the subsequence containing only correct _phrases_ (of length phraseLen) and starting at the first position
|
---|
[11747] | 13 | // compared to the RoyalSequence problem this problem is harder because the number of different phrases starting at a position is much larger than the number of symbols (grows exponentially with the phrase-length)
|
---|
| 14 | // if phraseLen = 1 this is the same as the RoyalSequence problem
|
---|
| 15 | // parameters
|
---|
| 16 | // - alphabetSize: number of different symbols (max=26)
|
---|
| 17 | // - phraseLen: the length of a phrase in number of symbols
|
---|
| 18 | // - sequenceLen: the number of phrases in the correct subsequence (total sequence length is sequenceLen * phraseLen)
|
---|
[11755] | 19 | // - numCorrectPhrases: the number of correct phrases starting at each position
|
---|
| 20 | // - phrasesAsSets: switch to determine if the ordering of symbols within a phrase is relevant
|
---|
[11747] | 21 | //
|
---|
| 22 | // this problem should be hard for GP and easy for MCTS (TD should not have an advantage compared to MCTS)
|
---|
| 23 | // for phraseLen > 1 this should be harder than RoyalSymbolProblem
|
---|
[11755] | 24 | // when phrases are symbol sets instead of sequences then value-estimation routines should be better (TD)
|
---|
/// <summary>
/// Benchmark problem in which a sentence is split into consecutive phrases of
/// <c>phraseLen</c> symbols and rewarded for every phrase that is correct for its position,
/// counted from the first position up to the first incorrect phrase.
/// </summary>
public class RoyalPhraseSequenceProblem : ISymbolicExpressionTreeProblem {

  private readonly IGrammar grammar;
  private readonly double correctReward;
  private readonly double incorrectReward;
  private readonly int sequenceLen;
  private readonly int phraseLen;
  private readonly bool phrasesAsSets;
  // optimalPhrasesForPos[i] holds the canonical form of every phrase that counts as correct at phrase position i
  private readonly SortedSet<string>[] optimalPhrasesForPos;

  public string Name { get { return "RoyalPhraseSequence"; } }

  /// <summary>
  /// Creates a problem instance with randomly drawn correct phrases for every position.
  /// </summary>
  /// <param name="rand">Random number generator used to draw the correct phrases.</param>
  /// <param name="alphabetSize">Number of different terminal symbols (1..26, mapped to 'a'..).</param>
  /// <param name="sequenceLen">Number of phrases in the correct sequence.</param>
  /// <param name="phraseLen">Number of symbols per phrase.</param>
  /// <param name="numCorrectPhrases">Number of distinct correct phrases per position (1..alphabetSize).</param>
  /// <param name="correctReward">Reward added for each correct phrase; must be larger than <paramref name="incorrectReward"/>.</param>
  /// <param name="incorrectReward">Reward added per remaining phrase once an incorrect phrase is hit.</param>
  /// <param name="phrasesAsSets">If true, the order of symbols within a phrase is irrelevant.</param>
  /// <exception cref="ArgumentException">Thrown when an argument is outside of its valid range.</exception>
  public RoyalPhraseSequenceProblem(System.Random rand, int alphabetSize, int sequenceLen, int phraseLen = 1, int numCorrectPhrases = 1, double correctReward = 1.0, double incorrectReward = 0.0, bool phrasesAsSets = false) {
    if (rand == null) throw new ArgumentNullException("rand");
    if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException("alphabetSize must be in the range 1..26.", "alphabetSize");
    if (sequenceLen <= 0) throw new ArgumentException("sequenceLen must be positive.", "sequenceLen");
    if (numCorrectPhrases < 1 || numCorrectPhrases > alphabetSize) throw new ArgumentException("numCorrectPhrases must be in the range 1..alphabetSize.", "numCorrectPhrases");
    if (phraseLen < 1) throw new ArgumentException("phraseLen must be at least 1.", "phraseLen");
    if (correctReward <= incorrectReward) throw new ArgumentException("correctReward must be larger than incorrectReward.", "correctReward");

    this.sequenceLen = sequenceLen;
    this.phraseLen = phraseLen;
    this.correctReward = correctReward;
    this.incorrectReward = incorrectReward;
    this.phrasesAsSets = phrasesAsSets;

    var sentenceSymbol = 'S';
    var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray();
    var nonTerminalSymbols = new char[] { sentenceSymbol };

    {
      // create grammar
      // S -> a..z | aS .. zS
      var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString()))
        .Concat(terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t + sentenceSymbol.ToString())));

      this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);
    }
    {
      // create grammar for tree-based GP
      // S -> a..z | SS
      var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString()))
        .Concat(new Tuple<char, string>[] { Tuple.Create(sentenceSymbol, sentenceSymbol.ToString() + sentenceSymbol) });

      this.TreeBasedGPGrammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);
    }

    this.optimalPhrasesForPos = new SortedSet<string>[sequenceLen];
    for (int i = 0; i < sequenceLen; i++) {
      optimalPhrasesForPos[i] = new SortedSet<string>();
      for (int j = 0; j < numCorrectPhrases; j++) {
        string phrase;
        do {
          // every attempt must start from an empty phrase; otherwise a rejected duplicate
          // would be extended to 2 * phraseLen symbols and could never be matched by Evaluate
          phrase = "";
          for (int l = 0; l < phraseLen; l++) {
            phrase += terminalSymbols.SelectRandom(rand);
          }
          phrase = CanonicalPhrase(phrase);
        } while (optimalPhrasesForPos[i].Contains(phrase)); // don't allow duplicate phrases
        optimalPhrasesForPos[i].Add(phrase);
      }
    }

    // sanity check: the best known solution must reach the best known quality
    // (compared with a tolerance because the reward is accumulated by repeated addition)
    Debug.Assert(Math.Abs(Evaluate(BestKnownSolution) / BestKnownQuality(phraseLen * sequenceLen) - 1.0) < 1E-9);
  }

  /// <summary>
  /// Returns the best achievable quality for sentences of at most <paramref name="maxLen"/> symbols.
  /// </summary>
  public double BestKnownQuality(int maxLen) {
    var numCompletePhrases = maxLen / phraseLen; // integer division
    return Math.Min(numCompletePhrases, sequenceLen) * correctReward;
  }

  /// <summary>
  /// A sentence built from the first (smallest in set order) correct phrase of every position.
  /// </summary>
  public string BestKnownSolution {
    get {
      return string.Concat(optimalPhrasesForPos.Select(phrases => phrases.First()));
    }
  }

  public IGrammar Grammar {
    get { return grammar; }
  }

  /// <summary>
  /// Scores a sentence: <c>correctReward</c> is added for every leading phrase that is correct
  /// for its position. On the first incorrect phrase, <c>incorrectReward</c> is added once for
  /// that phrase and every complete phrase after it in the sentence, and the result is clamped at 0.
  /// Symbols of a trailing incomplete phrase are ignored.
  /// </summary>
  public double Evaluate(string sentence) {
    // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow!
    Debug.Assert(sentence.All(c => grammar.IsTerminal(c)));

    var numPhrasesInSentence = sentence.Length / phraseLen; // integer division
    var numPhrasesToCheck = Math.Min(numPhrasesInSentence, sequenceLen);

    // as long as only correct phrases are found we increase the reward by correctReward
    var reward = 0.0;
    for (int i = 0; i < numPhrasesToCheck; i++) {
      var canonicalPhrase = CanonicalPhrase(sentence.Substring(i * phraseLen, phraseLen));
      if (!optimalPhrasesForPos[i].Contains(canonicalPhrase)) {
        // first incorrect phrase: penalize all remaining phrases and stop
        return Math.Max(0.0, reward + incorrectReward * (numPhrasesInSentence - i));
      }
      reward += correctReward;
    }
    return reward;
  }

  // TODO: cache canonical phrases in most-recently used dictionary for increased performance (see symbolicregressionpoly10problem)
  // Returns the phrase with its symbols sorted when phrases are treated as sets, otherwise the phrase itself.
  private string CanonicalPhrase(string phrase) {
    if (!phrasesAsSets) return phrase;
    return string.Join("", phrase.OrderBy(ch => (byte)ch));
  }

  /// <summary>
  /// Returns the canonical form of a (possibly partial) sentence. When phrases are treated as sets,
  /// the symbols within every complete phrase and within the trailing incomplete phrase are sorted;
  /// otherwise the input is returned unchanged.
  /// </summary>
  public string CanonicalRepresentation(string phrase) {
    if (!phrasesAsSets) return phrase;

    var sb = new StringBuilder();
    var numPhrases = phrase.Length / phraseLen;
    for (int phraseIdx = 0; phraseIdx < numPhrases; phraseIdx++) {
      var subphrase = phrase.Substring(phraseIdx * phraseLen, phraseLen);
      sb.Append(CanonicalPhrase(subphrase));
    }

    // symbols after the last complete phrase form an incomplete phrase which is canonicalized as well
    var remainder = phrase.Substring(numPhrases * phraseLen);
    sb.Append(CanonicalPhrase(remainder));

    return sb.ToString();
  }

  /// <summary>
  /// Returns a single feature consisting of the given phrase with value 1.0.
  /// </summary>
  public IEnumerable<Feature> GetFeatures(string phrase) {
    return new Feature[] { new Feature(phrase, 1.0) };
  }

  public IGrammar TreeBasedGPGrammar { get; private set; }

  /// <summary>
  /// Concatenates the symbol names of all nodes below the tree's start node in prefix order,
  /// skipping nodes whose symbol is named "S".
  /// </summary>
  public string ConvertTreeToSentence(ISymbolicExpressionTree tree) {
    var sb = new StringBuilder();
    foreach (var s in tree.Root.GetSubtree(0).GetSubtree(0).IterateNodesPrefix()) {
      if (s.Symbol.Name == "S") continue;
      sb.Append(s.Symbol.Name);
    }
    return sb.ToString();
  }
}
|
---|
| 166 | }
|
---|