using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Text; using HeuristicLab.Common; using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding; namespace HeuristicLab.Problems.GrammaticalOptimization { // must find a set of phrases where the ordering of phrases is irrelevant // Parameters // - size of the alphabet // - phrase length // - number of phrases in the sequence // - number of optimal phrases // - reward for optimal phrases // - number of decoy (sub-optimal) phrases // - reward for decoy phrases (must be smaller than reward for optimal phrases) // - phrasesAsSets: a switch to determine wether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets) // this problem should be similar to symbolic regression and should be easier for approaches using a state esimation value and the canoncial state // when phrases are symbol sets instead of sequences then value-estimation routines should be better (TD) public class FindPhrasesProblem : ISymbolicExpressionTreeProblem { private readonly IGrammar grammar; private readonly int numPhrases; private readonly int phraseLen; private readonly double correctReward; private readonly double decoyReward; private readonly bool phrasesAsSets; private readonly int alphabetSize; private readonly int numOptimalPhrases; private readonly int numDecoyPhrases; private readonly SortedSet optimalPhrases; private readonly SortedSet decoyPhrases; public string Name { get { return string.Format("FindPhrases({0},{1},{2},{3},{4},{5},{6},{7})", alphabetSize, numPhrases, phraseLen, numOptimalPhrases, numDecoyPhrases, correctReward, decoyReward, phrasesAsSets); } } public FindPhrasesProblem(System.Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1, double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) { if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException(); if (numPhrases <= 0) throw new ArgumentException(); if (phraseLen < 1) throw new ArgumentException(); if (numOptimalPhrases < numPhrases) throw new ArgumentException(); if (numDecoyPhrases < 0) throw new ArgumentException(); if (correctReward <= decoyReward) throw new ArgumentException(); this.numPhrases = numPhrases; this.phraseLen = phraseLen; this.correctReward = correctReward; this.decoyReward = decoyReward; this.phrasesAsSets = phrasesAsSets; this.alphabetSize = alphabetSize; this.numOptimalPhrases = numOptimalPhrases; this.numDecoyPhrases = numDecoyPhrases; var sentenceSymbol = 'S'; var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray(); var nonTerminalSymbols = new char[] { sentenceSymbol }; { // create grammar // S -> a..z | aS .. zS var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString())) .Concat(terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t + sentenceSymbol.ToString()))); this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules); } { // create grammar for tree-based GP // S -> a..z | SS var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString())) .Concat(new Tuple[] { Tuple.Create(sentenceSymbol, sentenceSymbol.ToString() + sentenceSymbol) }); this.TreeBasedGPGrammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules); } // generate optimal phrases optimalPhrases = new SortedSet(); while (optimalPhrases.Count < numOptimalPhrases) { string phrase = ""; for (int l = 0; l < phraseLen; l++) { phrase += terminalSymbols.SelectRandom(rand); } phrase = CanonicalPhrase(phrase); // don't allow dups if (!optimalPhrases.Contains(phrase)) optimalPhrases.Add(phrase); } // generate decoy phrases decoyPhrases = new SortedSet(); while (decoyPhrases.Count < numDecoyPhrases) { string phrase = ""; for (int l = 0; l < phraseLen; l++) { phrase += terminalSymbols.SelectRandom(rand); } phrase = CanonicalPhrase(phrase); // don't allow dups if (!optimalPhrases.Contains(phrase) && !decoyPhrases.Contains(phrase)) decoyPhrases.Add(phrase); } Debug.Assert(Evaluate(BestKnownSolution) / BestKnownQuality(phraseLen * numPhrases) == 1.0); } public double BestKnownQuality(int maxLen) { return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division } public string BestKnownSolution { get { return string.Join("", optimalPhrases.Take(numPhrases)); } } public IGrammar Grammar { get { return grammar; } } public double Evaluate(string sentence) { // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow! Debug.Assert(sentence.Any(c => grammar.IsTerminal(c))); // split the sentence in phrases // phrases must not overlap in the sentence, multiple occurences of a phrase are not counted // the order of phrases is not relevant var numPhrases = sentence.Length / phraseLen; var phrases = new SortedSet(); for (int phraseIdx = 0; phraseIdx < numPhrases; phraseIdx++) { var sentenceIdx = phraseIdx * phraseLen; var phrase = sentence.Substring(sentenceIdx, phraseLen); phrase = CanonicalPhrase(phrase); if (!phrases.Contains(phrase)) phrases.Add(phrase); } // add reward for each correct phrase that occurs in the sentence // add reward for each decoy phrase that occurs in the sentence var reward = phrases.Intersect(optimalPhrases).Count() * correctReward + phrases.Intersect(decoyPhrases).Count() * decoyReward; return reward; } // TODO: cache canonical phrases in most-recently used dictionary for increased performance (see symbolicregressionpoly10problem) private string CanonicalPhrase(string phrase) { if (phrasesAsSets) return string.Join("", phrase.OrderBy(ch => (byte)ch)); else return phrase; } public string CanonicalRepresentation(string phrase) { // as the ordering of phrases does not matter we can reorder the phrases // and remove duplicates var numPhrases = phrase.Length / phraseLen; var phrases = new SortedSet(); for (int phraseIdx = 0; phraseIdx < numPhrases; phraseIdx++) { var sentenceIdx = phraseIdx * phraseLen; var subphrase = phrase.Substring(sentenceIdx, phraseLen); subphrase = CanonicalPhrase(subphrase); if (!phrases.Contains(subphrase)) phrases.Add(subphrase); } var remainder = phrase.Substring(numPhrases * phraseLen, phrase.Length - (numPhrases * phraseLen)); remainder = CanonicalPhrase(remainder); if (!phrases.Contains(remainder)) phrases.Add(remainder); return string.Join("", phrases); } public IEnumerable GetFeatures(string phrase) { return new Feature[] {new Feature(phrase, 1.0),}; } public override string ToString() { return string.Format("\"FindPhrasesProblem {0} {1} {2} {3:F2} {4} {5:F2} {6}\"", numPhrases, phraseLen, optimalPhrases.Count, correctReward, decoyPhrases.Count, decoyReward, phrasesAsSets); } public IGrammar TreeBasedGPGrammar { get; private set; } public string ConvertTreeToSentence(ISymbolicExpressionTree tree) { var sb = new StringBuilder(); foreach (var s in tree.Root.GetSubtree(0).GetSubtree(0).IterateNodesPrefix()) { if (s.Symbol.Name == "S") continue; sb.Append(s.Symbol.Name); } return sb.ToString(); } } }