using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Text; using System.Text.RegularExpressions; using HeuristicLab.Common; namespace HeuristicLab.Problems.GrammaticalOptimization { // must find a set of phrases where the ordering of phrases is irrelevant // Parameters // - size of the alphabet // - phrase length // - number of phrases in the sequence // - number of optimal phrases // - reward for optimal phrases // - number of decoy (sub-optimal) phrases // - reward for decoy phrases (must be smaller than reward for optimal phrases) public class FindPhrasesProblem : IProblem { private readonly IGrammar grammar; private readonly int alphabetSize; private readonly int numPhrases; private readonly int phraseLen; private readonly int numOptimalPhrases; private readonly int numDecoyPhrases; private readonly double correctReward; private readonly double decoyReward; private readonly SortedSet optimalPhrases; private readonly SortedSet decoyPhrases; public FindPhrasesProblem(Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1, double correctReward = 1.0, double decoyReward = 0.0) { if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException(); if (numPhrases <= 0) throw new ArgumentException(); if (phraseLen < 1) throw new ArgumentException(); if (numOptimalPhrases < numPhrases) throw new ArgumentException(); if (numDecoyPhrases < 0) throw new ArgumentException(); if (correctReward <= decoyReward) throw new ArgumentException(); this.alphabetSize = alphabetSize; this.numPhrases = numPhrases; this.phraseLen = phraseLen; this.correctReward = correctReward; this.decoyReward = decoyReward; // create grammar var sentenceSymbol = 'S'; var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray(); var nonTerminalSymbols = new char[] { 'S' }; var rules = terminalSymbols.Select(t => Tuple.Create('S', t.ToString())) .Concat(terminalSymbols.Select(t => Tuple.Create('S', t + "S"))); this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules); this.optimalPhrasesForPos = new SortedSet[sequenceLen]; for (int i = 0; i < sequenceLen; i++) { optimalPhrasesForPos[i] = new SortedSet(); for (int j = 0; j < k; j++) { string phrase = ""; do { for (int l = 0; l < phraseLen; l++) { phrase += terminalSymbols.SelectRandom(rand); } } while (optimalPhrasesForPos[i].Contains(phrase)); // don't allow duplicate phrases optimalPhrasesForPos[i].Add(phrase); } } Debug.Assert(Evaluate(BestKnownSolution) / BestKnownQuality(phraseLen * sequenceLen) == 1.0); } public double BestKnownQuality(int maxLen) { return Math.Min(maxLen / phraseLen, sequenceLen) * correctReward; // integer division } public string BestKnownSolution { get { string solution = ""; for (int i = 0; i < sequenceLen; i++) { solution += optimalPhrasesForPos[i].First(); } return solution; } } public IGrammar Grammar { get { return grammar; } } public double Evaluate(string sentence) { // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow! Debug.Assert(sentence.Any(c => grammar.IsTerminal(c))); // as long as only correct symbols are found we increase the reward by +1 // on the first incorrect symbol we return var reward = 0.0; for (int i = 0; i < Math.Min(sentence.Length / phraseLen, sequenceLen); i++) { if (optimalPhrasesForPos[i].Contains(sentence.Substring(i * phraseLen, phraseLen))) { reward += correctReward; } else { // alternatively reduce reward by number of remaining phrases return Math.Max(0.0, reward + incorrectReward * (sentence.Length / phraseLen - i)); // stop on first incorrect symbol and return reward //return reward; } } return reward; } public string CanonicalRepresentation(string terminalPhrase) { return terminalPhrase; } } }