Changeset 11755 for branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/FindPhrasesProblem.cs
- Timestamp:
- 01/13/15 20:02:29 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/FindPhrasesProblem.cs
r11754 r11755 1 1 using System; 2 2 using System.Collections.Generic; 3 using System.Data.Odbc; 3 4 using System.Diagnostics; 4 5 using System.Linq; … … 17 18 // - number of decoy (sub-optimal) phrases 18 19 // - reward for decoy phrases (must be smaller than reward for optimal phrases) 20 // - phrasesAsSets: a switch to determine wether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets) 21 22 // this problem should be similar to symbolic regression and should be easier for approaches using a state esimation value and the canoncial state 23 // when phrases are symbol sets instead of sequences then value-estimation routines should be better (TD) 19 24 public class FindPhrasesProblem : IProblem { 20 25 21 26 private readonly IGrammar grammar; 22 private readonly int alphabetSize;23 27 private readonly int numPhrases; 24 28 private readonly int phraseLen; … … 27 31 private readonly double correctReward; 28 32 private readonly double decoyReward; 33 private readonly bool phrasesAsSets; 29 34 private readonly SortedSet<string> optimalPhrases; 30 35 private readonly SortedSet<string> decoyPhrases; 31 36 32 37 public FindPhrasesProblem(Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1, 33 double correctReward = 1.0, double decoyReward = 0.0 ) {38 double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) { 34 39 if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException(); 35 40 if (numPhrases <= 0) throw new ArgumentException(); … … 39 44 if (correctReward <= decoyReward) throw new ArgumentException(); 40 45 41 this.alphabetSize = alphabetSize;42 46 this.numPhrases = numPhrases; 43 47 this.phraseLen = phraseLen; 44 48 this.correctReward = correctReward; 45 49 this.decoyReward = decoyReward; 50 this.phrasesAsSets = phrasesAsSets; 46 51 47 52 // create grammar … … 54 59 this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules); 55 60 56 this.optimalPhrasesForPos = new SortedSet<string>[sequenceLen]; 57 for (int i = 0; i < sequenceLen; i++) { 58 optimalPhrasesForPos[i] = new SortedSet<string>(); 59 for (int j = 0; j < k; j++) { 60 string phrase = ""; 61 do { 62 for (int l = 0; l < phraseLen; l++) { 63 phrase += terminalSymbols.SelectRandom(rand); 64 } 65 } while (optimalPhrasesForPos[i].Contains(phrase)); // don't allow duplicate phrases 66 optimalPhrasesForPos[i].Add(phrase); 61 // generate optimal phrases 62 optimalPhrases = new SortedSet<string>(); 63 while (optimalPhrases.Count < numOptimalPhrases) { 64 string phrase = ""; 65 for (int l = 0; l < phraseLen; l++) { 66 phrase += terminalSymbols.SelectRandom(rand); 67 67 } 68 phrase = CanonicalPhrase(phrase); 69 70 // don't allow dups 71 if (!optimalPhrases.Contains(phrase)) optimalPhrases.Add(phrase); 68 72 } 69 73 70 Debug.Assert(Evaluate(BestKnownSolution) / BestKnownQuality(phraseLen * sequenceLen) == 1.0); 74 // generate decoy phrases 75 decoyPhrases = new SortedSet<string>(); 76 while (decoyPhrases.Count < numDecoyPhrases) { 77 string phrase = ""; 78 for (int l = 0; l < phraseLen; l++) { 79 phrase += terminalSymbols.SelectRandom(rand); 80 } 81 phrase = CanonicalPhrase(phrase); 82 83 // don't allow dups 84 if (!optimalPhrases.Contains(phrase) && !decoyPhrases.Contains(phrase)) decoyPhrases.Add(phrase); 85 } 86 87 Debug.Assert(Evaluate(BestKnownSolution) / BestKnownQuality(phraseLen * numPhrases) == 1.0); 71 88 } 72 89 73 90 public double BestKnownQuality(int maxLen) { 74 return Math.Min(maxLen / phraseLen, sequenceLen) * correctReward; // integer division91 return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division 75 92 } 76 93 77 94 public string BestKnownSolution { 78 get { 79 string solution = ""; 80 for (int i = 0; i < sequenceLen; i++) { 81 solution += optimalPhrasesForPos[i].First(); 82 } 83 return solution; 84 } 95 get { return string.Join("", optimalPhrases.Take(numPhrases)); } 85 96 } 86 97 … … 92 103 // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow! 93 104 Debug.Assert(sentence.Any(c => grammar.IsTerminal(c))); 94 // as long as only correct symbols are found we increase the reward by +1 95 // on the first incorrect symbol we return 96 var reward = 0.0;97 for (int i = 0; i < Math.Min(sentence.Length / phraseLen, sequenceLen); i++) {98 if (optimalPhrasesForPos[i].Contains(sentence.Substring(i * phraseLen, phraseLen))) {99 reward += correctReward;100 } else {101 // alternatively reduce reward by number of remaining phrases102 return Math.Max(0.0, reward + incorrectReward * (sentence.Length / phraseLen - i));103 // stop on first incorrect symbol and return reward104 //return reward;105 }105 106 107 // split the sentence in phrases 108 // phrases must not overlap in the sentence, multiple occurences of a phrase are not counted 109 // the order of phrases is not relevant 110 var numPhrases = sentence.Length / phraseLen; 111 var phrases = new SortedSet<string>(); 112 for (int phraseIdx = 0; phraseIdx < numPhrases; phraseIdx++) { 113 var sentenceIdx = phraseIdx * phraseLen; 114 var phrase = sentence.Substring(sentenceIdx, phraseLen); 115 phrase = CanonicalPhrase(phrase); 116 if (!phrases.Contains(phrase)) phrases.Add(phrase); 106 117 } 118 119 // add reward for each correct phrase that occurs in the sentence 120 // add reward for each decoy phrase that occurs in the sentence 121 var reward = phrases.Intersect(optimalPhrases).Count() * correctReward 122 + phrases.Intersect(decoyPhrases).Count() * decoyReward; 123 124 125 107 126 return reward; 108 127 } 109 128 129 private string CanonicalPhrase(string phrase) { 130 if (phrasesAsSets) return string.Join("", phrase.OrderBy(ch => (byte)ch)); 131 else return phrase; 132 } 133 110 134 public string CanonicalRepresentation(string terminalPhrase) { 111 return terminalPhrase; 135 // as the ordering of phrases does not matter we can reorder the phrases 136 // and remove duplicates 137 var numPhrases = terminalPhrase.Length / phraseLen; 138 var phrases = new SortedSet<string>(); 139 for (int phraseIdx = 0; phraseIdx < numPhrases; phraseIdx++) { 140 var sentenceIdx = phraseIdx * phraseLen; 141 var phrase = terminalPhrase.Substring(sentenceIdx, phraseLen); 142 phrase = CanonicalPhrase(phrase); 143 if (!phrases.Contains(phrase)) phrases.Add(phrase); 144 } 145 return string.Join("", phrases); 112 146 } 113 147 }
Note: See TracChangeset
for help on using the changeset viewer.