Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/Problems/FindPhrasesProblem.cs @ 11851

Last change on this file since 11851 was 11851, checked in by gkronber, 9 years ago

#2283: solution reorganization

File size: 6.6 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using HeuristicLab.Common;
6
7namespace HeuristicLab.Problems.GrammaticalOptimization {
8  // must find a set of phrases where the ordering of phrases is irrelevant
9  // Parameters
10  // - size of the alphabet
11  // - phrase length
12  // - number of phrases in the sequence
13  // - number of optimal phrases
14  // - reward for optimal phrases
15  // - number of decoy (sub-optimal) phrases
16  // - reward for decoy phrases (must be smaller than reward for optimal phrases)
17  // - phrasesAsSets: a switch to determine wether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets)
18
19  // this problem should be similar to symbolic regression and should be easier for approaches using a state esimation value and the canoncial state
20  // when phrases are symbol sets instead of sequences then value-estimation routines should be better (TD)
21  public class FindPhrasesProblem : IProblem {
22
23    private readonly IGrammar grammar;
24    private readonly int numPhrases;
25    private readonly int phraseLen;
26    private readonly double correctReward;
27    private readonly double decoyReward;
28    private readonly bool phrasesAsSets;
29    private readonly SortedSet<string> optimalPhrases;
30    private readonly SortedSet<string> decoyPhrases;
31
32    public FindPhrasesProblem(Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1,
33      double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) {
34      if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException();
35      if (numPhrases <= 0) throw new ArgumentException();
36      if (phraseLen < 1) throw new ArgumentException();
37      if (numOptimalPhrases < numPhrases) throw new ArgumentException();
38      if (numDecoyPhrases < 0) throw new ArgumentException();
39      if (correctReward <= decoyReward) throw new ArgumentException();
40
41      this.numPhrases = numPhrases;
42      this.phraseLen = phraseLen;
43      this.correctReward = correctReward;
44      this.decoyReward = decoyReward;
45      this.phrasesAsSets = phrasesAsSets;
46
47      // create grammar
48      var sentenceSymbol = 'S';
49      var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray();
50      var nonTerminalSymbols = new char[] { 'S' };
51      var rules = terminalSymbols.Select(t => Tuple.Create('S', t.ToString()))
52        .Concat(terminalSymbols.Select(t => Tuple.Create('S', t + "S")));
53
54      this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);
55
56      // generate optimal phrases
57      optimalPhrases = new SortedSet<string>();
58      while (optimalPhrases.Count < numOptimalPhrases) {
59        string phrase = "";
60        for (int l = 0; l < phraseLen; l++) {
61          phrase += terminalSymbols.SelectRandom(rand);
62        }
63        phrase = CanonicalPhrase(phrase);
64
65        // don't allow dups
66        if (!optimalPhrases.Contains(phrase)) optimalPhrases.Add(phrase);
67      }
68
69      // generate decoy phrases
70      decoyPhrases = new SortedSet<string>();
71      while (decoyPhrases.Count < numDecoyPhrases) {
72        string phrase = "";
73        for (int l = 0; l < phraseLen; l++) {
74          phrase += terminalSymbols.SelectRandom(rand);
75        }
76        phrase = CanonicalPhrase(phrase);
77
78        // don't allow dups
79        if (!optimalPhrases.Contains(phrase) && !decoyPhrases.Contains(phrase)) decoyPhrases.Add(phrase);
80      }
81
82      Debug.Assert(Evaluate(BestKnownSolution) / BestKnownQuality(phraseLen * numPhrases) == 1.0);
83    }
84
85    public double BestKnownQuality(int maxLen) {
86      return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division
87    }
88
89    public string BestKnownSolution {
90      get { return string.Join("", optimalPhrases.Take(numPhrases)); }
91    }
92
93    public IGrammar Grammar {
94      get { return grammar; }
95    }
96
97    public double Evaluate(string sentence) {
98      // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow!
99      Debug.Assert(sentence.Any(c => grammar.IsTerminal(c)));
100
101
102      // split the sentence in phrases
103      // phrases must not overlap in the sentence, multiple occurences of a phrase are not counted
104      // the order of phrases is not relevant
105      var numPhrases = sentence.Length / phraseLen;
106      var phrases = new SortedSet<string>();
107      for (int phraseIdx = 0; phraseIdx < numPhrases; phraseIdx++) {
108        var sentenceIdx = phraseIdx * phraseLen;
109        var phrase = sentence.Substring(sentenceIdx, phraseLen);
110        phrase = CanonicalPhrase(phrase);
111        if (!phrases.Contains(phrase)) phrases.Add(phrase);
112      }
113
114      // add reward for each correct phrase that occurs in the sentence
115      // add reward for each decoy phrase that occurs in the sentence
116      var reward = phrases.Intersect(optimalPhrases).Count() * correctReward
117               + phrases.Intersect(decoyPhrases).Count() * decoyReward;
118
119      return reward;
120    }
121
122    // TODO: cache canonical phrases in most-recently used dictionary for increased performance (see symbolicregressionpoly10problem)
123    private string CanonicalPhrase(string phrase) {
124      if (phrasesAsSets) return string.Join("", phrase.OrderBy(ch => (byte)ch));
125      else return phrase;
126    }
127
128    public string CanonicalRepresentation(string phrase) {
129      // as the ordering of phrases does not matter we can reorder the phrases
130      // and remove duplicates
131      var numPhrases = phrase.Length / phraseLen;
132      var phrases = new SortedSet<string>();
133      for (int phraseIdx = 0; phraseIdx < numPhrases; phraseIdx++) {
134        var sentenceIdx = phraseIdx * phraseLen;
135        var subphrase = phrase.Substring(sentenceIdx, phraseLen);
136        subphrase = CanonicalPhrase(subphrase);
137        if (!phrases.Contains(subphrase)) phrases.Add(subphrase);
138      }
139      var remainder = phrase.Substring(numPhrases * phraseLen, phrase.Length - (numPhrases * phraseLen));
140      remainder = CanonicalPhrase(remainder);
141      if (!phrases.Contains(remainder)) phrases.Add(remainder);
142
143      return string.Join("", phrases);
144    }
145
146    public IEnumerable<Feature> GetFeatures(string phrase)
147    {
148      throw new NotImplementedException();
149    }
150
151    public override string ToString() {
152      return string.Format("\"FindPhrasesProblem {0} {1} {2} {3:F2} {4} {5:F2} {6}\"", numPhrases, phraseLen,
153        optimalPhrases.Count, correctReward, decoyPhrases.Count, decoyReward, phrasesAsSets);
154    }
155  }
156}
Note: See TracBrowser for help on using the repository browser.