Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/Problems/FindPhrasesProblem.cs @ 11832

Last change on this file since 11832 was 11832, checked in by gkronber, 8 years ago

linear value function approximation and good results for poly-10 benchmark

File size: 6.7 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Data.Odbc;
4using System.Diagnostics;
5using System.Linq;
6using System.Text;
7using System.Text.RegularExpressions;
8using HeuristicLab.Common;
9
namespace HeuristicLab.Problems.GrammaticalOptimization {
  // must find a set of phrases where the ordering of phrases is irrelevant
  // Parameters
  // - size of the alphabet
  // - phrase length
  // - number of phrases in the sequence
  // - number of optimal phrases
  // - reward for optimal phrases
  // - number of decoy (sub-optimal) phrases
  // - reward for decoy phrases (must be smaller than reward for optimal phrases)
  // - phrasesAsSets: a switch to determine whether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets)

  // this problem should be similar to symbolic regression and should be easier for approaches using a state estimation value and the canonical state
  // when phrases are symbol sets instead of sequences then value-estimation routines should be better (TD)
  public class FindPhrasesProblem : IProblem {

    private readonly IGrammar grammar;
    private readonly int numPhrases;
    private readonly int phraseLen;
    private readonly double correctReward;
    private readonly double decoyReward;
    private readonly bool phrasesAsSets;
    private readonly SortedSet<string> optimalPhrases;
    private readonly SortedSet<string> decoyPhrases;

    // Creates a problem instance with randomly generated optimal and decoy phrases.
    // rand:              source of randomness for phrase generation
    // alphabetSize:      number of terminal symbols ('a', 'b', ...), must be in 1..26
    // numPhrases:        number of phrases in a best-known solution, must be > 0
    // phraseLen:         number of symbols per phrase, must be >= 1
    // numOptimalPhrases: number of distinct optimal phrases to generate, must be >= numPhrases
    // numDecoyPhrases:   number of distinct decoy phrases to generate, must be >= 0
    // correctReward:     reward for each optimal phrase found, must be larger than decoyReward
    // decoyReward:       reward for each decoy phrase found
    // phrasesAsSets:     if true, phrases are compared after sorting their symbols
    // Throws ArgumentException (or the derived ArgumentNullException) for invalid arguments.
    public FindPhrasesProblem(Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1,
      double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) {
      if (rand == null) throw new ArgumentNullException("rand");
      if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException("alphabetSize must be in the range 1..26", "alphabetSize");
      if (numPhrases <= 0) throw new ArgumentException("numPhrases must be positive", "numPhrases");
      if (phraseLen < 1) throw new ArgumentException("phraseLen must be at least 1", "phraseLen");
      if (numOptimalPhrases < numPhrases) throw new ArgumentException("numOptimalPhrases must not be smaller than numPhrases", "numOptimalPhrases");
      if (numDecoyPhrases < 0) throw new ArgumentException("numDecoyPhrases must not be negative", "numDecoyPhrases");
      if (correctReward <= decoyReward) throw new ArgumentException("correctReward must be larger than decoyReward", "correctReward");

      // optimal and decoy phrases are all distinct and are drawn by rejection sampling;
      // the sampling loops below would never terminate if fewer distinct phrases exist than requested
      if (!HasEnoughDistinctPhrases(alphabetSize, phraseLen, phrasesAsSets, (long)numOptimalPhrases + numDecoyPhrases))
        throw new ArgumentException("alphabetSize and phraseLen do not allow the requested number of distinct optimal and decoy phrases");

      this.numPhrases = numPhrases;
      this.phraseLen = phraseLen;
      this.correctReward = correctReward;
      this.decoyReward = decoyReward;
      this.phrasesAsSets = phrasesAsSets;

      // create grammar: S -> t | tS for each terminal symbol t
      var sentenceSymbol = 'S';
      var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray();
      var nonTerminalSymbols = new char[] { 'S' };
      var rules = terminalSymbols.Select(t => Tuple.Create('S', t.ToString()))
        .Concat(terminalSymbols.Select(t => Tuple.Create('S', t + "S")));

      this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);

      // generate optimal phrases (the set silently rejects duplicates)
      optimalPhrases = new SortedSet<string>();
      while (optimalPhrases.Count < numOptimalPhrases) {
        optimalPhrases.Add(CreateRandomPhrase(rand, terminalSymbols));
      }

      // generate decoy phrases (must differ from all optimal phrases and from each other)
      decoyPhrases = new SortedSet<string>();
      while (decoyPhrases.Count < numDecoyPhrases) {
        var phrase = CreateRandomPhrase(rand, terminalSymbols);
        if (!optimalPhrases.Contains(phrase)) decoyPhrases.Add(phrase);
      }

      Debug.Assert(Evaluate(BestKnownSolution) / BestKnownQuality(phraseLen * numPhrases) == 1.0);
    }

    // Returns the reward of the best known solution for sentences of at most maxLen symbols.
    public double BestKnownQuality(int maxLen) {
      return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division
    }

    // Concatenation of the first numPhrases optimal phrases (in the sort order of the phrase set).
    public string BestKnownSolution {
      get { return string.Join("", optimalPhrases.Take(numPhrases)); }
    }

    public IGrammar Grammar {
      get { return grammar; }
    }

    // Returns the total reward for the distinct optimal and decoy phrases that occur in the sentence.
    public double Evaluate(string sentence) {
      // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow!
      Debug.Assert(sentence.All(c => grammar.IsTerminal(c)));

      // split the sentence in phrases
      // phrases must not overlap in the sentence, multiple occurrences of a phrase are not counted
      // the order of phrases is not relevant
      var phrases = SplitIntoCanonicalPhrases(sentence);

      // add reward for each correct phrase that occurs in the sentence
      // add reward for each decoy phrase that occurs in the sentence
      var reward = phrases.Intersect(optimalPhrases).Count() * correctReward
               + phrases.Intersect(decoyPhrases).Count() * decoyReward;

      return reward;
    }

    // TODO: cache canonical phrases in most-recently used dictionary for increased performance (see symbolicregressionpoly10problem)
    // Returns the phrase with its symbols sorted when phrases are treated as sets, otherwise the phrase itself.
    private string CanonicalPhrase(string phrase) {
      if (phrasesAsSets) return string.Join("", phrase.OrderBy(ch => (byte)ch));
      else return phrase;
    }

    // Returns a canonical form of the given (possibly incomplete) sentence:
    // the distinct complete phrases in sorted order followed by the incomplete trailing phrase.
    public string CanonicalRepresentation(string phrase) {
      // as the ordering of phrases does not matter we can reorder the phrases
      // and remove duplicates
      var phrases = SplitIntoCanonicalPhrases(phrase);

      // the incomplete trailing phrase must stay at the end; sorting it in between the complete phrases
      // would shift the phrase boundaries and map sentences with different phrases to the same representation
      var remainderLen = phrase.Length % phraseLen;
      var remainder = CanonicalPhrase(phrase.Substring(phrase.Length - remainderLen, remainderLen));

      return string.Join("", phrases) + remainder;
    }

    public IEnumerable<Feature> GetFeatures(string phrase)
    {
      throw new NotImplementedException();
    }

    public override string ToString() {
      return string.Format("\"FindPhrasesProblem {0} {1} {2} {3:F2} {4} {5:F2} {6}\"", numPhrases, phraseLen,
        optimalPhrases.Count, correctReward, decoyPhrases.Count, decoyReward, phrasesAsSets);
    }

    // Draws phraseLen random terminal symbols and returns the resulting phrase in canonical form.
    private string CreateRandomPhrase(Random rand, char[] terminalSymbols) {
      var sb = new StringBuilder(phraseLen);
      for (int l = 0; l < phraseLen; l++) {
        sb.Append(terminalSymbols.SelectRandom(rand));
      }
      return CanonicalPhrase(sb.ToString());
    }

    // Cuts the sentence into consecutive, non-overlapping phrases of length phraseLen and returns the set of
    // distinct canonical phrases; trailing symbols that do not fill a complete phrase are ignored.
    private SortedSet<string> SplitIntoCanonicalPhrases(string sentence) {
      var phrases = new SortedSet<string>();
      var numCompletePhrases = sentence.Length / phraseLen;
      for (int phraseIdx = 0; phraseIdx < numCompletePhrases; phraseIdx++) {
        phrases.Add(CanonicalPhrase(sentence.Substring(phraseIdx * phraseLen, phraseLen)));
      }
      return phrases;
    }

    // Checks whether at least 'required' distinct canonical phrases of length phraseLen exist.
    // Counting stops as soon as the required number is reached, so no intermediate value can overflow:
    // the running count stays below 2^32 and each factor is below 2^32.
    private static bool HasEnoughDistinctPhrases(int alphabetSize, int phraseLen, bool phrasesAsSets, long required) {
      ulong needed = (ulong)required;
      ulong count = 1;
      if (phrasesAsSets) {
        // number of multisets of size phraseLen over alphabetSize symbols: C(phraseLen + alphabetSize - 1, alphabetSize - 1)
        // built up as C(phraseLen + i, i) for i = 1..alphabetSize - 1; each step is an exact integer and the values never decrease
        for (int i = 1; i < alphabetSize && count < needed; i++) {
          count = count * ((ulong)phraseLen + (ulong)i) / (ulong)i;
        }
      } else if (alphabetSize > 1) {
        // number of sequences: alphabetSize^phraseLen
        for (int l = 0; l < phraseLen && count < needed; l++) {
          count *= (ulong)alphabetSize;
        }
      }
      return count >= needed;
    }
  }
}
Note: See TracBrowser for help on using the repository browser.