Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Problems.GrammaticalOptimization/Problems/FindPhrasesProblem.cs @ 12394

Last change on this file since 12394 was 12391, checked in by gkronber, 10 years ago

#2283: added shuffling of terminal symbols to the royal pair problem to make sure that there is no bias from order of terminal symbols.

File size: 8.1 KB
RevLine 
[11754]1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
[11865]5using System.Text;
[11754]6using HeuristicLab.Common;
[11865]7using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
[11754]8
9namespace HeuristicLab.Problems.GrammaticalOptimization {
10  // must find a set of phrases where the ordering of phrases is irrelevant
11  // Parameters
12  // - size of the alphabet
13  // - phrase length
14  // - number of phrases in the sequence
15  // - number of optimal phrases
16  // - reward for optimal phrases
17  // - number of decoy (sub-optimal) phrases
18  // - reward for decoy phrases (must be smaller than reward for optimal phrases)
[11755]19  // - phrasesAsSets: a switch to determine whether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets)
20
21  // this problem should be similar to symbolic regression and should be easier for approaches using a state estimation value and the canonical state
22  // when phrases are symbol sets instead of sequences then value-estimation routines should be better (TD)
/// <summary>
/// Benchmark problem that rewards a sentence for the distinct target phrases it contains;
/// the order in which the phrases occur in the sentence is irrelevant.
/// </summary>
/// <remarks>
/// A sentence is cut into consecutive, non-overlapping chunks of <c>phraseLen</c> symbols.
/// Each distinct chunk that is an optimal phrase earns <c>correctReward</c>, each distinct
/// chunk that is a decoy phrase earns <c>decoyReward</c>. When <c>phrasesAsSets</c> is set,
/// the symbols of a chunk are sorted before the comparison, so symbol order inside a phrase
/// does not matter either.
/// </remarks>
public class FindPhrasesProblem : ISymbolicExpressionTreeProblem {

  private readonly IGrammar grammar;
  private readonly int numPhrases;
  private readonly int phraseLen;
  private readonly double correctReward;
  private readonly double decoyReward;
  private readonly bool phrasesAsSets;
  private readonly int alphabetSize;
  private readonly int numOptimalPhrases;
  private readonly int numDecoyPhrases;
  // Both sets hold canonical phrases (see CanonicalPhrase) and are ordered ordinally so that
  // the phrase order, and therefore BestKnownSolution, does not depend on the current culture.
  private readonly SortedSet<string> optimalPhrases;
  private readonly SortedSet<string> decoyPhrases;

  /// <summary>Identifier of the problem instance built from all constructor parameters.</summary>
  public string Name { get { return string.Format("FindPhrases({0},{1},{2},{3},{4},{5},{6},{7})", alphabetSize, numPhrases, phraseLen, numOptimalPhrases, numDecoyPhrases, correctReward, decoyReward, phrasesAsSets); } }

  /// <summary>
  /// Creates a problem instance with randomly drawn optimal and decoy phrases.
  /// </summary>
  /// <param name="rand">Random source used to draw the phrases.</param>
  /// <param name="alphabetSize">Number of terminal symbols (1..26, mapped to 'a'..'z').</param>
  /// <param name="numPhrases">Number of phrases in the best known sentence (at least 1).</param>
  /// <param name="phraseLen">Number of symbols per phrase (at least 1).</param>
  /// <param name="numOptimalPhrases">Number of distinct optimal phrases (at least <paramref name="numPhrases"/>).</param>
  /// <param name="numDecoyPhrases">Number of distinct decoy phrases (non-negative).</param>
  /// <param name="correctReward">Reward for each distinct optimal phrase in a sentence.</param>
  /// <param name="decoyReward">Reward for each distinct decoy phrase; must be smaller than <paramref name="correctReward"/>.</param>
  /// <param name="phrasesAsSets">If true, the order of symbols within a phrase is ignored.</param>
  /// <exception cref="ArgumentNullException"><paramref name="rand"/> is null.</exception>
  /// <exception cref="ArgumentException">
  /// A parameter is out of range, or more distinct phrases are requested than can exist for the
  /// given alphabet size and phrase length.
  /// </exception>
  public FindPhrasesProblem(System.Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1,
    double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) {
    if (rand == null) throw new ArgumentNullException("rand");
    if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException("The alphabet size must be in the range 1..26.", "alphabetSize");
    if (numPhrases <= 0) throw new ArgumentException("The number of phrases must be positive.", "numPhrases");
    if (phraseLen < 1) throw new ArgumentException("The phrase length must be at least 1.", "phraseLen");
    if (numOptimalPhrases < numPhrases) throw new ArgumentException("The number of optimal phrases must not be smaller than the number of phrases.", "numOptimalPhrases");
    if (numDecoyPhrases < 0) throw new ArgumentException("The number of decoy phrases must not be negative.", "numDecoyPhrases");
    if (correctReward <= decoyReward) throw new ArgumentException("The reward for optimal phrases must be larger than the reward for decoy phrases.", "correctReward");

    // The generation loops below only stop once enough *distinct* phrases have been drawn,
    // so an unsatisfiable request would spin forever; reject it up front instead.
    long requiredPhrases = (long)numOptimalPhrases + numDecoyPhrases;
    if (CountDistinctPhrases(alphabetSize, phraseLen, phrasesAsSets, requiredPhrases) < requiredPhrases)
      throw new ArgumentException("The alphabet size and phrase length do not allow for the requested number of distinct optimal and decoy phrases.");

    this.numPhrases = numPhrases;
    this.phraseLen = phraseLen;
    this.correctReward = correctReward;
    this.decoyReward = decoyReward;
    this.phrasesAsSets = phrasesAsSets;
    this.alphabetSize = alphabetSize;
    this.numOptimalPhrases = numOptimalPhrases;
    this.numDecoyPhrases = numDecoyPhrases;

    var sentenceSymbol = 'S';
    var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray();
    var nonTerminalSymbols = new char[] { sentenceSymbol };

    // rules shared by both grammars: S -> a..z
    var terminalRules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString())).ToArray();

    // grammar for sequential sentence construction
    // S -> a..z | aS .. zS
    var sequenceRules = terminalRules
      .Concat(terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t + sentenceSymbol.ToString())));
    this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, sequenceRules);

    // grammar for tree-based GP
    // S -> a..z | SS
    var treeRules = terminalRules
      .Concat(new Tuple<char, string>[] { Tuple.Create(sentenceSymbol, sentenceSymbol.ToString() + sentenceSymbol) });
    this.TreeBasedGPGrammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, treeRules);

    // optimal phrases are drawn first; decoys must differ from every optimal phrase
    optimalPhrases = GenerateDistinctPhrases(rand, terminalSymbols, numOptimalPhrases, null);
    decoyPhrases = GenerateDistinctPhrases(rand, terminalSymbols, numDecoyPhrases, optimalPhrases);

    // compare directly instead of dividing: the quotient is NaN when both values are zero
    Debug.Assert(Evaluate(BestKnownSolution) == BestKnownQuality(phraseLen * numPhrases));
  }

  // Number of distinct canonical phrases, capped: counting stops as soon as 'cap' is reached.
  // Ordered phrases: alphabetSize^phraseLen. Phrases as sets (sorted symbols): the number of
  // multisets of size phraseLen, i.e. C(alphabetSize + phraseLen - 1, phraseLen).
  private static long CountDistinctPhrases(int alphabetSize, int phraseLen, bool phrasesAsSets, long cap) {
    if (alphabetSize == 1) return 1; // a single symbol admits exactly one phrase of any length
    long count = 1;
    for (long i = 1; i <= phraseLen && count < cap; i++) {
      // for sets: C(n-1+i, i) = C(n-2+i, i-1) * (n-1+i) / i, the division is always exact
      if (phrasesAsSets) count = count * (alphabetSize - 1L + i) / i;
      else count *= alphabetSize;
    }
    return count;
  }

  // Draws random phrases of length phraseLen until 'count' distinct canonical phrases
  // that are not contained in 'excluded' (may be null) have been collected.
  private SortedSet<string> GenerateDistinctPhrases(System.Random rand, char[] terminalSymbols, int count, SortedSet<string> excluded) {
    var phrases = new SortedSet<string>(StringComparer.Ordinal);
    var sb = new StringBuilder(phraseLen);
    while (phrases.Count < count) {
      sb.Clear();
      for (int l = 0; l < phraseLen; l++) {
        sb.Append(terminalSymbols.SelectRandom(rand));
      }
      var phrase = CanonicalPhrase(sb.ToString());

      // don't allow dups (Add itself ignores phrases that are already in the set)
      if (excluded != null && excluded.Contains(phrase)) continue;
      phrases.Add(phrase);
    }
    return phrases;
  }

  /// <summary>
  /// Best reachable reward for sentences of at most <paramref name="maxLen"/> symbols.
  /// </summary>
  /// <param name="maxLen">Maximal sentence length in symbols.</param>
  /// <returns>The number of complete phrases that fit (at most <c>numPhrases</c>) times the reward for an optimal phrase.</returns>
  public double BestKnownQuality(int maxLen) {
    return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division
  }

  /// <summary>Concatenation of the first <c>numPhrases</c> optimal phrases.</summary>
  public string BestKnownSolution {
    get { return string.Join("", optimalPhrases.Take(numPhrases)); }
  }

  /// <summary>Grammar for sequential sentence construction (S -> a..z | aS .. zS).</summary>
  public IGrammar Grammar {
    get { return grammar; }
  }

  /// <summary>
  /// Calculates the reward of a sentence.
  /// </summary>
  /// <param name="sentence">Sentence of terminal symbols; trailing symbols that do not fill a complete phrase are ignored.</param>
  /// <returns>
  /// <c>correctReward</c> for each distinct optimal phrase plus <c>decoyReward</c> for each
  /// distinct decoy phrase that occurs in the sentence.
  /// </returns>
  public double Evaluate(string sentence) {
    // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow!
    Debug.Assert(sentence.All(c => grammar.IsTerminal(c)));

    // split the sentence in phrases
    // phrases must not overlap in the sentence, multiple occurrences of a phrase are not counted
    // the order of phrases is not relevant
    var seenPhrases = new HashSet<string>();
    int optimalHits = 0;
    int decoyHits = 0;
    int chunkCount = sentence.Length / phraseLen;
    for (int chunkIdx = 0; chunkIdx < chunkCount; chunkIdx++) {
      var phrase = CanonicalPhrase(sentence.Substring(chunkIdx * phraseLen, phraseLen));
      if (!seenPhrases.Add(phrase)) continue; // repeated phrase, already counted

      // the two sets are disjoint by construction, so a phrase hits at most one of them
      if (optimalPhrases.Contains(phrase)) optimalHits++;
      else if (decoyPhrases.Contains(phrase)) decoyHits++;
    }

    // add reward for each correct phrase that occurs in the sentence
    // add reward for each decoy phrase that occurs in the sentence
    return optimalHits * correctReward + decoyHits * decoyReward;
  }

  // TODO: cache canonical phrases in most-recently used dictionary for increased performance (see symbolicregressionpoly10problem)
  // Returns the phrase with its symbols sorted when phrases are treated as sets, otherwise the phrase itself.
  private string CanonicalPhrase(string phrase) {
    if (phrasesAsSets) return string.Join("", phrase.OrderBy(ch => (byte)ch));
    else return phrase;
  }

  /// <summary>
  /// Maps a (possibly incomplete) sentence to a canonical form: complete phrases are
  /// canonicalized, de-duplicated and sorted; the incomplete trailing part is canonicalized
  /// and appended at the end.
  /// </summary>
  /// <param name="phrase">Sentence to canonicalize.</param>
  /// <returns>The canonical form of the sentence.</returns>
  public string CanonicalRepresentation(string phrase) {
    // as the ordering of phrases does not matter we can reorder the phrases
    // and remove duplicates
    int completeCount = phrase.Length / phraseLen;
    var phrases = new SortedSet<string>(StringComparer.Ordinal);
    for (int phraseIdx = 0; phraseIdx < completeCount; phraseIdx++) {
      phrases.Add(CanonicalPhrase(phrase.Substring(phraseIdx * phraseLen, phraseLen)));
    }

    // The incomplete remainder must stay at the end. Sorting it in between the complete
    // phrases shifts the phrase boundaries and makes different sentences collide, e.g. for
    // phraseLen = 2 both {ab,cd}+"b" and {ab,bc}+"d" would be mapped to "abbcd".
    var remainder = CanonicalPhrase(phrase.Substring(completeCount * phraseLen));

    return string.Join("", phrases) + remainder;
  }

  /// <summary>Returns a single feature that is identified by the phrase itself and has the value 1.0.</summary>
  public IEnumerable<Feature> GetFeatures(string phrase) {
    return new Feature[] { new Feature(phrase, 1.0), };
  }

  public override string ToString() {
    return string.Format("\"FindPhrasesProblem {0} {1} {2} {3:F2} {4} {5:F2} {6}\"", numPhrases, phraseLen,
      optimalPhrases.Count, correctReward, decoyPhrases.Count, decoyReward, phrasesAsSets);
  }

  /// <summary>Grammar for tree-based GP (S -> a..z | SS).</summary>
  public IGrammar TreeBasedGPGrammar { get; private set; }

  /// <summary>
  /// Concatenates the symbol names of the tree's nodes in prefix order, skipping the "S" nodes.
  /// </summary>
  /// <param name="tree">Tree to convert; iteration starts at the first subtree of the first subtree of the root.</param>
  /// <returns>The sentence represented by the tree.</returns>
  public string ConvertTreeToSentence(ISymbolicExpressionTree tree) {
    var sb = new StringBuilder();
    foreach (var s in tree.Root.GetSubtree(0).GetSubtree(0).IterateNodesPrefix()) {
      if (s.Symbol.Name == "S") continue;
      sb.Append(s.Symbol.Name);
    }
    return sb.ToString();
  }
}
187}
Note: See TracBrowser for help on using the repository browser.