Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/Problems/FindPhrasesProblem.cs @ 12099

Last change on this file since 12099 was 12099, checked in by gkronber, 9 years ago

#2283: name for all problems (for output), new unit test, and added testsettings file

File size: 7.7 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using HeuristicLab.Common;
7using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
8
namespace HeuristicLab.Problems.GrammaticalOptimization {
  // The task is to find a set of phrases; the ordering of the phrases within the sentence is irrelevant.
  // Parameters
  // - size of the alphabet
  // - phrase length
  // - number of phrases in the sequence
  // - number of optimal phrases
  // - reward for optimal phrases
  // - number of decoy (sub-optimal) phrases
  // - reward for decoy phrases (must be smaller than reward for optimal phrases)
  // - phrasesAsSets: a switch to determine whether symbols in a phrase can be shuffled (sets) or if the ordering is relevant (non-sets)

  // This problem should be similar to symbolic regression and should be easier for approaches using a state estimation value and the canonical state.
  // When phrases are symbol sets instead of sequences then value-estimation routines should be better (TD).

  /// <summary>
  /// Artificial benchmark problem: a sentence is cut into consecutive, non-overlapping phrases of fixed length and
  /// is rewarded for every distinct optimal phrase and every distinct decoy phrase it contains.
  /// </summary>
  public class FindPhrasesProblem : ISymbolicExpressionTreeProblem {

    private readonly IGrammar grammar;
    private readonly int numPhrases;
    private readonly int phraseLen;
    private readonly double correctReward;
    private readonly double decoyReward;
    private readonly bool phrasesAsSets;
    private readonly SortedSet<string> optimalPhrases; // stored in canonical form
    private readonly SortedSet<string> decoyPhrases;   // stored in canonical form, disjoint from optimalPhrases

    /// <summary>Name of the problem (used for output).</summary>
    public string Name { get { return "FindPhrases"; } }

    /// <summary>
    /// Creates a problem instance with randomly generated optimal and decoy phrases.
    /// </summary>
    /// <param name="rand">Random number generator used to draw the symbols of the phrases.</param>
    /// <param name="alphabetSize">Number of terminal symbols ('a', 'b', ...); must be in 1..26.</param>
    /// <param name="numPhrases">Number of phrases in the best known solution; must be positive.</param>
    /// <param name="phraseLen">Number of symbols per phrase; must be at least 1.</param>
    /// <param name="numOptimalPhrases">Number of distinct optimal phrases to generate; must be at least <paramref name="numPhrases"/>.</param>
    /// <param name="numDecoyPhrases">Number of distinct decoy phrases to generate; must not be negative.</param>
    /// <param name="correctReward">Reward for each distinct optimal phrase; must be larger than <paramref name="decoyReward"/>.</param>
    /// <param name="decoyReward">Reward for each distinct decoy phrase.</param>
    /// <param name="phrasesAsSets">If true the order of symbols within a phrase is irrelevant.</param>
    /// <exception cref="ArgumentNullException"><paramref name="rand"/> is null.</exception>
    /// <exception cref="ArgumentException">An argument is out of range, or more distinct phrases are requested than exist.</exception>
    public FindPhrasesProblem(Random rand, int alphabetSize, int numPhrases, int phraseLen, int numOptimalPhrases, int numDecoyPhrases = 1,
      double correctReward = 1.0, double decoyReward = 0.0, bool phrasesAsSets = false) {
      if (rand == null) throw new ArgumentNullException("rand");
      if (alphabetSize <= 0 || alphabetSize > 26) throw new ArgumentException("The alphabet size must be in the range 1..26.", "alphabetSize");
      if (numPhrases <= 0) throw new ArgumentException("The number of phrases must be positive.", "numPhrases");
      if (phraseLen < 1) throw new ArgumentException("The phrase length must be at least 1.", "phraseLen");
      if (numOptimalPhrases < numPhrases) throw new ArgumentException("The number of optimal phrases must not be smaller than the number of phrases.", "numOptimalPhrases");
      if (numDecoyPhrases < 0) throw new ArgumentException("The number of decoy phrases must not be negative.", "numDecoyPhrases");
      if (correctReward <= decoyReward) throw new ArgumentException("The reward for optimal phrases must be larger than the reward for decoy phrases.", "correctReward");

      // The generation loops below only stop once enough *distinct* phrases were drawn.
      // Reject parameter combinations for which that can never happen instead of looping forever.
      ulong requiredPhrases = (ulong)numOptimalPhrases + (ulong)numDecoyPhrases;
      if (CountDistinctPhrases(alphabetSize, phraseLen, phrasesAsSets, requiredPhrases) < requiredPhrases)
        throw new ArgumentException("The alphabet size and phrase length do not allow enough distinct phrases for the requested number of optimal and decoy phrases.");

      this.numPhrases = numPhrases;
      this.phraseLen = phraseLen;
      this.correctReward = correctReward;
      this.decoyReward = decoyReward;
      this.phrasesAsSets = phrasesAsSets;

      var sentenceSymbol = 'S';
      var terminalSymbols = Enumerable.Range(0, alphabetSize).Select(off => (char)((byte)'a' + off)).ToArray();
      var nonTerminalSymbols = new char[] { sentenceSymbol };

      {
        // create grammar
        // S -> a..z | aS .. zS
        var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString()))
          .Concat(terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t + sentenceSymbol.ToString())));

        this.grammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);
      }
      {
        // create grammar for tree-based GP
        // S -> a..z | SS
        var rules = terminalSymbols.Select(t => Tuple.Create(sentenceSymbol, t.ToString()))
          .Concat(new Tuple<char, string>[] { Tuple.Create(sentenceSymbol, sentenceSymbol.ToString() + sentenceSymbol) });

        this.TreeBasedGPGrammar = new Grammar(sentenceSymbol, terminalSymbols, nonTerminalSymbols, rules);
      }

      // generate optimal phrases; SortedSet.Add ignores duplicates, so we draw until we have enough distinct phrases
      optimalPhrases = new SortedSet<string>();
      while (optimalPhrases.Count < numOptimalPhrases) {
        optimalPhrases.Add(GenerateRandomPhrase(rand, terminalSymbols));
      }

      // generate decoy phrases; a decoy must be neither an optimal phrase nor a duplicate of another decoy
      decoyPhrases = new SortedSet<string>();
      while (decoyPhrases.Count < numDecoyPhrases) {
        var candidate = GenerateRandomPhrase(rand, terminalSymbols);
        if (!optimalPhrases.Contains(candidate)) decoyPhrases.Add(candidate);
      }

      // Self-check: the best known solution must reach the best known quality.
      // Compared directly (not as a ratio) so that a correctReward of 0 does not produce 0/0 = NaN.
      Debug.Assert(Evaluate(BestKnownSolution) == BestKnownQuality(phraseLen * numPhrases));
    }

    /// <summary>
    /// Best quality that is known to be reachable with a sentence of at most <paramref name="maxLen"/> symbols.
    /// </summary>
    /// <param name="maxLen">Maximal sentence length.</param>
    /// <returns>The number of complete phrases that fit into the sentence (at most the configured number of phrases) times the reward for an optimal phrase.</returns>
    public double BestKnownQuality(int maxLen) {
      return Math.Min(maxLen / phraseLen, numPhrases) * correctReward; // integer division
    }

    /// <summary>Concatenation of the first <c>numPhrases</c> optimal phrases (in the order of the sorted set).</summary>
    public string BestKnownSolution {
      get { return string.Join("", optimalPhrases.Take(numPhrases)); }
    }

    /// <summary>Grammar S -> a..z | aS .. zS created in the constructor.</summary>
    public IGrammar Grammar {
      get { return grammar; }
    }

    /// <summary>
    /// Calculates the reward for a sentence.
    /// The sentence is split into consecutive non-overlapping phrases; a trailing incomplete phrase is ignored.
    /// Multiple occurrences of the same phrase are counted only once and the order of phrases is not relevant.
    /// </summary>
    /// <param name="sentence">Sentence to evaluate; must contain only terminal symbols.</param>
    /// <returns>Number of distinct optimal phrases times the optimal reward plus number of distinct decoy phrases times the decoy reward.</returns>
    public double Evaluate(string sentence) {
      // sentence must contain only terminal symbols, we are not checking if the sentence is syntactically valid here because it would be too slow!
      // (All instead of Any: every symbol must be a terminal, and an empty sentence is acceptable)
      Debug.Assert(sentence.All(c => grammar.IsTerminal(c)));

      var phrases = SplitIntoCanonicalPhrases(sentence);

      // add reward for each correct phrase that occurs in the sentence
      // add reward for each decoy phrase that occurs in the sentence
      return phrases.Intersect(optimalPhrases).Count() * correctReward
             + phrases.Intersect(decoyPhrases).Count() * decoyReward;
    }

    /// <summary>
    /// Returns a canonical form of a (partial) sentence: the complete phrases are canonicalized, de-duplicated and sorted
    /// because their order and multiplicity do not matter; the trailing incomplete phrase is canonicalized and kept at the end.
    /// </summary>
    /// <param name="phrase">The (partial) sentence.</param>
    /// <returns>The canonical representation.</returns>
    public string CanonicalRepresentation(string phrase) {
      var completePhrases = SplitIntoCanonicalPhrases(phrase);

      // The incomplete tail must stay at the end. Sorting it in between the complete phrases would shift
      // the phrase boundaries of the result and map different states to the same representation.
      var consumedLength = (phrase.Length / phraseLen) * phraseLen;
      var remainder = CanonicalPhrase(phrase.Substring(consumedLength));

      return string.Join("", completePhrases) + remainder;
    }

    /// <summary>Not implemented for this problem.</summary>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public IEnumerable<Feature> GetFeatures(string phrase) {
      throw new NotImplementedException();
    }

    /// <summary>Returns the problem name together with its configuration (for output).</summary>
    public override string ToString() {
      return string.Format("\"FindPhrasesProblem {0} {1} {2} {3:F2} {4} {5:F2} {6}\"", numPhrases, phraseLen,
        optimalPhrases.Count, correctReward, decoyPhrases.Count, decoyReward, phrasesAsSets);
    }

    /// <summary>Grammar S -> a..z | SS created in the constructor for tree-based GP.</summary>
    public IGrammar TreeBasedGPGrammar { get; private set; }

    /// <summary>
    /// Converts a tree into a sentence by concatenating the symbol names of all nodes (prefix order)
    /// below the first subtree of the first subtree of the root, skipping nodes of the symbol "S".
    /// </summary>
    /// <param name="tree">The tree to convert.</param>
    /// <returns>The sentence encoded by the tree.</returns>
    public string ConvertTreeToSentence(ISymbolicExpressionTree tree) {
      var sb = new StringBuilder();
      foreach (var node in tree.Root.GetSubtree(0).GetSubtree(0).IterateNodesPrefix()) {
        if (node.Symbol.Name != "S") sb.Append(node.Symbol.Name);
      }
      return sb.ToString();
    }

    // TODO: cache canonical phrases in most-recently used dictionary for increased performance (see symbolicregressionpoly10problem)
    // Canonical form of a single phrase: symbols sorted by character code when phrases are sets, the phrase itself otherwise.
    private string CanonicalPhrase(string phrase) {
      if (phrasesAsSets) return string.Join("", phrase.OrderBy(ch => (byte)ch));
      else return phrase;
    }

    // Draws phraseLen random terminal symbols (one draw per symbol) and returns the canonical form of the resulting phrase.
    private string GenerateRandomPhrase(Random rand, char[] terminalSymbols) {
      var sb = new StringBuilder(phraseLen);
      for (int i = 0; i < phraseLen; i++) {
        sb.Append(terminalSymbols.SelectRandom(rand));
      }
      return CanonicalPhrase(sb.ToString());
    }

    // Cuts the text into consecutive chunks of phraseLen symbols and returns the set of their canonical forms;
    // symbols after the last complete chunk are ignored.
    private SortedSet<string> SplitIntoCanonicalPhrases(string text) {
      var phrases = new SortedSet<string>();
      var chunkCount = text.Length / phraseLen;
      for (int chunkIdx = 0; chunkIdx < chunkCount; chunkIdx++) {
        phrases.Add(CanonicalPhrase(text.Substring(chunkIdx * phraseLen, phraseLen)));
      }
      return phrases;
    }

    // Number of distinct canonical phrases, saturated at cap:
    // alphabetSize^phraseLen for sequences, C(alphabetSize + phraseLen - 1, phraseLen) (multisets) for sets.
    // cap is smaller than 2^32 (sum of two non-negative ints) and each factor is smaller than 2^32, so ulong cannot overflow.
    private static ulong CountDistinctPhrases(int alphabetSize, int phraseLen, bool phrasesAsSets, ulong cap) {
      if (alphabetSize == 1) return Math.Min(1UL, cap); // a single symbol yields exactly one phrase for any length

      ulong count = 1;
      for (long len = 1; len <= phraseLen; len++) {
        if (phrasesAsSets) {
          // C(a - 1 + len, len) = C(a - 2 + len, len - 1) * (a - 1 + len) / len; the division is always exact
          count = count * ((ulong)(alphabetSize - 1) + (ulong)len) / (ulong)len;
        } else {
          count *= (ulong)alphabetSize;
        }
        if (count >= cap) return cap;
      }
      return count;
    }
  }
}
Note: See TracBrowser for help on using the repository browser.