Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Problems.GrammaticalOptimization/Grammar.cs @ 13599

Last change on this file since 13599 was 12893, checked in by gkronber, 9 years ago

#2283: experiments on grammatical optimization algorithms (maxreward instead of avg reward, ...)

File size: 12.7 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.IO;
5using System.Linq;
6using System.Text;
7using System.Text.RegularExpressions;
8using HeuristicLab.Common;
9
10namespace HeuristicLab.Problems.GrammaticalOptimization {
11
12  // phrases and symbols are both strings
13  // spaces are used to separate symbols in phases
14  // terminal and non-terminal symbols must not contain white-space
15  // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant
16
17  public class Grammar : IGrammar {
18
19    private readonly Dictionary<char, List<Sequence>> rules;
20    private readonly HashSet<char> terminalSymbols;
21    private readonly char sentenceSymbol;
22    private readonly HashSet<char> nonTerminalSymbols;
23    private readonly Dictionary<Sequence, int> maxPhraseLength = new Dictionary<Sequence, int>();
24    private readonly Dictionary<Sequence, int> minPhraseLength = new Dictionary<Sequence, int>();
25
26    public char SentenceSymbol { get { return sentenceSymbol; } }
27    public IEnumerable<char> NonTerminalSymbols { get { return nonTerminalSymbols; } }
28    public IEnumerable<char> TerminalSymbols { get { return terminalSymbols; } }
29    public IEnumerable<char> Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty
30
31    // cloning ctor
32    public Grammar(Grammar orig) {
33      this.rules = new Dictionary<char, List<Sequence>>();
34      foreach (var r in orig.rules)
35        this.rules.Add(r.Key, new List<Sequence>(r.Value.Select(v => new ReadonlySequence(v)))); // clone sequences
36      this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
37      this.sentenceSymbol = orig.sentenceSymbol;
38      this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
39      this.maxPhraseLength = new Dictionary<Sequence, int>();
40      foreach (var p in orig.maxPhraseLength) this.maxPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
41      this.minPhraseLength = new Dictionary<Sequence, int>();
42      foreach (var p in orig.minPhraseLength) this.minPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
43    }
44
45    public Grammar(string grammar) : this((Grammar)FromString(grammar)) { }
46
47    public Grammar(char sentenceSymbol, IEnumerable<char> terminalSymbols, IEnumerable<char> nonTerminalSymbols, IEnumerable<Tuple<char, string>> rules) {
48      this.sentenceSymbol = sentenceSymbol;
49      this.terminalSymbols = new HashSet<char>(terminalSymbols);
50      this.nonTerminalSymbols = new HashSet<char>(nonTerminalSymbols);
51      this.rules = new Dictionary<char, List<Sequence>>();
52      foreach (var r in rules) {
53        if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<Sequence>());
54        this.rules[r.Item1].Add(new ReadonlySequence(r.Item2)); // here we store an array of symbols for a phase
55      }
56
57      CheckValidity();
58      CalculatePhraseLengthBounds();
59    }
60
61    private void CheckValidity() {
62      // check validity of parameters
63      if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal
64          nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z'
65          terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal
66          rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count ||
67          nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule
68          rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives
69          rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key))
70        // no infinite direct recursions
71        ) {
72        throw new ArgumentException();
73      }
74    }
75
76    private void CalculatePhraseLengthBounds() {
77      // cache phrase lengths
78      foreach (var nt in nonTerminalSymbols) {
79        var min = int.MaxValue;
80        var max = int.MinValue;
81        foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
82          CalcAndSetMinPhraseLength(alt);
83          CalcAndSetMaxPhraseLength(alt);
84
85          min = Math.Min(min, minPhraseLength[alt]);
86          max = Math.Max(max, maxPhraseLength[alt]);
87        }
88        minPhraseLength[new ReadonlySequence(nt)] = min;
89        maxPhraseLength[new ReadonlySequence(nt)] = max;
90      }
91    }
92
93    public IEnumerable<Sequence> GetAlternatives(char nt) {
94      return rules[nt];
95    }
96
97    public IEnumerable<Sequence> GetTerminalAlternatives(char nt) {
98      return GetAlternatives(nt).Where(alt => alt.All(IsTerminal));
99    }
100
101    public IEnumerable<Sequence> GetNonTerminalAlternatives(char nt) {
102      return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal));
103    }
104
105    #region population of minphraselength cache
106    private int CalcAndSetMinSymbolLength(char symb) {
107      if (IsTerminal(symb)) return 1;
108      else return Math.Min(short.MaxValue, rules[symb].Min(alt => CalcAndSetMinPhraseLength(alt))); // maximal allowed value is short.MaxValue
109    }
110    private int CalcAndSetMinPhraseLength(Sequence phrase) {
111      Debug.Assert(phrase is ReadonlySequence);
112      int l;
113      if (minPhraseLength.TryGetValue(phrase, out l)) return l;
114      l = 0;
115      minPhraseLength[phrase] = short.MaxValue; // mark
116
117      foreach (var symb in phrase) {
118        l += CalcAndSetMinSymbolLength(symb);
119      }
120
121      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
122      minPhraseLength[phrase] = l; // update
123      return l;
124    }
125    #endregion
126
127    // read only access to caches
128    private int MinSymbolLength(char symb) {
129      if (IsTerminal(symb)) return 1;
130      else return Math.Min(short.MaxValue, rules[symb].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
131    }
132    public int MinPhraseLength(Sequence phrase) {
133      var minLen = 0;
134      if (minPhraseLength.TryGetValue(phrase, out minLen)) return minLen;
135      foreach (var s in phrase) {
136        minLen += MinSymbolLength(s);
137      }
138      return Math.Min(short.MaxValue, minLen); // maximal allowed value is short.MaxValue
139    }
140
141    #region population of maxphraselength cache
142    private int CalcAndSetMaxSymbolLength(char symb) {
143      if (IsTerminal(symb)) return 1;
144      else return Math.Min(short.MaxValue, rules[symb].Max(alt => CalcAndSetMaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
145    }
146    // caches for this are build in construction of object
147    private int CalcAndSetMaxPhraseLength(Sequence phrase) {
148      Debug.Assert(phrase is ReadonlySequence);
149      int l;
150      if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
151      l = 0;
152      maxPhraseLength[phrase] = short.MaxValue; // mark
153
154      foreach (var symb in phrase) {
155        l += CalcAndSetMaxSymbolLength(symb);
156      }
157      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
158      maxPhraseLength[phrase] = l; // update
159      return l;
160    }
161    #endregion
162
163    // read only access to caches
164    public int MaxSymbolLength(char symb) {
165      if (IsTerminal(symb)) return 1;
166      else return Math.Min(short.MaxValue, rules[symb].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
167    }
168    public int MaxPhraseLength(Sequence phrase) {
169      var maxLen = 0;
170      if (maxPhraseLength.TryGetValue(phrase, out maxLen)) return maxLen;
171      foreach (var s in phrase) {
172        maxLen += MaxSymbolLength(s);
173      }
174      return Math.Min(short.MaxValue, maxLen); // maximal allowed value is short.MaxValue
175    }
176
177    public Sequence CompleteSentenceRandomly(System.Random random, Sequence phrase, int maxLen) {
178      if (phrase.Length > maxLen) throw new ArgumentException();
179      if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
180      while (!phrase.IsTerminal) {
181        char nt = phrase.FirstNonTerminal;
182
183        int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
184        Debug.Assert(maxLenOfReplacement > 0);
185
186
187        IEnumerable<Sequence> alts;
188        if (random.NextDouble() < 0.005) {
189          alts = GetTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
190        } else {
191          alts = GetNonTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
192          if (!alts.Any())
193            alts = GetTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
194        }
195        Debug.Assert(alts.Any());
196
197        // replace nt with random alternative
198        var selectedAlt = alts.SelectRandom(random);
199        phrase.ReplaceAt(phrase.FirstNonTerminalIndex, 1, selectedAlt);
200
201      }
202      return phrase;
203    }
204
205    public bool IsTerminal(char symbol) {
206      return terminalSymbols.Contains(symbol);
207    }
208
209    public bool IsTerminal(string phrase) {
210      // reverse because for our grammars and left-canonical derivation it is more likely that NTs occur near the end of the sequence
211      for (int i = phrase.Length - 1; i >= 0; i--) {
212        if (!IsTerminal(phrase[i])) return false;
213      }
214      return true;
215    }
216
217    public bool IsNonTerminal(char symbol) {
218      return nonTerminalSymbols.Contains(symbol);
219    }
220
221    private static readonly string symbRegex = @"(?<symb>[^\s\|\.])";
222    private static readonly string altRegex = @"(?<alt>[^\s\|\.]+)";
223    private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*";
224    // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]";
225    private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*";
226
227    private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture);
228    private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture);
229
230
231    // supports only meta-symbol for alternatives (syntax of formal grammars)
232    // epsilon-alternatives are not supported
233    public static IGrammar FromString(string grammar) {
234      char sentenceSymbol;
235      var nonTerminals = new List<char>();
236      var terminals = new List<char>();
237      var rules = new List<Tuple<char, string>>();
238
239      using (var reader = new StringReader(grammar)) {
240        var line = reader.ReadLine();
241        while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine();
242        // first line contains "G(...):"
243        var match = sentenceSymbRegex.Match(line);
244        if (!match.Success) throw new ArgumentException();
245        sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group
246
247        line = reader.ReadLine();
248        while (!string.IsNullOrWhiteSpace(line)) {
249          match = ruleRegex.Match(line);
250          if (match.Length != line.Length) throw new ArgumentException();
251          var nt = match.Groups["symb"].Captures[0].Value[0];
252          if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt);
253
254          // contains alternatives?
255          if (match.Groups["alt"].Captures.Count > 0) {
256            foreach (var m in match.Groups["alt"].Captures) {
257              rules.Add(Tuple.Create(nt, m.ToString()));
258            }
259          } else if (match.Groups["symb"].Captures.Count == 3) {
260            // contains terminal range
261            var fromSymb = match.Groups["symb"].Captures[1].Value[0];
262            var toSymb = match.Groups["symb"].Captures[2].Value[0];
263            if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range");
264
265            for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) {
266              rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString()));
267            }
268          }
269
270          line = reader.ReadLine();
271        }
272      }
273
274      foreach (var r in rules) {
275        foreach (var s in r.Item2) {
276          if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s);
277        }
278      }
279      return new Grammar(sentenceSymbol, terminals, nonTerminals, rules);
280    }
281
282    public override string ToString() {
283      // for debugging
284      var sb = new StringBuilder();
285      sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine();
286      foreach (var r in rules) {
287        foreach (var alt in r.Value) {
288          sb.AppendFormat("  {0} -> {1} (min: {2}, max {3})", r.Key, alt, MinPhraseLength(alt), MaxPhraseLength(alt))
289            .AppendLine();
290        }
291      }
292      return sb.ToString();
293    }
294  }
295}
Note: See TracBrowser for help on using the repository browser.