Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization-gkr/HeuristicLab.Problems.GrammaticalOptimization/Grammar.cs @ 12533

Last change on this file since 12533 was 12391, checked in by gkronber, 10 years ago

#2283: added shuffling of terminal symbols to the royal pair problem to make sure that there is no bias from order of terminal symbols.

File size: 12.4 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.IO;
5using System.Linq;
6using System.Text;
7using System.Text.RegularExpressions;
8using HeuristicLab.Common;
9
10namespace HeuristicLab.Problems.GrammaticalOptimization {
11
12  // phrases and symbols are both strings
13  // spaces are used to separate symbols in phases
14  // terminal and non-terminal symbols must not contain white-space
15  // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant
16
17  public class Grammar : IGrammar {
18
19    private readonly Dictionary<char, List<Sequence>> rules;
20    private readonly HashSet<char> terminalSymbols;
21    private readonly char sentenceSymbol;
22    private readonly HashSet<char> nonTerminalSymbols;
23    private readonly Dictionary<Sequence, int> maxPhraseLength = new Dictionary<Sequence, int>();
24    private readonly Dictionary<Sequence, int> minPhraseLength = new Dictionary<Sequence, int>();
25
26    public char SentenceSymbol { get { return sentenceSymbol; } }
27    public IEnumerable<char> NonTerminalSymbols { get { return nonTerminalSymbols; } }
28    public IEnumerable<char> TerminalSymbols { get { return terminalSymbols; } }
29    public IEnumerable<char> Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty
30
31    // cloning ctor
32    public Grammar(Grammar orig) {
33      this.rules = new Dictionary<char, List<Sequence>>();
34      foreach (var r in orig.rules)
35        this.rules.Add(r.Key, new List<Sequence>(r.Value.Select(v => new ReadonlySequence(v)))); // clone sequences
36      this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
37      this.sentenceSymbol = orig.sentenceSymbol;
38      this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
39      this.maxPhraseLength = new Dictionary<Sequence, int>();
40      foreach (var p in orig.maxPhraseLength) this.maxPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
41      this.minPhraseLength = new Dictionary<Sequence, int>();
42      foreach (var p in orig.minPhraseLength) this.minPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
43    }
44
45    public Grammar(string grammar) : this((Grammar)FromString(grammar)) { }
46
47    public Grammar(char sentenceSymbol, IEnumerable<char> terminalSymbols, IEnumerable<char> nonTerminalSymbols, IEnumerable<Tuple<char, string>> rules) {
48      this.sentenceSymbol = sentenceSymbol;
49      this.terminalSymbols = new HashSet<char>(terminalSymbols);
50      this.nonTerminalSymbols = new HashSet<char>(nonTerminalSymbols);
51      this.rules = new Dictionary<char, List<Sequence>>();
52      foreach (var r in rules) {
53        if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<Sequence>());
54        this.rules[r.Item1].Add(new ReadonlySequence(r.Item2)); // here we store an array of symbols for a phase
55      }
56
57      CheckValidity();
58      CalculatePhraseLengthBounds();
59    }
60
61    private void CheckValidity() {
62      // check validity of parameters
63      if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal
64          nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z'
65          terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal
66          rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count ||
67          nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule
68          rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives
69          rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key))
70        // no infinite direct recursions
71        ) {
72        throw new ArgumentException();
73      }
74    }
75
76    private void CalculatePhraseLengthBounds() {
77      // cache phrase lengths
78      foreach (var nt in nonTerminalSymbols) {
79        var min = int.MaxValue;
80        var max = int.MinValue;
81        foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
82          CalcAndSetMinPhraseLength(alt);
83          CalcAndSetMaxPhraseLength(alt);
84
85          min = Math.Min(min, minPhraseLength[alt]);
86          max = Math.Max(max, maxPhraseLength[alt]);
87        }
88        minPhraseLength[new ReadonlySequence(nt)] = min;
89        maxPhraseLength[new ReadonlySequence(nt)] = max;
90      }
91    }
92
93    public IEnumerable<Sequence> GetAlternatives(char nt) {
94      return rules[nt];
95    }
96
97    public IEnumerable<Sequence> GetTerminalAlternatives(char nt) {
98      return GetAlternatives(nt).Where(alt => alt.All(IsTerminal));
99    }
100
101    public IEnumerable<Sequence> GetNonTerminalAlternatives(char nt) {
102      return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal));
103    }
104
105    #region population of minphraselength cache
106    private int CalcAndSetMinSymbolLength(char symb) {
107      if (IsTerminal(symb)) return 1;
108      else return Math.Min(short.MaxValue, rules[symb].Min(alt => CalcAndSetMinPhraseLength(alt))); // maximal allowed value is short.MaxValue
109    }
110    private int CalcAndSetMinPhraseLength(Sequence phrase) {
111      Debug.Assert(phrase is ReadonlySequence);
112      int l;
113      if (minPhraseLength.TryGetValue(phrase, out l)) return l;
114      l = 0;
115      minPhraseLength[phrase] = short.MaxValue; // mark
116
117      foreach (var symb in phrase) {
118        l += CalcAndSetMinSymbolLength(symb);
119      }
120
121      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
122      minPhraseLength[phrase] = l; // update
123      return l;
124    }
125    #endregion
126
127    // read only access to caches
128    private int MinSymbolLength(char symb) {
129      if (IsTerminal(symb)) return 1;
130      else return Math.Min(short.MaxValue, rules[symb].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
131    }
132    public int MinPhraseLength(Sequence phrase) {
133      var minLen = 0;
134      if (minPhraseLength.TryGetValue(phrase, out minLen)) return minLen;
135      foreach (var s in phrase) {
136        minLen += MinSymbolLength(s);
137      }
138      return Math.Min(short.MaxValue, minLen); // maximal allowed value is short.MaxValue
139    }
140
141    #region population of maxphraselength cache
142    private int CalcAndSetMaxSymbolLength(char symb) {
143      if (IsTerminal(symb)) return 1;
144      else return Math.Min(short.MaxValue, rules[symb].Max(alt => CalcAndSetMaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
145    }
146    // caches for this are build in construction of object
147    private int CalcAndSetMaxPhraseLength(Sequence phrase) {
148      Debug.Assert(phrase is ReadonlySequence);
149      int l;
150      if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
151      l = 0;
152      maxPhraseLength[phrase] = short.MaxValue; // mark
153
154      foreach (var symb in phrase) {
155        l += CalcAndSetMaxSymbolLength(symb);
156      }
157      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
158      maxPhraseLength[phrase] = l; // update
159      return l;
160    }
161    #endregion
162
163    // read only access to caches
164    public int MaxSymbolLength(char symb) {
165      if (IsTerminal(symb)) return 1;
166      else return Math.Min(short.MaxValue, rules[symb].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
167    }
168    public int MaxPhraseLength(Sequence phrase) {
169      var maxLen = 0;
170      if (maxPhraseLength.TryGetValue(phrase, out maxLen)) return maxLen;
171      foreach (var s in phrase) {
172        maxLen += MaxSymbolLength(s);
173      }
174      return Math.Min(short.MaxValue, maxLen); // maximal allowed value is short.MaxValue
175    }
176
177    public Sequence CompleteSentenceRandomly(System.Random random, Sequence phrase, int maxLen) {
178      if (phrase.Length > maxLen) throw new ArgumentException();
179      if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
180      while (!phrase.IsTerminal) {
181        char nt = phrase.FirstNonTerminal;
182
183        int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
184        Debug.Assert(maxLenOfReplacement > 0);
185
186        var alts = GetTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
187        Debug.Assert(alts.Any());
188
189        // replace nt with random alternative
190        var selectedAlt = alts.SelectRandom(random);
191        phrase.ReplaceAt(phrase.FirstNonTerminalIndex, 1, selectedAlt);
192
193      }
194      return phrase;
195    }
196
197    public bool IsTerminal(char symbol) {
198      return terminalSymbols.Contains(symbol);
199    }
200
201    public bool IsTerminal(string phrase) {
202      // reverse because for our grammars and left-canonical derivation it is more likely that NTs occur near the end of the sequence
203      for (int i = phrase.Length - 1; i >= 0; i--) {
204        if (!IsTerminal(phrase[i])) return false;
205      }
206      return true;
207    }
208
209    public bool IsNonTerminal(char symbol) {
210      return nonTerminalSymbols.Contains(symbol);
211    }
212
213    private static readonly string symbRegex = @"(?<symb>[^\s\|\.])";
214    private static readonly string altRegex = @"(?<alt>[^\s\|\.]+)";
215    private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*";
216    // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]";
217    private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*";
218
219    private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture);
220    private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture);
221
222
223    // supports only meta-symbol for alternatives (syntax of formal grammars)
224    // epsilon-alternatives are not supported
225    public static IGrammar FromString(string grammar) {
226      char sentenceSymbol;
227      var nonTerminals = new List<char>();
228      var terminals = new List<char>();
229      var rules = new List<Tuple<char, string>>();
230
231      using (var reader = new StringReader(grammar)) {
232        var line = reader.ReadLine();
233        while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine();
234        // first line contains "G(...):"
235        var match = sentenceSymbRegex.Match(line);
236        if (!match.Success) throw new ArgumentException();
237        sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group
238
239        line = reader.ReadLine();
240        while (!string.IsNullOrWhiteSpace(line)) {
241          match = ruleRegex.Match(line);
242          if (match.Length != line.Length) throw new ArgumentException();
243          var nt = match.Groups["symb"].Captures[0].Value[0];
244          if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt);
245
246          // contains alternatives?
247          if (match.Groups["alt"].Captures.Count > 0) {
248            foreach (var m in match.Groups["alt"].Captures) {
249              rules.Add(Tuple.Create(nt, m.ToString()));
250            }
251          } else if (match.Groups["symb"].Captures.Count == 3) {
252            // contains terminal range
253            var fromSymb = match.Groups["symb"].Captures[1].Value[0];
254            var toSymb = match.Groups["symb"].Captures[2].Value[0];
255            if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range");
256
257            for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) {
258              rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString()));
259            }
260          }
261
262          line = reader.ReadLine();
263        }
264      }
265
266      foreach (var r in rules) {
267        foreach (var s in r.Item2) {
268          if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s);
269        }
270      }
271      return new Grammar(sentenceSymbol, terminals, nonTerminals, rules);
272    }
273
274    public override string ToString() {
275      // for debugging
276      var sb = new StringBuilder();
277      sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine();
278      foreach (var r in rules) {
279        foreach (var alt in r.Value) {
280          sb.AppendFormat("  {0} -> {1} (min: {2}, max {3})", r.Key, alt, MinPhraseLength(alt), MaxPhraseLength(alt))
281            .AppendLine();
282        }
283      }
284      return sb.ToString();
285    }
286  }
287}
Note: See TracBrowser for help on using the repository browser.