Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/Grammar.cs @ 11727

Last change on this file since 11727 was 11727, checked in by gkronber, 9 years ago

#2283: worked on grammatical optimization problem solvers (simple MCTS done)

File size: 11.1 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Data;
4using System.Diagnostics;
5using System.IO;
6using System.Linq;
7using System.Text;
8using System.Text.RegularExpressions;
9using System.Xml.Linq;
10using HeuristicLab.Common;
11
12namespace HeuristicLab.Problems.GrammaticalOptimization {
13
14  // phrases and symbols are both strings
15  // spaces are used to separate symbols in phases
16  // terminal and non-terminal symbols must not contain white-space
17  // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant
18
19  public class Grammar : IGrammar {
20
21    private readonly Dictionary<char, List<string>> rules;
22    private readonly HashSet<char> terminalSymbols;
23    private readonly char sentenceSymbol;
24    private readonly HashSet<char> nonTerminalSymbols;
25    private readonly Dictionary<string, int> maxPhraseLength = new Dictionary<string, int>();
26    private readonly Dictionary<string, int> minPhraseLength = new Dictionary<string, int>();
27
28    public char SentenceSymbol { get { return sentenceSymbol; } }
29    public IEnumerable<char> NonTerminalSymbols { get { return nonTerminalSymbols; } }
30    public IEnumerable<char> TerminalSymbols { get { return terminalSymbols; } }
31    public IEnumerable<char> Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty
32
33    // cloning ctor
34    public Grammar(Grammar orig) {
35      this.rules = new Dictionary<char, List<string>>(orig.rules);
36      this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
37      this.sentenceSymbol = orig.sentenceSymbol;
38      this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
39      this.maxPhraseLength = new Dictionary<string, int>(orig.maxPhraseLength);
40      this.minPhraseLength = new Dictionary<string, int>(orig.minPhraseLength);
41    }
42
43    public Grammar(string grammar) : this((Grammar)FromString(grammar)) { }
44
45    public Grammar(char sentenceSymbol, IEnumerable<char> terminalSymbols, IEnumerable<char> nonTerminalSymbols, IEnumerable<Tuple<char, string>> rules) {
46      this.sentenceSymbol = sentenceSymbol;
47      this.terminalSymbols = new HashSet<char>(terminalSymbols);
48      this.nonTerminalSymbols = new HashSet<char>(nonTerminalSymbols);
49      this.rules = new Dictionary<char, List<string>>();
50      foreach (var r in rules) {
51        if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<string>());
52        this.rules[r.Item1].Add(r.Item2); // here we store an array of symbols for a phase
53      }
54
55      CheckValidity();
56      CalculatePhaseLengthBounds();
57    }
58
59    private void CheckValidity() {
60      // check validity of parameters
61      if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal
62          nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z'
63          terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal
64          rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count ||
65          nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule
66          rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives
67          rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key))
68        // no infinite direct recursions
69        ) {
70        throw new ArgumentException();
71      }
72    }
73
74    private void CalculatePhaseLengthBounds() {
75      minPhraseLength.Clear();
76      maxPhraseLength.Clear();
77      // cache phrase lengths
78      foreach (var nt in nonTerminalSymbols) {
79        var min = int.MaxValue;
80        var max = int.MinValue;
81        foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
82          minPhraseLength[alt] = MinPhraseLength(alt);
83          maxPhraseLength[alt] = MaxPhraseLength(alt);
84
85          min = Math.Min(min, minPhraseLength[alt]);
86          max = Math.Max(max, maxPhraseLength[alt]);
87        }
88        minPhraseLength[nt.ToString()] = min;
89        maxPhraseLength[nt.ToString()] = max;
90      }
91    }
92
93
94    public IEnumerable<string> GetAlternatives(char nt) {
95      return rules[nt];
96    }
97
98    public IEnumerable<string> GetTerminalAlternatives(char nt) {
99      return GetAlternatives(nt).Where(alt => alt.All(IsTerminal));
100    }
101
102    public IEnumerable<string> GetNonTerminalAlternatives(char nt) {
103      return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal));
104    }
105
106    // caches for this are build in construction of object
107    public int MinPhraseLength(string phrase) {
108      int l;
109      if (minPhraseLength.TryGetValue(phrase, out l)) return l;
110      l = 0;
111      minPhraseLength[phrase] = short.MaxValue; // mark
112
113      foreach (var symb in phrase) {
114        if (IsNonTerminal(symb)) {
115          l += MinSymbolLength(symb);
116        } else {
117          l++;
118        }
119      }
120
121      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
122      minPhraseLength[phrase] = l; // update
123      return l;
124    }
125
126    // caches for this are build in construction of object
127    public int MaxPhraseLength(string phrase) {
128      int l;
129      if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
130      l = 0;
131      maxPhraseLength[phrase] = short.MaxValue; // mark
132
133      foreach (var symb in phrase) {
134        if (IsNonTerminal(symb)) {
135          l += MaxSymbolLength(symb);
136        } else {
137          l++;
138        }
139      }
140      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
141      maxPhraseLength[phrase] = l; // update
142      return l;
143    }
144
145    private int MinSymbolLength(char nt) {
146      if (IsTerminal(nt)) return 1;
147      else return Math.Min(short.MaxValue, rules[nt].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
148    }
149    private int MaxSymbolLength(char nt) {
150      if (IsTerminal(nt)) return 1;
151      else return Math.Min(short.MaxValue, rules[nt].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
152    }
153
154    public string CompleteSentenceRandomly(Random random, string phrase, int maxLen) {
155      if (phrase.Length > maxLen) throw new ArgumentException();
156      if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
157      bool done = phrase.All(IsTerminal); // terminal phrase means we are done
158      while (!done) {
159        int ntIdx; char nt;
160        FindFirstNonTerminal(this, phrase, out nt, out ntIdx);
161
162        int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
163        Debug.Assert(maxLenOfReplacement > 0);
164
165        var alts = GetAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
166        Debug.Assert(alts.Any());
167
168        // replace nt with random alternative
169        var selectedAlt = alts.SelectRandom(random);
170        phrase = phrase.Remove(ntIdx, 1);
171        phrase = phrase.Insert(ntIdx, selectedAlt);
172
173        done = phrase.All(IsTerminal); // terminal phrase means we are done
174      }
175      return phrase;
176    }
177
178    public static void FindFirstNonTerminal(IGrammar g, string phrase, out char nt, out int ntIdx) {
179      ntIdx = 0;
180      while (ntIdx < phrase.Length && g.IsTerminal(phrase[ntIdx])) ntIdx++;
181      if (ntIdx >= phrase.Length) {
182        ntIdx = -1;
183        nt = '\0';
184      } else {
185        nt = phrase[ntIdx];
186      }
187    }
188
189    public bool IsTerminal(char symbol) {
190      return terminalSymbols.Contains(symbol);
191    }
192
193    public bool IsNonTerminal(char symbol) {
194      return nonTerminalSymbols.Contains(symbol);
195    }
196
197    private static readonly string symbRegex = @"(?<symb>[^\s\|\.])";
198    private static readonly string altRegex = @"(?<alt>[^\s\|\.]+)";
199    private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*";
200    // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]";
201    private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*";
202
203    private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture);
204    private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture);
205
206
207    // supports only meta-symbol for alternatives (syntax of formal grammars)
208    // epsilon-alternatives are not supported
209    public static IGrammar FromString(string grammar) {
210      char sentenceSymbol;
211      var nonTerminals = new List<char>();
212      var terminals = new List<char>();
213      var rules = new List<Tuple<char, string>>();
214
215      using (var reader = new StringReader(grammar)) {
216        var line = reader.ReadLine();
217        while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine();
218        // first line contains "G(...):"
219        var match = sentenceSymbRegex.Match(line);
220        if (!match.Success) throw new ArgumentException();
221        sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group
222
223        line = reader.ReadLine();
224        while (!string.IsNullOrWhiteSpace(line)) {
225          match = ruleRegex.Match(line);
226          if (match.Length != line.Length) throw new ArgumentException();
227          var nt = match.Groups["symb"].Captures[0].Value[0];
228          if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt);
229
230          // contains alternatives?
231          if (match.Groups["alt"].Captures.Count > 0) {
232            foreach (var m in match.Groups["alt"].Captures) {
233              rules.Add(Tuple.Create(nt, m.ToString()));
234            }
235          } else if (match.Groups["symb"].Captures.Count == 3) {
236            // contains terminal range
237            var fromSymb = match.Groups["symb"].Captures[1].Value[0];
238            var toSymb = match.Groups["symb"].Captures[2].Value[0];
239            if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range");
240
241            for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) {
242              rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString()));
243            }
244          }
245
246          line = reader.ReadLine();
247        }
248      }
249
250      foreach (var r in rules) {
251        foreach (var s in r.Item2) {
252          if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s);
253        }
254      }
255      return new Grammar(sentenceSymbol, terminals, nonTerminals, rules);
256    }
257
258    public override string ToString() {
259      // for debugging
260      var sb = new StringBuilder();
261      sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine();
262      foreach (var r in rules) {
263        foreach (var alt in r.Value) {
264          var phrase = string.Join(" ", alt);
265          sb.AppendFormat("  {0} -> {1} (min: {2}, max {3})", r.Key, phrase, MinPhraseLength(phrase), MaxPhraseLength(phrase))
266            .AppendLine();
267        }
268      }
269      return sb.ToString();
270    }
271  }
272}
Note: See TracBrowser for help on using the repository browser.