Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HeuristicLab.Problems.GrammaticalOptimization/HeuristicLab.Problems.GrammaticalOptimization/Grammar.cs @ 11659

Last change on this file since 11659 was 11659, checked in by gkronber, 10 years ago

#2283 initial import of implementation of grammatical optimization problem instances

File size: 11.1 KB
Line 
1using System;
2using System.Collections.Generic;
3using System.Data;
4using System.Diagnostics;
5using System.IO;
6using System.Linq;
7using System.Text;
8using System.Text.RegularExpressions;
9using System.Xml.Linq;
10
11namespace HeuristicLab.Problems.GrammaticalOptimization {
12
13  // phrases and symbols are both strings
14  // spaces are used to separate symbols in phases
15  // terminal and non-terminal symbols must not contain white-space
16  // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant
17
18  public class Grammar : IGrammar {
19
20    private readonly Dictionary<char, List<string>> rules;
21    private readonly HashSet<char> terminalSymbols;
22    private readonly char sentenceSymbol;
23    private readonly HashSet<char> nonTerminalSymbols;
24    private readonly Dictionary<string, int> maxPhraseLength = new Dictionary<string, int>();
25    private readonly Dictionary<string, int> minPhraseLength = new Dictionary<string, int>();
26
27    public char SentenceSymbol { get { return sentenceSymbol; } }
28    public IEnumerable<char> NonTerminalSymbols { get { return nonTerminalSymbols; } }
29    public IEnumerable<char> TerminalSymbols { get { return terminalSymbols; } }
30    public IEnumerable<char> Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty
31
32    // cloning ctor
33    public Grammar(Grammar orig) {
34      this.rules = new Dictionary<char, List<string>>(orig.rules);
35      this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
36      this.sentenceSymbol = orig.sentenceSymbol;
37      this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
38      this.maxPhraseLength = new Dictionary<string, int>(orig.maxPhraseLength);
39      this.minPhraseLength = new Dictionary<string, int>(orig.minPhraseLength);
40    }
41
42    public Grammar(string grammar) : this((Grammar)FromString(grammar)) { }
43
44    public Grammar(char sentenceSymbol, IEnumerable<char> terminalSymbols, IEnumerable<char> nonTerminalSymbols, IEnumerable<Tuple<char, string>> rules) {
45      this.sentenceSymbol = sentenceSymbol;
46      this.terminalSymbols = new HashSet<char>(terminalSymbols);
47      this.nonTerminalSymbols = new HashSet<char>(nonTerminalSymbols);
48      this.rules = new Dictionary<char, List<string>>();
49      foreach (var r in rules) {
50        if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<string>());
51        this.rules[r.Item1].Add(r.Item2); // here we store an array of symbols for a phase
52      }
53
54      CheckValidity();
55      CalculatePhaseLengthBounds();
56    }
57
58    private void CheckValidity() {
59      // check validity of parameters
60      if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal
61          nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z'
62          terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal
63          rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count ||
64          nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule
65          rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives
66          rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key))
67        // no infinite direct recursions
68        ) {
69        throw new ArgumentException();
70      }
71    }
72
73    private void CalculatePhaseLengthBounds() {
74      minPhraseLength.Clear();
75      maxPhraseLength.Clear();
76      // cache phrase lengths
77      foreach (var nt in nonTerminalSymbols) {
78        var min = int.MaxValue;
79        var max = int.MinValue;
80        foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
81          minPhraseLength[alt] = MinPhraseLength(alt);
82          maxPhraseLength[alt] = MaxPhraseLength(alt);
83
84          min = Math.Min(min, minPhraseLength[alt]);
85          max = Math.Max(max, maxPhraseLength[alt]);
86        }
87        minPhraseLength[nt.ToString()] = min;
88        maxPhraseLength[nt.ToString()] = max;
89      }
90    }
91
92
93    public IEnumerable<string> GetAlternatives(char nt) {
94      return rules[nt];
95    }
96
97    public IEnumerable<string> GetTerminalAlternatives(char nt) {
98      return GetAlternatives(nt).Where(alt => alt.All(IsTerminal));
99    }
100
101    public IEnumerable<string> GetNonTerminalAlternatives(char nt) {
102      return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal));
103    }
104
105    // caches for this are build in construction of object
106    public int MinPhraseLength(string phrase) {
107      int l;
108      if (minPhraseLength.TryGetValue(phrase, out l)) return l;
109      l = 0;
110      minPhraseLength[phrase] = short.MaxValue; // mark
111
112      foreach (var symb in phrase) {
113        if (IsNonTerminal(symb)) {
114          l += MinSymbolLength(symb);
115        } else {
116          l++;
117        }
118      }
119
120      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
121      minPhraseLength[phrase] = l; // update
122      return l;
123    }
124
125    // caches for this are build in construction of object
126    public int MaxPhraseLength(string phrase) {
127      int l;
128      if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
129      l = 0;
130      maxPhraseLength[phrase] = short.MaxValue; // mark
131
132      foreach (var symb in phrase) {
133        if (IsNonTerminal(symb)) {
134          l += MaxSymbolLength(symb);
135        } else {
136          l++;
137        }
138      }
139      l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
140      maxPhraseLength[phrase] = l; // update
141      return l;
142    }
143
144    private int MinSymbolLength(char nt) {
145      if (IsTerminal(nt)) return 1;
146      else return Math.Min(short.MaxValue, rules[nt].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
147    }
148    private int MaxSymbolLength(char nt) {
149      if (IsTerminal(nt)) return 1;
150      else return Math.Min(short.MaxValue, rules[nt].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
151    }
152
153    public string CompleteSentenceRandomly(Random random, string phrase, int maxLen) {
154      if (phrase.Length > maxLen) throw new ArgumentException();
155      if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
156      bool done = phrase.All(IsTerminal); // terminal phrase means we are done
157      while (!done) {
158        int ntIdx; char nt;
159        FindFirstNonTerminal(phrase, out nt, out ntIdx);
160
161        int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
162        Debug.Assert(maxLenOfReplacement > 0);
163
164        var alts = GetAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
165        Debug.Assert(alts.Any());
166
167        // replace nt with random alternative
168        var selectedAlt = alts.SelectRandom(random);
169        phrase = phrase.Remove(ntIdx, 1);
170        phrase = phrase.Insert(ntIdx, selectedAlt);
171
172        done = phrase.All(IsTerminal); // terminal phrase means we are done
173      }
174      return phrase;
175    }
176
177    private void FindFirstNonTerminal(string phrase, out char nt, out int ntIdx) {
178      ntIdx = 0;
179      while (ntIdx < phrase.Length && IsTerminal(phrase[ntIdx])) ntIdx++;
180      if (ntIdx >= phrase.Length) {
181        ntIdx = -1;
182        nt = '\0';
183      } else {
184        nt = phrase[ntIdx];
185      }
186    }
187
188    public bool IsTerminal(char symbol) {
189      return terminalSymbols.Contains(symbol);
190    }
191
192    public bool IsNonTerminal(char symbol) {
193      return nonTerminalSymbols.Contains(symbol);
194    }
195
196    private static readonly string symbRegex = @"(?<symb>[^\s\|\.])";
197    private static readonly string altRegex = @"(?<alt>[^\s\|\.]+)";
198    private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*";
199    // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]";
200    private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*";
201
202    private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture);
203    private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture);
204
205
206    // supports only meta-symbol for alternatives (syntax of formal grammars)
207    // epsilon-alternatives are not supported
208    public static IGrammar FromString(string grammar) {
209      char sentenceSymbol;
210      var nonTerminals = new List<char>();
211      var terminals = new List<char>();
212      var rules = new List<Tuple<char, string>>();
213
214      using (var reader = new StringReader(grammar)) {
215        var line = reader.ReadLine();
216        while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine();
217        // first line contains "G(...):"
218        var match = sentenceSymbRegex.Match(line);
219        if (!match.Success) throw new ArgumentException();
220        sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group
221
222        line = reader.ReadLine();
223        while (!string.IsNullOrWhiteSpace(line)) {
224          match = ruleRegex.Match(line);
225          if (match.Length != line.Length) throw new ArgumentException();
226          var nt = match.Groups["symb"].Captures[0].Value[0];
227          if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt);
228
229          // contains alternatives?
230          if (match.Groups["alt"].Captures.Count > 0) {
231            foreach (var m in match.Groups["alt"].Captures) {
232              rules.Add(Tuple.Create(nt, m.ToString()));
233            }
234          } else if (match.Groups["symb"].Captures.Count == 3) {
235            // contains terminal range
236            var fromSymb = match.Groups["symb"].Captures[1].Value[0];
237            var toSymb = match.Groups["symb"].Captures[2].Value[0];
238            if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range");
239
240            for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) {
241              rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString()));
242            }
243          }
244
245          line = reader.ReadLine();
246        }
247      }
248
249      foreach (var r in rules) {
250        foreach (var s in r.Item2) {
251          if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s);
252        }
253      }
254      return new Grammar(sentenceSymbol, terminals, nonTerminals, rules);
255    }
256
257    public override string ToString() {
258      // for debugging
259      var sb = new StringBuilder();
260      sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine();
261      foreach (var r in rules) {
262        foreach (var alt in r.Value) {
263          var phrase = string.Join(" ", alt);
264          sb.AppendFormat("  {0} -> {1} (min: {2}, max {3})", r.Key, phrase, MinPhraseLength(phrase), MaxPhraseLength(phrase))
265            .AppendLine();
266        }
267      }
268      return sb.ToString();
269    }
270  }
271}
Note: See TracBrowser for help on using the repository browser.