1 | using System;
|
---|
2 | using System.Collections.Generic;
|
---|
3 | using System.Data;
|
---|
4 | using System.Diagnostics;
|
---|
5 | using System.IO;
|
---|
6 | using System.Linq;
|
---|
7 | using System.Text;
|
---|
8 | using System.Text.RegularExpressions;
|
---|
9 | using System.Xml.Linq;
|
---|
10 | using HeuristicLab.Common;
|
---|
11 |
|
---|
12 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
13 |
|
---|
14 | // phrases and symbols are both strings
|
---|
15 | // spaces are used to separate symbols in phases
|
---|
16 | // terminal and non-terminal symbols must not contain white-space
|
---|
17 | // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant
|
---|
18 |
|
---|
19 | public class Grammar : IGrammar {
|
---|
20 |
|
---|
21 | private readonly Dictionary<char, List<string>> rules;
|
---|
22 | private readonly HashSet<char> terminalSymbols;
|
---|
23 | private readonly char sentenceSymbol;
|
---|
24 | private readonly HashSet<char> nonTerminalSymbols;
|
---|
25 | private readonly Dictionary<string, int> maxPhraseLength = new Dictionary<string, int>();
|
---|
26 | private readonly Dictionary<string, int> minPhraseLength = new Dictionary<string, int>();
|
---|
27 |
|
---|
28 | public char SentenceSymbol { get { return sentenceSymbol; } }
|
---|
29 | public IEnumerable<char> NonTerminalSymbols { get { return nonTerminalSymbols; } }
|
---|
30 | public IEnumerable<char> TerminalSymbols { get { return terminalSymbols; } }
|
---|
31 | public IEnumerable<char> Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty
|
---|
32 |
|
---|
33 | // cloning ctor
|
---|
34 | public Grammar(Grammar orig) {
|
---|
35 | this.rules = new Dictionary<char, List<string>>(orig.rules);
|
---|
36 | this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
|
---|
37 | this.sentenceSymbol = orig.sentenceSymbol;
|
---|
38 | this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
|
---|
39 | this.maxPhraseLength = new Dictionary<string, int>(orig.maxPhraseLength);
|
---|
40 | this.minPhraseLength = new Dictionary<string, int>(orig.minPhraseLength);
|
---|
41 | }
|
---|
42 |
|
---|
43 | public Grammar(string grammar) : this((Grammar)FromString(grammar)) { }
|
---|
44 |
|
---|
45 | public Grammar(char sentenceSymbol, IEnumerable<char> terminalSymbols, IEnumerable<char> nonTerminalSymbols, IEnumerable<Tuple<char, string>> rules) {
|
---|
46 | this.sentenceSymbol = sentenceSymbol;
|
---|
47 | this.terminalSymbols = new HashSet<char>(terminalSymbols);
|
---|
48 | this.nonTerminalSymbols = new HashSet<char>(nonTerminalSymbols);
|
---|
49 | this.rules = new Dictionary<char, List<string>>();
|
---|
50 | foreach (var r in rules) {
|
---|
51 | if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<string>());
|
---|
52 | this.rules[r.Item1].Add(r.Item2); // here we store an array of symbols for a phase
|
---|
53 | }
|
---|
54 |
|
---|
55 | CheckValidity();
|
---|
56 | CalculatePhaseLengthBounds();
|
---|
57 | }
|
---|
58 |
|
---|
59 | private void CheckValidity() {
|
---|
60 | // check validity of parameters
|
---|
61 | if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal
|
---|
62 | nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z'
|
---|
63 | terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal
|
---|
64 | rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count ||
|
---|
65 | nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule
|
---|
66 | rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives
|
---|
67 | rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key))
|
---|
68 | // no infinite direct recursions
|
---|
69 | ) {
|
---|
70 | throw new ArgumentException();
|
---|
71 | }
|
---|
72 | }
|
---|
73 |
|
---|
74 | private void CalculatePhaseLengthBounds() {
|
---|
75 | minPhraseLength.Clear();
|
---|
76 | maxPhraseLength.Clear();
|
---|
77 | // cache phrase lengths
|
---|
78 | foreach (var nt in nonTerminalSymbols) {
|
---|
79 | var min = int.MaxValue;
|
---|
80 | var max = int.MinValue;
|
---|
81 | foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
|
---|
82 | minPhraseLength[alt] = MinPhraseLength(alt);
|
---|
83 | maxPhraseLength[alt] = MaxPhraseLength(alt);
|
---|
84 |
|
---|
85 | min = Math.Min(min, minPhraseLength[alt]);
|
---|
86 | max = Math.Max(max, maxPhraseLength[alt]);
|
---|
87 | }
|
---|
88 | minPhraseLength[nt.ToString()] = min;
|
---|
89 | maxPhraseLength[nt.ToString()] = max;
|
---|
90 | }
|
---|
91 | }
|
---|
92 |
|
---|
93 |
|
---|
94 | public IEnumerable<string> GetAlternatives(char nt) {
|
---|
95 | return rules[nt];
|
---|
96 | }
|
---|
97 |
|
---|
98 | public IEnumerable<string> GetTerminalAlternatives(char nt) {
|
---|
99 | return GetAlternatives(nt).Where(alt => alt.All(IsTerminal));
|
---|
100 | }
|
---|
101 |
|
---|
102 | public IEnumerable<string> GetNonTerminalAlternatives(char nt) {
|
---|
103 | return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal));
|
---|
104 | }
|
---|
105 |
|
---|
106 | // caches for this are build in construction of object
|
---|
107 | public int MinPhraseLength(string phrase) {
|
---|
108 | int l;
|
---|
109 | if (minPhraseLength.TryGetValue(phrase, out l)) return l;
|
---|
110 | l = 0;
|
---|
111 | minPhraseLength[phrase] = short.MaxValue; // mark
|
---|
112 |
|
---|
113 | foreach (var symb in phrase) {
|
---|
114 | if (IsNonTerminal(symb)) {
|
---|
115 | l += MinSymbolLength(symb);
|
---|
116 | } else {
|
---|
117 | l++;
|
---|
118 | }
|
---|
119 | }
|
---|
120 |
|
---|
121 | l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
|
---|
122 | minPhraseLength[phrase] = l; // update
|
---|
123 | return l;
|
---|
124 | }
|
---|
125 |
|
---|
126 | // caches for this are build in construction of object
|
---|
127 | public int MaxPhraseLength(string phrase) {
|
---|
128 | int l;
|
---|
129 | if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
|
---|
130 | l = 0;
|
---|
131 | maxPhraseLength[phrase] = short.MaxValue; // mark
|
---|
132 |
|
---|
133 | foreach (var symb in phrase) {
|
---|
134 | if (IsNonTerminal(symb)) {
|
---|
135 | l += MaxSymbolLength(symb);
|
---|
136 | } else {
|
---|
137 | l++;
|
---|
138 | }
|
---|
139 | }
|
---|
140 | l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
|
---|
141 | maxPhraseLength[phrase] = l; // update
|
---|
142 | return l;
|
---|
143 | }
|
---|
144 |
|
---|
145 | private int MinSymbolLength(char nt) {
|
---|
146 | if (IsTerminal(nt)) return 1;
|
---|
147 | else return Math.Min(short.MaxValue, rules[nt].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
148 | }
|
---|
149 | private int MaxSymbolLength(char nt) {
|
---|
150 | if (IsTerminal(nt)) return 1;
|
---|
151 | else return Math.Min(short.MaxValue, rules[nt].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
152 | }
|
---|
153 |
|
---|
154 | public string CompleteSentenceRandomly(Random random, string phrase, int maxLen) {
|
---|
155 | if (phrase.Length > maxLen) throw new ArgumentException();
|
---|
156 | if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
|
---|
157 | bool done = phrase.All(IsTerminal); // terminal phrase means we are done
|
---|
158 | while (!done) {
|
---|
159 | int ntIdx; char nt;
|
---|
160 | FindFirstNonTerminal(this, phrase, out nt, out ntIdx);
|
---|
161 |
|
---|
162 | int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
|
---|
163 | Debug.Assert(maxLenOfReplacement > 0);
|
---|
164 |
|
---|
165 | var alts = GetAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
|
---|
166 | Debug.Assert(alts.Any());
|
---|
167 |
|
---|
168 | // replace nt with random alternative
|
---|
169 | var selectedAlt = alts.SelectRandom(random);
|
---|
170 | phrase = phrase.Remove(ntIdx, 1);
|
---|
171 | phrase = phrase.Insert(ntIdx, selectedAlt);
|
---|
172 |
|
---|
173 | done = phrase.All(IsTerminal); // terminal phrase means we are done
|
---|
174 | }
|
---|
175 | return phrase;
|
---|
176 | }
|
---|
177 |
|
---|
178 | public static void FindFirstNonTerminal(IGrammar g, string phrase, out char nt, out int ntIdx) {
|
---|
179 | ntIdx = 0;
|
---|
180 | while (ntIdx < phrase.Length && g.IsTerminal(phrase[ntIdx])) ntIdx++;
|
---|
181 | if (ntIdx >= phrase.Length) {
|
---|
182 | ntIdx = -1;
|
---|
183 | nt = '\0';
|
---|
184 | } else {
|
---|
185 | nt = phrase[ntIdx];
|
---|
186 | }
|
---|
187 | }
|
---|
188 |
|
---|
189 | public bool IsTerminal(char symbol) {
|
---|
190 | return terminalSymbols.Contains(symbol);
|
---|
191 | }
|
---|
192 |
|
---|
193 | public bool IsNonTerminal(char symbol) {
|
---|
194 | return nonTerminalSymbols.Contains(symbol);
|
---|
195 | }
|
---|
196 |
|
---|
197 | private static readonly string symbRegex = @"(?<symb>[^\s\|\.])";
|
---|
198 | private static readonly string altRegex = @"(?<alt>[^\s\|\.]+)";
|
---|
199 | private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*";
|
---|
200 | // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]";
|
---|
201 | private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*";
|
---|
202 |
|
---|
203 | private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture);
|
---|
204 | private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture);
|
---|
205 |
|
---|
206 |
|
---|
207 | // supports only meta-symbol for alternatives (syntax of formal grammars)
|
---|
208 | // epsilon-alternatives are not supported
|
---|
209 | public static IGrammar FromString(string grammar) {
|
---|
210 | char sentenceSymbol;
|
---|
211 | var nonTerminals = new List<char>();
|
---|
212 | var terminals = new List<char>();
|
---|
213 | var rules = new List<Tuple<char, string>>();
|
---|
214 |
|
---|
215 | using (var reader = new StringReader(grammar)) {
|
---|
216 | var line = reader.ReadLine();
|
---|
217 | while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine();
|
---|
218 | // first line contains "G(...):"
|
---|
219 | var match = sentenceSymbRegex.Match(line);
|
---|
220 | if (!match.Success) throw new ArgumentException();
|
---|
221 | sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group
|
---|
222 |
|
---|
223 | line = reader.ReadLine();
|
---|
224 | while (!string.IsNullOrWhiteSpace(line)) {
|
---|
225 | match = ruleRegex.Match(line);
|
---|
226 | if (match.Length != line.Length) throw new ArgumentException();
|
---|
227 | var nt = match.Groups["symb"].Captures[0].Value[0];
|
---|
228 | if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt);
|
---|
229 |
|
---|
230 | // contains alternatives?
|
---|
231 | if (match.Groups["alt"].Captures.Count > 0) {
|
---|
232 | foreach (var m in match.Groups["alt"].Captures) {
|
---|
233 | rules.Add(Tuple.Create(nt, m.ToString()));
|
---|
234 | }
|
---|
235 | } else if (match.Groups["symb"].Captures.Count == 3) {
|
---|
236 | // contains terminal range
|
---|
237 | var fromSymb = match.Groups["symb"].Captures[1].Value[0];
|
---|
238 | var toSymb = match.Groups["symb"].Captures[2].Value[0];
|
---|
239 | if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range");
|
---|
240 |
|
---|
241 | for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) {
|
---|
242 | rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString()));
|
---|
243 | }
|
---|
244 | }
|
---|
245 |
|
---|
246 | line = reader.ReadLine();
|
---|
247 | }
|
---|
248 | }
|
---|
249 |
|
---|
250 | foreach (var r in rules) {
|
---|
251 | foreach (var s in r.Item2) {
|
---|
252 | if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s);
|
---|
253 | }
|
---|
254 | }
|
---|
255 | return new Grammar(sentenceSymbol, terminals, nonTerminals, rules);
|
---|
256 | }
|
---|
257 |
|
---|
258 | public override string ToString() {
|
---|
259 | // for debugging
|
---|
260 | var sb = new StringBuilder();
|
---|
261 | sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine();
|
---|
262 | foreach (var r in rules) {
|
---|
263 | foreach (var alt in r.Value) {
|
---|
264 | var phrase = string.Join(" ", alt);
|
---|
265 | sb.AppendFormat(" {0} -> {1} (min: {2}, max {3})", r.Key, phrase, MinPhraseLength(phrase), MaxPhraseLength(phrase))
|
---|
266 | .AppendLine();
|
---|
267 | }
|
---|
268 | }
|
---|
269 | return sb.ToString();
|
---|
270 | }
|
---|
271 | }
|
---|
272 | }
|
---|