1 | using System;
|
---|
2 | using System.Collections.Generic;
|
---|
3 | using System.Diagnostics;
|
---|
4 | using System.IO;
|
---|
5 | using System.Linq;
|
---|
6 | using System.Text;
|
---|
7 | using System.Text.RegularExpressions;
|
---|
8 | using HeuristicLab.Common;
|
---|
9 |
|
---|
10 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
11 |
|
---|
12 | // phrases and symbols are both strings
|
---|
13 | // spaces are used to separate symbols in phases
|
---|
14 | // terminal and non-terminal symbols must not contain white-space
|
---|
15 | // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant
|
---|
16 |
|
---|
17 | public class Grammar : IGrammar {
|
---|
18 |
|
---|
19 | private readonly Dictionary<char, List<Sequence>> rules;
|
---|
20 | private readonly HashSet<char> terminalSymbols;
|
---|
21 | private readonly char sentenceSymbol;
|
---|
22 | private readonly HashSet<char> nonTerminalSymbols;
|
---|
23 | private readonly Dictionary<Sequence, int> maxPhraseLength = new Dictionary<Sequence, int>();
|
---|
24 | private readonly Dictionary<Sequence, int> minPhraseLength = new Dictionary<Sequence, int>();
|
---|
25 |
|
---|
26 | public char SentenceSymbol { get { return sentenceSymbol; } }
|
---|
27 | public IEnumerable<char> NonTerminalSymbols { get { return nonTerminalSymbols; } }
|
---|
28 | public IEnumerable<char> TerminalSymbols { get { return terminalSymbols; } }
|
---|
29 | public IEnumerable<char> Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty
|
---|
30 |
|
---|
31 | // cloning ctor
|
---|
32 | public Grammar(Grammar orig) {
|
---|
33 | this.rules = new Dictionary<char, List<Sequence>>();
|
---|
34 | foreach (var r in orig.rules)
|
---|
35 | this.rules.Add(r.Key, new List<Sequence>(r.Value.Select(v => new ReadonlySequence(v)))); // clone sequences
|
---|
36 | this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
|
---|
37 | this.sentenceSymbol = orig.sentenceSymbol;
|
---|
38 | this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
|
---|
39 | this.maxPhraseLength = new Dictionary<Sequence, int>();
|
---|
40 | foreach (var p in orig.maxPhraseLength) this.maxPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
|
---|
41 | this.minPhraseLength = new Dictionary<Sequence, int>();
|
---|
42 | foreach (var p in orig.minPhraseLength) this.minPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
|
---|
43 | }
|
---|
44 |
|
---|
45 | public Grammar(string grammar) : this((Grammar)FromString(grammar)) { }
|
---|
46 |
|
---|
47 | public Grammar(char sentenceSymbol, IEnumerable<char> terminalSymbols, IEnumerable<char> nonTerminalSymbols, IEnumerable<Tuple<char, string>> rules) {
|
---|
48 | this.sentenceSymbol = sentenceSymbol;
|
---|
49 | this.terminalSymbols = new HashSet<char>(terminalSymbols);
|
---|
50 | this.nonTerminalSymbols = new HashSet<char>(nonTerminalSymbols);
|
---|
51 | this.rules = new Dictionary<char, List<Sequence>>();
|
---|
52 | foreach (var r in rules) {
|
---|
53 | if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<Sequence>());
|
---|
54 | this.rules[r.Item1].Add(new ReadonlySequence(r.Item2)); // here we store an array of symbols for a phase
|
---|
55 | }
|
---|
56 |
|
---|
57 | CheckValidity();
|
---|
58 | CalculatePhraseLengthBounds();
|
---|
59 | }
|
---|
60 |
|
---|
61 | private void CheckValidity() {
|
---|
62 | // check validity of parameters
|
---|
63 | if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal
|
---|
64 | nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z'
|
---|
65 | terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal
|
---|
66 | rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count ||
|
---|
67 | nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule
|
---|
68 | rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives
|
---|
69 | rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key))
|
---|
70 | // no infinite direct recursions
|
---|
71 | ) {
|
---|
72 | throw new ArgumentException();
|
---|
73 | }
|
---|
74 | }
|
---|
75 |
|
---|
76 | private void CalculatePhraseLengthBounds() {
|
---|
77 | // cache phrase lengths
|
---|
78 | foreach (var nt in nonTerminalSymbols) {
|
---|
79 | var min = int.MaxValue;
|
---|
80 | var max = int.MinValue;
|
---|
81 | foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
|
---|
82 | CalcAndSetMinPhraseLength(alt);
|
---|
83 | CalcAndSetMaxPhraseLength(alt);
|
---|
84 |
|
---|
85 | min = Math.Min(min, minPhraseLength[alt]);
|
---|
86 | max = Math.Max(max, maxPhraseLength[alt]);
|
---|
87 | }
|
---|
88 | minPhraseLength[new ReadonlySequence(nt)] = min;
|
---|
89 | maxPhraseLength[new ReadonlySequence(nt)] = max;
|
---|
90 | }
|
---|
91 | }
|
---|
92 |
|
---|
93 | public IEnumerable<Sequence> GetAlternatives(char nt) {
|
---|
94 | return rules[nt];
|
---|
95 | }
|
---|
96 |
|
---|
97 | public IEnumerable<Sequence> GetTerminalAlternatives(char nt) {
|
---|
98 | return GetAlternatives(nt).Where(alt => alt.All(IsTerminal));
|
---|
99 | }
|
---|
100 |
|
---|
101 | public IEnumerable<Sequence> GetNonTerminalAlternatives(char nt) {
|
---|
102 | return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal));
|
---|
103 | }
|
---|
104 |
|
---|
105 | #region population of minphraselength cache
|
---|
106 | private int CalcAndSetMinSymbolLength(char symb) {
|
---|
107 | if (IsTerminal(symb)) return 1;
|
---|
108 | else return Math.Min(short.MaxValue, rules[symb].Min(alt => CalcAndSetMinPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
109 | }
|
---|
110 | private int CalcAndSetMinPhraseLength(Sequence phrase) {
|
---|
111 | Debug.Assert(phrase is ReadonlySequence);
|
---|
112 | int l;
|
---|
113 | if (minPhraseLength.TryGetValue(phrase, out l)) return l;
|
---|
114 | l = 0;
|
---|
115 | minPhraseLength[phrase] = short.MaxValue; // mark
|
---|
116 |
|
---|
117 | foreach (var symb in phrase) {
|
---|
118 | l += CalcAndSetMinSymbolLength(symb);
|
---|
119 | }
|
---|
120 |
|
---|
121 | l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
|
---|
122 | minPhraseLength[phrase] = l; // update
|
---|
123 | return l;
|
---|
124 | }
|
---|
125 | #endregion
|
---|
126 |
|
---|
127 | // read only access to caches
|
---|
128 | private int MinSymbolLength(char symb) {
|
---|
129 | if (IsTerminal(symb)) return 1;
|
---|
130 | else return Math.Min(short.MaxValue, rules[symb].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
131 | }
|
---|
132 | public int MinPhraseLength(Sequence phrase) {
|
---|
133 | var minLen = 0;
|
---|
134 | if (minPhraseLength.TryGetValue(phrase, out minLen)) return minLen;
|
---|
135 | foreach (var s in phrase) {
|
---|
136 | minLen += MinSymbolLength(s);
|
---|
137 | }
|
---|
138 | return Math.Min(short.MaxValue, minLen); // maximal allowed value is short.MaxValue
|
---|
139 | }
|
---|
140 |
|
---|
141 | #region population of maxphraselength cache
|
---|
142 | private int CalcAndSetMaxSymbolLength(char symb) {
|
---|
143 | if (IsTerminal(symb)) return 1;
|
---|
144 | else return Math.Min(short.MaxValue, rules[symb].Max(alt => CalcAndSetMaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
145 | }
|
---|
146 | // caches for this are build in construction of object
|
---|
147 | private int CalcAndSetMaxPhraseLength(Sequence phrase) {
|
---|
148 | Debug.Assert(phrase is ReadonlySequence);
|
---|
149 | int l;
|
---|
150 | if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
|
---|
151 | l = 0;
|
---|
152 | maxPhraseLength[phrase] = short.MaxValue; // mark
|
---|
153 |
|
---|
154 | foreach (var symb in phrase) {
|
---|
155 | l += CalcAndSetMaxSymbolLength(symb);
|
---|
156 | }
|
---|
157 | l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
|
---|
158 | maxPhraseLength[phrase] = l; // update
|
---|
159 | return l;
|
---|
160 | }
|
---|
161 | #endregion
|
---|
162 |
|
---|
163 | // read only access to caches
|
---|
164 | public int MaxSymbolLength(char symb) {
|
---|
165 | if (IsTerminal(symb)) return 1;
|
---|
166 | else return Math.Min(short.MaxValue, rules[symb].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
167 | }
|
---|
168 | public int MaxPhraseLength(Sequence phrase) {
|
---|
169 | var maxLen = 0;
|
---|
170 | if (maxPhraseLength.TryGetValue(phrase, out maxLen)) return maxLen;
|
---|
171 | foreach (var s in phrase) {
|
---|
172 | maxLen += MaxSymbolLength(s);
|
---|
173 | }
|
---|
174 | return Math.Min(short.MaxValue, maxLen); // maximal allowed value is short.MaxValue
|
---|
175 | }
|
---|
176 |
|
---|
177 | public Sequence CompleteSentenceRandomly(System.Random random, Sequence phrase, int maxLen) {
|
---|
178 | if (phrase.Length > maxLen) throw new ArgumentException();
|
---|
179 | if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
|
---|
180 | while (!phrase.IsTerminal) {
|
---|
181 | char nt = phrase.FirstNonTerminal;
|
---|
182 |
|
---|
183 | int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
|
---|
184 | Debug.Assert(maxLenOfReplacement > 0);
|
---|
185 |
|
---|
186 |
|
---|
187 | IEnumerable<Sequence> alts;
|
---|
188 | if (random.NextDouble() < 0.005) {
|
---|
189 | alts = GetTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
|
---|
190 | } else {
|
---|
191 | alts = GetNonTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
|
---|
192 | if (!alts.Any())
|
---|
193 | alts = GetTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
|
---|
194 | }
|
---|
195 | Debug.Assert(alts.Any());
|
---|
196 |
|
---|
197 | // replace nt with random alternative
|
---|
198 | var selectedAlt = alts.SelectRandom(random);
|
---|
199 | phrase.ReplaceAt(phrase.FirstNonTerminalIndex, 1, selectedAlt);
|
---|
200 |
|
---|
201 | }
|
---|
202 | return phrase;
|
---|
203 | }
|
---|
204 |
|
---|
205 | public bool IsTerminal(char symbol) {
|
---|
206 | return terminalSymbols.Contains(symbol);
|
---|
207 | }
|
---|
208 |
|
---|
209 | public bool IsTerminal(string phrase) {
|
---|
210 | // reverse because for our grammars and left-canonical derivation it is more likely that NTs occur near the end of the sequence
|
---|
211 | for (int i = phrase.Length - 1; i >= 0; i--) {
|
---|
212 | if (!IsTerminal(phrase[i])) return false;
|
---|
213 | }
|
---|
214 | return true;
|
---|
215 | }
|
---|
216 |
|
---|
217 | public bool IsNonTerminal(char symbol) {
|
---|
218 | return nonTerminalSymbols.Contains(symbol);
|
---|
219 | }
|
---|
220 |
|
---|
221 | private static readonly string symbRegex = @"(?<symb>[^\s\|\.])";
|
---|
222 | private static readonly string altRegex = @"(?<alt>[^\s\|\.]+)";
|
---|
223 | private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*";
|
---|
224 | // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]";
|
---|
225 | private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*";
|
---|
226 |
|
---|
227 | private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture);
|
---|
228 | private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture);
|
---|
229 |
|
---|
230 |
|
---|
231 | // supports only meta-symbol for alternatives (syntax of formal grammars)
|
---|
232 | // epsilon-alternatives are not supported
|
---|
233 | public static IGrammar FromString(string grammar) {
|
---|
234 | char sentenceSymbol;
|
---|
235 | var nonTerminals = new List<char>();
|
---|
236 | var terminals = new List<char>();
|
---|
237 | var rules = new List<Tuple<char, string>>();
|
---|
238 |
|
---|
239 | using (var reader = new StringReader(grammar)) {
|
---|
240 | var line = reader.ReadLine();
|
---|
241 | while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine();
|
---|
242 | // first line contains "G(...):"
|
---|
243 | var match = sentenceSymbRegex.Match(line);
|
---|
244 | if (!match.Success) throw new ArgumentException();
|
---|
245 | sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group
|
---|
246 |
|
---|
247 | line = reader.ReadLine();
|
---|
248 | while (!string.IsNullOrWhiteSpace(line)) {
|
---|
249 | match = ruleRegex.Match(line);
|
---|
250 | if (match.Length != line.Length) throw new ArgumentException();
|
---|
251 | var nt = match.Groups["symb"].Captures[0].Value[0];
|
---|
252 | if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt);
|
---|
253 |
|
---|
254 | // contains alternatives?
|
---|
255 | if (match.Groups["alt"].Captures.Count > 0) {
|
---|
256 | foreach (var m in match.Groups["alt"].Captures) {
|
---|
257 | rules.Add(Tuple.Create(nt, m.ToString()));
|
---|
258 | }
|
---|
259 | } else if (match.Groups["symb"].Captures.Count == 3) {
|
---|
260 | // contains terminal range
|
---|
261 | var fromSymb = match.Groups["symb"].Captures[1].Value[0];
|
---|
262 | var toSymb = match.Groups["symb"].Captures[2].Value[0];
|
---|
263 | if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range");
|
---|
264 |
|
---|
265 | for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) {
|
---|
266 | rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString()));
|
---|
267 | }
|
---|
268 | }
|
---|
269 |
|
---|
270 | line = reader.ReadLine();
|
---|
271 | }
|
---|
272 | }
|
---|
273 |
|
---|
274 | foreach (var r in rules) {
|
---|
275 | foreach (var s in r.Item2) {
|
---|
276 | if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s);
|
---|
277 | }
|
---|
278 | }
|
---|
279 | return new Grammar(sentenceSymbol, terminals, nonTerminals, rules);
|
---|
280 | }
|
---|
281 |
|
---|
282 | public override string ToString() {
|
---|
283 | // for debugging
|
---|
284 | var sb = new StringBuilder();
|
---|
285 | sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine();
|
---|
286 | foreach (var r in rules) {
|
---|
287 | foreach (var alt in r.Value) {
|
---|
288 | sb.AppendFormat(" {0} -> {1} (min: {2}, max {3})", r.Key, alt, MinPhraseLength(alt), MaxPhraseLength(alt))
|
---|
289 | .AppendLine();
|
---|
290 | }
|
---|
291 | }
|
---|
292 | return sb.ToString();
|
---|
293 | }
|
---|
294 | }
|
---|
295 | }
|
---|