[11659] | 1 | using System;
|
---|
| 2 | using System.Collections.Generic;
|
---|
| 3 | using System.Diagnostics;
|
---|
| 4 | using System.IO;
|
---|
| 5 | using System.Linq;
|
---|
| 6 | using System.Text;
|
---|
| 7 | using System.Text.RegularExpressions;
|
---|
[11727] | 8 | using HeuristicLab.Common;
|
---|
[11659] | 9 |
|
---|
| 10 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
| 11 |
|
---|
| 12 | // phrases and symbols are both strings
|
---|
| 13 | // spaces are used to separate symbols in phases
|
---|
| 14 | // terminal and non-terminal symbols must not contain white-space
|
---|
| 15 | // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant
|
---|
| 16 |
|
---|
| 17 | public class Grammar : IGrammar {
|
---|
| 18 |
|
---|
[11730] | 19 | private readonly Dictionary<char, List<Sequence>> rules;
|
---|
[11659] | 20 | private readonly HashSet<char> terminalSymbols;
|
---|
| 21 | private readonly char sentenceSymbol;
|
---|
| 22 | private readonly HashSet<char> nonTerminalSymbols;
|
---|
[11730] | 23 | private readonly Dictionary<Sequence, int> maxPhraseLength = new Dictionary<Sequence, int>();
|
---|
| 24 | private readonly Dictionary<Sequence, int> minPhraseLength = new Dictionary<Sequence, int>();
|
---|
[11659] | 25 |
|
---|
| 26 | public char SentenceSymbol { get { return sentenceSymbol; } }
|
---|
| 27 | public IEnumerable<char> NonTerminalSymbols { get { return nonTerminalSymbols; } }
|
---|
| 28 | public IEnumerable<char> TerminalSymbols { get { return terminalSymbols; } }
|
---|
| 29 | public IEnumerable<char> Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty
|
---|
| 30 |
|
---|
| 31 | // cloning ctor
|
---|
| 32 | public Grammar(Grammar orig) {
|
---|
[11730] | 33 | this.rules = new Dictionary<char, List<Sequence>>();
|
---|
| 34 | foreach (var r in orig.rules)
|
---|
[11732] | 35 | this.rules.Add(r.Key, new List<Sequence>(r.Value.Select(v => new ReadonlySequence(v)))); // clone sequences
|
---|
[11659] | 36 | this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
|
---|
| 37 | this.sentenceSymbol = orig.sentenceSymbol;
|
---|
| 38 | this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
|
---|
[11730] | 39 | this.maxPhraseLength = new Dictionary<Sequence, int>();
|
---|
[11732] | 40 | foreach (var p in orig.maxPhraseLength) this.maxPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
|
---|
[11730] | 41 | this.minPhraseLength = new Dictionary<Sequence, int>();
|
---|
[11732] | 42 | foreach (var p in orig.minPhraseLength) this.minPhraseLength.Add(new ReadonlySequence(p.Key), p.Value);
|
---|
[11659] | 43 | }
|
---|
| 44 |
|
---|
| 45 | public Grammar(string grammar) : this((Grammar)FromString(grammar)) { }
|
---|
| 46 |
|
---|
| 47 | public Grammar(char sentenceSymbol, IEnumerable<char> terminalSymbols, IEnumerable<char> nonTerminalSymbols, IEnumerable<Tuple<char, string>> rules) {
|
---|
| 48 | this.sentenceSymbol = sentenceSymbol;
|
---|
| 49 | this.terminalSymbols = new HashSet<char>(terminalSymbols);
|
---|
| 50 | this.nonTerminalSymbols = new HashSet<char>(nonTerminalSymbols);
|
---|
[11730] | 51 | this.rules = new Dictionary<char, List<Sequence>>();
|
---|
[11659] | 52 | foreach (var r in rules) {
|
---|
[11730] | 53 | if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<Sequence>());
|
---|
[11732] | 54 | this.rules[r.Item1].Add(new ReadonlySequence(r.Item2)); // here we store an array of symbols for a phase
|
---|
[11659] | 55 | }
|
---|
| 56 |
|
---|
| 57 | CheckValidity();
|
---|
[11732] | 58 | CalculatePhraseLengthBounds();
|
---|
[11659] | 59 | }
|
---|
| 60 |
|
---|
| 61 | private void CheckValidity() {
|
---|
| 62 | // check validity of parameters
|
---|
| 63 | if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal
|
---|
| 64 | nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z'
|
---|
| 65 | terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal
|
---|
| 66 | rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count ||
|
---|
| 67 | nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule
|
---|
| 68 | rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives
|
---|
| 69 | rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key))
|
---|
| 70 | // no infinite direct recursions
|
---|
| 71 | ) {
|
---|
| 72 | throw new ArgumentException();
|
---|
| 73 | }
|
---|
| 74 | }
|
---|
| 75 |
|
---|
[11732] | 76 | private void CalculatePhraseLengthBounds() {
|
---|
[11659] | 77 | // cache phrase lengths
|
---|
| 78 | foreach (var nt in nonTerminalSymbols) {
|
---|
| 79 | var min = int.MaxValue;
|
---|
| 80 | var max = int.MinValue;
|
---|
| 81 | foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
|
---|
[11732] | 82 | CalcAndSetMinPhraseLength(alt);
|
---|
| 83 | CalcAndSetMaxPhraseLength(alt);
|
---|
[11659] | 84 |
|
---|
| 85 | min = Math.Min(min, minPhraseLength[alt]);
|
---|
| 86 | max = Math.Max(max, maxPhraseLength[alt]);
|
---|
| 87 | }
|
---|
[11732] | 88 | minPhraseLength[new ReadonlySequence(nt)] = min;
|
---|
| 89 | maxPhraseLength[new ReadonlySequence(nt)] = max;
|
---|
[11659] | 90 | }
|
---|
| 91 | }
|
---|
| 92 |
|
---|
[11730] | 93 | public IEnumerable<Sequence> GetAlternatives(char nt) {
|
---|
[11659] | 94 | return rules[nt];
|
---|
| 95 | }
|
---|
| 96 |
|
---|
[11730] | 97 | public IEnumerable<Sequence> GetTerminalAlternatives(char nt) {
|
---|
[11659] | 98 | return GetAlternatives(nt).Where(alt => alt.All(IsTerminal));
|
---|
| 99 | }
|
---|
| 100 |
|
---|
[11730] | 101 | public IEnumerable<Sequence> GetNonTerminalAlternatives(char nt) {
|
---|
[11659] | 102 | return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal));
|
---|
| 103 | }
|
---|
| 104 |
|
---|
[11732] | 105 | #region population of minphraselength cache
|
---|
| 106 | private int CalcAndSetMinSymbolLength(char symb) {
|
---|
| 107 | if (IsTerminal(symb)) return 1;
|
---|
| 108 | else return Math.Min(short.MaxValue, rules[symb].Min(alt => CalcAndSetMinPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
| 109 | }
|
---|
| 110 | private int CalcAndSetMinPhraseLength(Sequence phrase) {
|
---|
| 111 | Debug.Assert(phrase is ReadonlySequence);
|
---|
[11659] | 112 | int l;
|
---|
| 113 | if (minPhraseLength.TryGetValue(phrase, out l)) return l;
|
---|
| 114 | l = 0;
|
---|
| 115 | minPhraseLength[phrase] = short.MaxValue; // mark
|
---|
| 116 |
|
---|
| 117 | foreach (var symb in phrase) {
|
---|
[11732] | 118 | l += CalcAndSetMinSymbolLength(symb);
|
---|
[11659] | 119 | }
|
---|
| 120 |
|
---|
| 121 | l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
|
---|
| 122 | minPhraseLength[phrase] = l; // update
|
---|
| 123 | return l;
|
---|
| 124 | }
|
---|
[11732] | 125 | #endregion
|
---|
[11659] | 126 |
|
---|
[11732] | 127 | // read only access to caches
|
---|
| 128 | private int MinSymbolLength(char symb) {
|
---|
| 129 | if (IsTerminal(symb)) return 1;
|
---|
| 130 | else return Math.Min(short.MaxValue, rules[symb].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
| 131 | }
|
---|
| 132 | public int MinPhraseLength(Sequence phrase) {
|
---|
| 133 | var minLen = 0;
|
---|
| 134 | if (minPhraseLength.TryGetValue(phrase, out minLen)) return minLen;
|
---|
| 135 | foreach (var s in phrase) {
|
---|
| 136 | minLen += MinSymbolLength(s);
|
---|
| 137 | }
|
---|
| 138 | return Math.Min(short.MaxValue, minLen); // maximal allowed value is short.MaxValue
|
---|
| 139 | }
|
---|
| 140 |
|
---|
| 141 | #region population of maxphraselength cache
|
---|
| 142 | private int CalcAndSetMaxSymbolLength(char symb) {
|
---|
| 143 | if (IsTerminal(symb)) return 1;
|
---|
| 144 | else return Math.Min(short.MaxValue, rules[symb].Max(alt => CalcAndSetMaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
| 145 | }
|
---|
[11659] | 146 | // caches for this are build in construction of object
|
---|
[11732] | 147 | private int CalcAndSetMaxPhraseLength(Sequence phrase) {
|
---|
| 148 | Debug.Assert(phrase is ReadonlySequence);
|
---|
[11659] | 149 | int l;
|
---|
| 150 | if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
|
---|
| 151 | l = 0;
|
---|
| 152 | maxPhraseLength[phrase] = short.MaxValue; // mark
|
---|
| 153 |
|
---|
| 154 | foreach (var symb in phrase) {
|
---|
[11732] | 155 | l += CalcAndSetMaxSymbolLength(symb);
|
---|
[11659] | 156 | }
|
---|
| 157 | l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
|
---|
| 158 | maxPhraseLength[phrase] = l; // update
|
---|
| 159 | return l;
|
---|
| 160 | }
|
---|
[11732] | 161 | #endregion
|
---|
[11659] | 162 |
|
---|
[11732] | 163 | // read only access to caches
|
---|
| 164 | public int MaxSymbolLength(char symb) {
|
---|
| 165 | if (IsTerminal(symb)) return 1;
|
---|
| 166 | else return Math.Min(short.MaxValue, rules[symb].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
[11659] | 167 | }
|
---|
[11732] | 168 | public int MaxPhraseLength(Sequence phrase) {
|
---|
| 169 | var maxLen = 0;
|
---|
| 170 | if (maxPhraseLength.TryGetValue(phrase, out maxLen)) return maxLen;
|
---|
| 171 | foreach (var s in phrase) {
|
---|
| 172 | maxLen += MaxSymbolLength(s);
|
---|
| 173 | }
|
---|
| 174 | return Math.Min(short.MaxValue, maxLen); // maximal allowed value is short.MaxValue
|
---|
[11659] | 175 | }
|
---|
| 176 |
|
---|
[12391] | 177 | public Sequence CompleteSentenceRandomly(System.Random random, Sequence phrase, int maxLen) {
|
---|
[11659] | 178 | if (phrase.Length > maxLen) throw new ArgumentException();
|
---|
| 179 | if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
|
---|
[11732] | 180 | while (!phrase.IsTerminal) {
|
---|
[11730] | 181 | char nt = phrase.FirstNonTerminal;
|
---|
[11659] | 182 |
|
---|
| 183 | int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
|
---|
| 184 | Debug.Assert(maxLenOfReplacement > 0);
|
---|
| 185 |
|
---|
[12893] | 186 |
|
---|
| 187 | IEnumerable<Sequence> alts;
|
---|
| 188 | if (random.NextDouble() < 0.005) {
|
---|
| 189 | alts = GetTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
|
---|
| 190 | } else {
|
---|
| 191 | alts = GetNonTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
|
---|
| 192 | if (!alts.Any())
|
---|
| 193 | alts = GetTerminalAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
|
---|
| 194 | }
|
---|
[11659] | 195 | Debug.Assert(alts.Any());
|
---|
| 196 |
|
---|
| 197 | // replace nt with random alternative
|
---|
| 198 | var selectedAlt = alts.SelectRandom(random);
|
---|
[11730] | 199 | phrase.ReplaceAt(phrase.FirstNonTerminalIndex, 1, selectedAlt);
|
---|
[11659] | 200 |
|
---|
| 201 | }
|
---|
| 202 | return phrase;
|
---|
| 203 | }
|
---|
| 204 |
|
---|
| 205 | public bool IsTerminal(char symbol) {
|
---|
| 206 | return terminalSymbols.Contains(symbol);
|
---|
| 207 | }
|
---|
| 208 |
|
---|
[11793] | 209 | public bool IsTerminal(string phrase) {
|
---|
| 210 | // reverse because for our grammars and left-canonical derivation it is more likely that NTs occur near the end of the sequence
|
---|
[11799] | 211 | for (int i = phrase.Length - 1; i >= 0; i--) {
|
---|
| 212 | if (!IsTerminal(phrase[i])) return false;
|
---|
| 213 | }
|
---|
| 214 | return true;
|
---|
[11793] | 215 | }
|
---|
| 216 |
|
---|
[11659] | 217 | public bool IsNonTerminal(char symbol) {
|
---|
| 218 | return nonTerminalSymbols.Contains(symbol);
|
---|
| 219 | }
|
---|
| 220 |
|
---|
| 221 | private static readonly string symbRegex = @"(?<symb>[^\s\|\.])";
|
---|
| 222 | private static readonly string altRegex = @"(?<alt>[^\s\|\.]+)";
|
---|
| 223 | private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*";
|
---|
| 224 | // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]";
|
---|
| 225 | private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*";
|
---|
| 226 |
|
---|
| 227 | private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture);
|
---|
| 228 | private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture);
|
---|
| 229 |
|
---|
| 230 |
|
---|
| 231 | // supports only meta-symbol for alternatives (syntax of formal grammars)
|
---|
| 232 | // epsilon-alternatives are not supported
|
---|
| 233 | public static IGrammar FromString(string grammar) {
|
---|
| 234 | char sentenceSymbol;
|
---|
| 235 | var nonTerminals = new List<char>();
|
---|
| 236 | var terminals = new List<char>();
|
---|
| 237 | var rules = new List<Tuple<char, string>>();
|
---|
| 238 |
|
---|
| 239 | using (var reader = new StringReader(grammar)) {
|
---|
| 240 | var line = reader.ReadLine();
|
---|
| 241 | while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine();
|
---|
| 242 | // first line contains "G(...):"
|
---|
| 243 | var match = sentenceSymbRegex.Match(line);
|
---|
| 244 | if (!match.Success) throw new ArgumentException();
|
---|
| 245 | sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group
|
---|
| 246 |
|
---|
| 247 | line = reader.ReadLine();
|
---|
| 248 | while (!string.IsNullOrWhiteSpace(line)) {
|
---|
| 249 | match = ruleRegex.Match(line);
|
---|
| 250 | if (match.Length != line.Length) throw new ArgumentException();
|
---|
| 251 | var nt = match.Groups["symb"].Captures[0].Value[0];
|
---|
| 252 | if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt);
|
---|
| 253 |
|
---|
| 254 | // contains alternatives?
|
---|
| 255 | if (match.Groups["alt"].Captures.Count > 0) {
|
---|
| 256 | foreach (var m in match.Groups["alt"].Captures) {
|
---|
| 257 | rules.Add(Tuple.Create(nt, m.ToString()));
|
---|
| 258 | }
|
---|
| 259 | } else if (match.Groups["symb"].Captures.Count == 3) {
|
---|
| 260 | // contains terminal range
|
---|
| 261 | var fromSymb = match.Groups["symb"].Captures[1].Value[0];
|
---|
| 262 | var toSymb = match.Groups["symb"].Captures[2].Value[0];
|
---|
| 263 | if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range");
|
---|
| 264 |
|
---|
| 265 | for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) {
|
---|
| 266 | rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString()));
|
---|
| 267 | }
|
---|
| 268 | }
|
---|
| 269 |
|
---|
| 270 | line = reader.ReadLine();
|
---|
| 271 | }
|
---|
| 272 | }
|
---|
| 273 |
|
---|
| 274 | foreach (var r in rules) {
|
---|
| 275 | foreach (var s in r.Item2) {
|
---|
| 276 | if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s);
|
---|
| 277 | }
|
---|
| 278 | }
|
---|
| 279 | return new Grammar(sentenceSymbol, terminals, nonTerminals, rules);
|
---|
| 280 | }
|
---|
| 281 |
|
---|
| 282 | public override string ToString() {
|
---|
| 283 | // for debugging
|
---|
| 284 | var sb = new StringBuilder();
|
---|
| 285 | sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine();
|
---|
| 286 | foreach (var r in rules) {
|
---|
| 287 | foreach (var alt in r.Value) {
|
---|
[11730] | 288 | sb.AppendFormat(" {0} -> {1} (min: {2}, max {3})", r.Key, alt, MinPhraseLength(alt), MaxPhraseLength(alt))
|
---|
[11659] | 289 | .AppendLine();
|
---|
| 290 | }
|
---|
| 291 | }
|
---|
| 292 | return sb.ToString();
|
---|
| 293 | }
|
---|
| 294 | }
|
---|
| 295 | }
|
---|