[11659] | 1 | using System;
|
---|
| 2 | using System.Collections.Generic;
|
---|
| 3 | using System.Data;
|
---|
| 4 | using System.Diagnostics;
|
---|
| 5 | using System.IO;
|
---|
| 6 | using System.Linq;
|
---|
| 7 | using System.Text;
|
---|
| 8 | using System.Text.RegularExpressions;
|
---|
| 9 | using System.Xml.Linq;
|
---|
[11727] | 10 | using HeuristicLab.Common;
|
---|
[11659] | 11 |
|
---|
| 12 | namespace HeuristicLab.Problems.GrammaticalOptimization {
|
---|
| 13 |
|
---|
| 14 | // phrases and symbols are both strings
|
---|
| 15 | // spaces are used to separate symbols in phases
|
---|
| 16 | // terminal and non-terminal symbols must not contain white-space
|
---|
| 17 | // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant
|
---|
| 18 |
|
---|
| 19 | public class Grammar : IGrammar {
|
---|
| 20 |
|
---|
| 21 | private readonly Dictionary<char, List<string>> rules;
|
---|
| 22 | private readonly HashSet<char> terminalSymbols;
|
---|
| 23 | private readonly char sentenceSymbol;
|
---|
| 24 | private readonly HashSet<char> nonTerminalSymbols;
|
---|
| 25 | private readonly Dictionary<string, int> maxPhraseLength = new Dictionary<string, int>();
|
---|
| 26 | private readonly Dictionary<string, int> minPhraseLength = new Dictionary<string, int>();
|
---|
| 27 |
|
---|
| 28 | public char SentenceSymbol { get { return sentenceSymbol; } }
|
---|
| 29 | public IEnumerable<char> NonTerminalSymbols { get { return nonTerminalSymbols; } }
|
---|
| 30 | public IEnumerable<char> TerminalSymbols { get { return terminalSymbols; } }
|
---|
| 31 | public IEnumerable<char> Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty
|
---|
| 32 |
|
---|
| 33 | // cloning ctor
|
---|
| 34 | public Grammar(Grammar orig) {
|
---|
| 35 | this.rules = new Dictionary<char, List<string>>(orig.rules);
|
---|
| 36 | this.terminalSymbols = new HashSet<char>(orig.terminalSymbols);
|
---|
| 37 | this.sentenceSymbol = orig.sentenceSymbol;
|
---|
| 38 | this.nonTerminalSymbols = new HashSet<char>(orig.nonTerminalSymbols);
|
---|
| 39 | this.maxPhraseLength = new Dictionary<string, int>(orig.maxPhraseLength);
|
---|
| 40 | this.minPhraseLength = new Dictionary<string, int>(orig.minPhraseLength);
|
---|
| 41 | }
|
---|
| 42 |
|
---|
| 43 | public Grammar(string grammar) : this((Grammar)FromString(grammar)) { }
|
---|
| 44 |
|
---|
| 45 | public Grammar(char sentenceSymbol, IEnumerable<char> terminalSymbols, IEnumerable<char> nonTerminalSymbols, IEnumerable<Tuple<char, string>> rules) {
|
---|
| 46 | this.sentenceSymbol = sentenceSymbol;
|
---|
| 47 | this.terminalSymbols = new HashSet<char>(terminalSymbols);
|
---|
| 48 | this.nonTerminalSymbols = new HashSet<char>(nonTerminalSymbols);
|
---|
| 49 | this.rules = new Dictionary<char, List<string>>();
|
---|
| 50 | foreach (var r in rules) {
|
---|
| 51 | if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List<string>());
|
---|
| 52 | this.rules[r.Item1].Add(r.Item2); // here we store an array of symbols for a phase
|
---|
| 53 | }
|
---|
| 54 |
|
---|
| 55 | CheckValidity();
|
---|
| 56 | CalculatePhaseLengthBounds();
|
---|
| 57 | }
|
---|
| 58 |
|
---|
| 59 | private void CheckValidity() {
|
---|
| 60 | // check validity of parameters
|
---|
| 61 | if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal
|
---|
| 62 | nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z'
|
---|
| 63 | terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal
|
---|
| 64 | rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count ||
|
---|
| 65 | nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule
|
---|
| 66 | rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives
|
---|
| 67 | rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key))
|
---|
| 68 | // no infinite direct recursions
|
---|
| 69 | ) {
|
---|
| 70 | throw new ArgumentException();
|
---|
| 71 | }
|
---|
| 72 | }
|
---|
| 73 |
|
---|
| 74 | private void CalculatePhaseLengthBounds() {
|
---|
| 75 | minPhraseLength.Clear();
|
---|
| 76 | maxPhraseLength.Clear();
|
---|
| 77 | // cache phrase lengths
|
---|
| 78 | foreach (var nt in nonTerminalSymbols) {
|
---|
| 79 | var min = int.MaxValue;
|
---|
| 80 | var max = int.MinValue;
|
---|
| 81 | foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) {
|
---|
| 82 | minPhraseLength[alt] = MinPhraseLength(alt);
|
---|
| 83 | maxPhraseLength[alt] = MaxPhraseLength(alt);
|
---|
| 84 |
|
---|
| 85 | min = Math.Min(min, minPhraseLength[alt]);
|
---|
| 86 | max = Math.Max(max, maxPhraseLength[alt]);
|
---|
| 87 | }
|
---|
| 88 | minPhraseLength[nt.ToString()] = min;
|
---|
| 89 | maxPhraseLength[nt.ToString()] = max;
|
---|
| 90 | }
|
---|
| 91 | }
|
---|
| 92 |
|
---|
| 93 |
|
---|
| 94 | public IEnumerable<string> GetAlternatives(char nt) {
|
---|
| 95 | return rules[nt];
|
---|
| 96 | }
|
---|
| 97 |
|
---|
| 98 | public IEnumerable<string> GetTerminalAlternatives(char nt) {
|
---|
| 99 | return GetAlternatives(nt).Where(alt => alt.All(IsTerminal));
|
---|
| 100 | }
|
---|
| 101 |
|
---|
| 102 | public IEnumerable<string> GetNonTerminalAlternatives(char nt) {
|
---|
| 103 | return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal));
|
---|
| 104 | }
|
---|
| 105 |
|
---|
| 106 | // caches for this are build in construction of object
|
---|
| 107 | public int MinPhraseLength(string phrase) {
|
---|
| 108 | int l;
|
---|
| 109 | if (minPhraseLength.TryGetValue(phrase, out l)) return l;
|
---|
| 110 | l = 0;
|
---|
| 111 | minPhraseLength[phrase] = short.MaxValue; // mark
|
---|
| 112 |
|
---|
| 113 | foreach (var symb in phrase) {
|
---|
| 114 | if (IsNonTerminal(symb)) {
|
---|
| 115 | l += MinSymbolLength(symb);
|
---|
| 116 | } else {
|
---|
| 117 | l++;
|
---|
| 118 | }
|
---|
| 119 | }
|
---|
| 120 |
|
---|
| 121 | l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
|
---|
| 122 | minPhraseLength[phrase] = l; // update
|
---|
| 123 | return l;
|
---|
| 124 | }
|
---|
| 125 |
|
---|
| 126 | // caches for this are build in construction of object
|
---|
| 127 | public int MaxPhraseLength(string phrase) {
|
---|
| 128 | int l;
|
---|
| 129 | if (maxPhraseLength.TryGetValue(phrase, out l)) return l;
|
---|
| 130 | l = 0;
|
---|
| 131 | maxPhraseLength[phrase] = short.MaxValue; // mark
|
---|
| 132 |
|
---|
| 133 | foreach (var symb in phrase) {
|
---|
| 134 | if (IsNonTerminal(symb)) {
|
---|
| 135 | l += MaxSymbolLength(symb);
|
---|
| 136 | } else {
|
---|
| 137 | l++;
|
---|
| 138 | }
|
---|
| 139 | }
|
---|
| 140 | l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue
|
---|
| 141 | maxPhraseLength[phrase] = l; // update
|
---|
| 142 | return l;
|
---|
| 143 | }
|
---|
| 144 |
|
---|
| 145 | private int MinSymbolLength(char nt) {
|
---|
| 146 | if (IsTerminal(nt)) return 1;
|
---|
| 147 | else return Math.Min(short.MaxValue, rules[nt].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
| 148 | }
|
---|
| 149 | private int MaxSymbolLength(char nt) {
|
---|
| 150 | if (IsTerminal(nt)) return 1;
|
---|
| 151 | else return Math.Min(short.MaxValue, rules[nt].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue
|
---|
| 152 | }
|
---|
| 153 |
|
---|
| 154 | public string CompleteSentenceRandomly(Random random, string phrase, int maxLen) {
|
---|
| 155 | if (phrase.Length > maxLen) throw new ArgumentException();
|
---|
| 156 | if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException();
|
---|
| 157 | bool done = phrase.All(IsTerminal); // terminal phrase means we are done
|
---|
| 158 | while (!done) {
|
---|
| 159 | int ntIdx; char nt;
|
---|
[11727] | 160 | FindFirstNonTerminal(this, phrase, out nt, out ntIdx);
|
---|
[11659] | 161 |
|
---|
| 162 | int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2
|
---|
| 163 | Debug.Assert(maxLenOfReplacement > 0);
|
---|
| 164 |
|
---|
| 165 | var alts = GetAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement);
|
---|
| 166 | Debug.Assert(alts.Any());
|
---|
| 167 |
|
---|
| 168 | // replace nt with random alternative
|
---|
| 169 | var selectedAlt = alts.SelectRandom(random);
|
---|
| 170 | phrase = phrase.Remove(ntIdx, 1);
|
---|
| 171 | phrase = phrase.Insert(ntIdx, selectedAlt);
|
---|
| 172 |
|
---|
| 173 | done = phrase.All(IsTerminal); // terminal phrase means we are done
|
---|
| 174 | }
|
---|
| 175 | return phrase;
|
---|
| 176 | }
|
---|
| 177 |
|
---|
[11727] | 178 | public static void FindFirstNonTerminal(IGrammar g, string phrase, out char nt, out int ntIdx) {
|
---|
[11659] | 179 | ntIdx = 0;
|
---|
[11727] | 180 | while (ntIdx < phrase.Length && g.IsTerminal(phrase[ntIdx])) ntIdx++;
|
---|
[11659] | 181 | if (ntIdx >= phrase.Length) {
|
---|
| 182 | ntIdx = -1;
|
---|
| 183 | nt = '\0';
|
---|
| 184 | } else {
|
---|
| 185 | nt = phrase[ntIdx];
|
---|
| 186 | }
|
---|
| 187 | }
|
---|
| 188 |
|
---|
| 189 | public bool IsTerminal(char symbol) {
|
---|
| 190 | return terminalSymbols.Contains(symbol);
|
---|
| 191 | }
|
---|
| 192 |
|
---|
| 193 | public bool IsNonTerminal(char symbol) {
|
---|
| 194 | return nonTerminalSymbols.Contains(symbol);
|
---|
| 195 | }
|
---|
| 196 |
|
---|
| 197 | private static readonly string symbRegex = @"(?<symb>[^\s\|\.])";
|
---|
| 198 | private static readonly string altRegex = @"(?<alt>[^\s\|\.]+)";
|
---|
| 199 | private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*";
|
---|
| 200 | // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]";
|
---|
| 201 | private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*";
|
---|
| 202 |
|
---|
| 203 | private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture);
|
---|
| 204 | private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture);
|
---|
| 205 |
|
---|
| 206 |
|
---|
| 207 | // supports only meta-symbol for alternatives (syntax of formal grammars)
|
---|
| 208 | // epsilon-alternatives are not supported
|
---|
| 209 | public static IGrammar FromString(string grammar) {
|
---|
| 210 | char sentenceSymbol;
|
---|
| 211 | var nonTerminals = new List<char>();
|
---|
| 212 | var terminals = new List<char>();
|
---|
| 213 | var rules = new List<Tuple<char, string>>();
|
---|
| 214 |
|
---|
| 215 | using (var reader = new StringReader(grammar)) {
|
---|
| 216 | var line = reader.ReadLine();
|
---|
| 217 | while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine();
|
---|
| 218 | // first line contains "G(...):"
|
---|
| 219 | var match = sentenceSymbRegex.Match(line);
|
---|
| 220 | if (!match.Success) throw new ArgumentException();
|
---|
| 221 | sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group
|
---|
| 222 |
|
---|
| 223 | line = reader.ReadLine();
|
---|
| 224 | while (!string.IsNullOrWhiteSpace(line)) {
|
---|
| 225 | match = ruleRegex.Match(line);
|
---|
| 226 | if (match.Length != line.Length) throw new ArgumentException();
|
---|
| 227 | var nt = match.Groups["symb"].Captures[0].Value[0];
|
---|
| 228 | if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt);
|
---|
| 229 |
|
---|
| 230 | // contains alternatives?
|
---|
| 231 | if (match.Groups["alt"].Captures.Count > 0) {
|
---|
| 232 | foreach (var m in match.Groups["alt"].Captures) {
|
---|
| 233 | rules.Add(Tuple.Create(nt, m.ToString()));
|
---|
| 234 | }
|
---|
| 235 | } else if (match.Groups["symb"].Captures.Count == 3) {
|
---|
| 236 | // contains terminal range
|
---|
| 237 | var fromSymb = match.Groups["symb"].Captures[1].Value[0];
|
---|
| 238 | var toSymb = match.Groups["symb"].Captures[2].Value[0];
|
---|
| 239 | if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range");
|
---|
| 240 |
|
---|
| 241 | for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) {
|
---|
| 242 | rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString()));
|
---|
| 243 | }
|
---|
| 244 | }
|
---|
| 245 |
|
---|
| 246 | line = reader.ReadLine();
|
---|
| 247 | }
|
---|
| 248 | }
|
---|
| 249 |
|
---|
| 250 | foreach (var r in rules) {
|
---|
| 251 | foreach (var s in r.Item2) {
|
---|
| 252 | if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s);
|
---|
| 253 | }
|
---|
| 254 | }
|
---|
| 255 | return new Grammar(sentenceSymbol, terminals, nonTerminals, rules);
|
---|
| 256 | }
|
---|
| 257 |
|
---|
| 258 | public override string ToString() {
|
---|
| 259 | // for debugging
|
---|
| 260 | var sb = new StringBuilder();
|
---|
| 261 | sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine();
|
---|
| 262 | foreach (var r in rules) {
|
---|
| 263 | foreach (var alt in r.Value) {
|
---|
| 264 | var phrase = string.Join(" ", alt);
|
---|
| 265 | sb.AppendFormat(" {0} -> {1} (min: {2}, max {3})", r.Key, phrase, MinPhraseLength(phrase), MaxPhraseLength(phrase))
|
---|
| 266 | .AppendLine();
|
---|
| 267 | }
|
---|
| 268 | }
|
---|
| 269 | return sb.ToString();
|
---|
| 270 | }
|
---|
| 271 | }
|
---|
| 272 | }
|
---|