using System; using System.Collections.Generic; using System.Data; using System.Diagnostics; using System.IO; using System.Linq; using System.Text; using System.Text.RegularExpressions; using System.Xml.Linq; using HeuristicLab.Common; namespace HeuristicLab.Problems.GrammaticalOptimization { // phrases and symbols are both strings // spaces are used to separate symbols in phases // terminal and non-terminal symbols must not contain white-space // white-spaces are not counted when determining the length of a sentence, only the number of terminal symbols is relevant public class Grammar : IGrammar { private readonly Dictionary> rules; private readonly HashSet terminalSymbols; private readonly char sentenceSymbol; private readonly HashSet nonTerminalSymbols; private readonly Dictionary maxPhraseLength = new Dictionary(); private readonly Dictionary minPhraseLength = new Dictionary(); public char SentenceSymbol { get { return sentenceSymbol; } } public IEnumerable NonTerminalSymbols { get { return nonTerminalSymbols; } } public IEnumerable TerminalSymbols { get { return terminalSymbols; } } public IEnumerable Symbols { get { return nonTerminalSymbols.Concat(terminalSymbols); } } // ctor guarantees that intersection is empty // cloning ctor public Grammar(Grammar orig) { this.rules = new Dictionary>(); foreach (var r in orig.rules) this.rules.Add(r.Key, new List(r.Value.Select(v => new ReadonlySequence(v)))); // clone sequences this.terminalSymbols = new HashSet(orig.terminalSymbols); this.sentenceSymbol = orig.sentenceSymbol; this.nonTerminalSymbols = new HashSet(orig.nonTerminalSymbols); this.maxPhraseLength = new Dictionary(); foreach (var p in orig.maxPhraseLength) this.maxPhraseLength.Add(new ReadonlySequence(p.Key), p.Value); this.minPhraseLength = new Dictionary(); foreach (var p in orig.minPhraseLength) this.minPhraseLength.Add(new ReadonlySequence(p.Key), p.Value); } public Grammar(string grammar) : this((Grammar)FromString(grammar)) { } public Grammar(char sentenceSymbol, IEnumerable terminalSymbols, IEnumerable nonTerminalSymbols, IEnumerable> rules) { this.sentenceSymbol = sentenceSymbol; this.terminalSymbols = new HashSet(terminalSymbols); this.nonTerminalSymbols = new HashSet(nonTerminalSymbols); this.rules = new Dictionary>(); foreach (var r in rules) { if (!this.rules.ContainsKey(r.Item1)) this.rules.Add(r.Item1, new List()); this.rules[r.Item1].Add(new ReadonlySequence(r.Item2)); // here we store an array of symbols for a phase } CheckValidity(); CalculatePhraseLengthBounds(); } private void CheckValidity() { // check validity of parameters if (!nonTerminalSymbols.Contains(sentenceSymbol) || // sentence symbol must be a non-terminal nonTerminalSymbols.Any(c => c < 'A' || c > 'Z') || // nonterminal symbols \in 'A' .. 'Z' terminalSymbols.Intersect(nonTerminalSymbols).Any() || // no overlap between non-terminals and terminal rules.Keys.Intersect(nonTerminalSymbols).Count() < nonTerminalSymbols.Count || nonTerminalSymbols.Any(nt => !rules.ContainsKey(nt)) || // each nt must have at least one rule rules.Values.Any(alternatives => !alternatives.Any()) || // no eps alternatives rules.Any(rule => rule.Value.Any(alternative => alternative.Length == 1 && alternative[0] == rule.Key)) // no infinite direct recursions ) { throw new ArgumentException(); } } private void CalculatePhraseLengthBounds() { // cache phrase lengths foreach (var nt in nonTerminalSymbols) { var min = int.MaxValue; var max = int.MinValue; foreach (var alt in rules[nt].OrderBy(alt => alt.Length)) { CalcAndSetMinPhraseLength(alt); CalcAndSetMaxPhraseLength(alt); min = Math.Min(min, minPhraseLength[alt]); max = Math.Max(max, maxPhraseLength[alt]); } minPhraseLength[new ReadonlySequence(nt)] = min; maxPhraseLength[new ReadonlySequence(nt)] = max; } } public IEnumerable GetAlternatives(char nt) { return rules[nt]; } public IEnumerable GetTerminalAlternatives(char nt) { return GetAlternatives(nt).Where(alt => alt.All(IsTerminal)); } public IEnumerable GetNonTerminalAlternatives(char nt) { return GetAlternatives(nt).Where(alt => alt.Any(IsNonTerminal)); } #region population of minphraselength cache private int CalcAndSetMinSymbolLength(char symb) { if (IsTerminal(symb)) return 1; else return Math.Min(short.MaxValue, rules[symb].Min(alt => CalcAndSetMinPhraseLength(alt))); // maximal allowed value is short.MaxValue } private int CalcAndSetMinPhraseLength(Sequence phrase) { Debug.Assert(phrase is ReadonlySequence); int l; if (minPhraseLength.TryGetValue(phrase, out l)) return l; l = 0; minPhraseLength[phrase] = short.MaxValue; // mark foreach (var symb in phrase) { l += CalcAndSetMinSymbolLength(symb); } l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue minPhraseLength[phrase] = l; // update return l; } #endregion // read only access to caches private int MinSymbolLength(char symb) { if (IsTerminal(symb)) return 1; else return Math.Min(short.MaxValue, rules[symb].Min(alt => MinPhraseLength(alt))); // maximal allowed value is short.MaxValue } public int MinPhraseLength(Sequence phrase) { var minLen = 0; if (minPhraseLength.TryGetValue(phrase, out minLen)) return minLen; foreach (var s in phrase) { minLen += MinSymbolLength(s); } return Math.Min(short.MaxValue, minLen); // maximal allowed value is short.MaxValue } #region population of maxphraselength cache private int CalcAndSetMaxSymbolLength(char symb) { if (IsTerminal(symb)) return 1; else return Math.Min(short.MaxValue, rules[symb].Max(alt => CalcAndSetMaxPhraseLength(alt))); // maximal allowed value is short.MaxValue } // caches for this are build in construction of object private int CalcAndSetMaxPhraseLength(Sequence phrase) { Debug.Assert(phrase is ReadonlySequence); int l; if (maxPhraseLength.TryGetValue(phrase, out l)) return l; l = 0; maxPhraseLength[phrase] = short.MaxValue; // mark foreach (var symb in phrase) { l += CalcAndSetMaxSymbolLength(symb); } l = Math.Min(short.MaxValue, l); // maximal allowed value is short.MaxValue maxPhraseLength[phrase] = l; // update return l; } #endregion // read only access to caches public int MaxSymbolLength(char symb) { if (IsTerminal(symb)) return 1; else return Math.Min(short.MaxValue, rules[symb].Max(alt => MaxPhraseLength(alt))); // maximal allowed value is short.MaxValue } public int MaxPhraseLength(Sequence phrase) { var maxLen = 0; if (maxPhraseLength.TryGetValue(phrase, out maxLen)) return maxLen; foreach (var s in phrase) { maxLen += MaxSymbolLength(s); } return Math.Min(short.MaxValue, maxLen); // maximal allowed value is short.MaxValue } public Sequence CompleteSentenceRandomly(Random random, Sequence phrase, int maxLen) { if (phrase.Length > maxLen) throw new ArgumentException(); if (MinPhraseLength(phrase) > maxLen) throw new ArgumentException(); while (!phrase.IsTerminal) { char nt = phrase.FirstNonTerminal; int maxLenOfReplacement = maxLen - (phrase.Length - 1); // replacing aAb with maxLen 4 means we can only use alternatives with a minPhraseLen <= 2 Debug.Assert(maxLenOfReplacement > 0); var alts = GetAlternatives(nt).Where(alt => MinPhraseLength(alt) <= maxLenOfReplacement); Debug.Assert(alts.Any()); // replace nt with random alternative var selectedAlt = alts.SelectRandom(random); phrase.ReplaceAt(phrase.FirstNonTerminalIndex, 1, selectedAlt); } return phrase; } public bool IsTerminal(char symbol) { return terminalSymbols.Contains(symbol); } public bool IsNonTerminal(char symbol) { return nonTerminalSymbols.Contains(symbol); } private static readonly string symbRegex = @"(?[^\s\|\.])"; private static readonly string altRegex = @"(?[^\s\|\.]+)"; private static readonly string altListRegex = altRegex + @"(\s*\|\s*" + altRegex + @")*"; // private static readonly string terminalClassRegex = @"[^\s\|\.]\s*\.\.\s*[^\s\|\.]"; private static readonly string terminalClassRegex = symbRegex + @"\s*\.\.\s*" + symbRegex + @"\s*"; private static readonly Regex sentenceSymbRegex = new Regex(@"\s*G\(" + symbRegex + @"\):\s*$", RegexOptions.ExplicitCapture); private static readonly Regex ruleRegex = new Regex(@"\s*" + symbRegex + @"\s*->\s*(" + altListRegex + "|" + terminalClassRegex + @")\s*$", RegexOptions.ExplicitCapture); // supports only meta-symbol for alternatives (syntax of formal grammars) // epsilon-alternatives are not supported public static IGrammar FromString(string grammar) { char sentenceSymbol; var nonTerminals = new List(); var terminals = new List(); var rules = new List>(); using (var reader = new StringReader(grammar)) { var line = reader.ReadLine(); while (string.IsNullOrWhiteSpace(line)) line = reader.ReadLine(); // first line contains "G(...):" var match = sentenceSymbRegex.Match(line); if (!match.Success) throw new ArgumentException(); sentenceSymbol = match.Groups["symb"].Value[0]; // only a single symbol is in the capture group line = reader.ReadLine(); while (!string.IsNullOrWhiteSpace(line)) { match = ruleRegex.Match(line); if (match.Length != line.Length) throw new ArgumentException(); var nt = match.Groups["symb"].Captures[0].Value[0]; if (!nonTerminals.Contains(nt)) nonTerminals.Add(nt); // contains alternatives? if (match.Groups["alt"].Captures.Count > 0) { foreach (var m in match.Groups["alt"].Captures) { rules.Add(Tuple.Create(nt, m.ToString())); } } else if (match.Groups["symb"].Captures.Count == 3) { // contains terminal range var fromSymb = match.Groups["symb"].Captures[1].Value[0]; var toSymb = match.Groups["symb"].Captures[2].Value[0]; if (fromSymb > toSymb) throw new ArgumentException("left symbol is larger than right symbol in range"); for (byte b = Convert.ToByte(fromSymb); b <= Convert.ToByte(toSymb); b++) { rules.Add(Tuple.Create(nt, Convert.ToChar(b).ToString())); } } line = reader.ReadLine(); } } foreach (var r in rules) { foreach (var s in r.Item2) { if (!nonTerminals.Contains(s) && !terminals.Contains(s)) terminals.Add(s); } } return new Grammar(sentenceSymbol, terminals, nonTerminals, rules); } public override string ToString() { // for debugging var sb = new StringBuilder(); sb.AppendFormat("G({0}):", sentenceSymbol).AppendLine(); foreach (var r in rules) { foreach (var alt in r.Value) { sb.AppendFormat(" {0} -> {1} (min: {2}, max {3})", r.Key, alt, MinPhraseLength(alt), MaxPhraseLength(alt)) .AppendLine(); } } return sb.ToString(); } } }