Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2929_PrioritizedGrammarEnumeration/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Importer/InfixExpressionParser.cs

Last change on this file was 16315, checked in by gkronber, 6 years ago

#2929: worked on display of results, converting PGE expressions to HL Solutions.

File size: 22.2 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.Linq;
26using System.Text;
27using HeuristicLab.Collections;
28using HeuristicLab.Common;
29using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
30
31namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
32  /// <summary>
33  /// Parses mathematical expressions in infix form. E.g. x1 * (3.0 * x2 + x3)
34  /// Identifier format (functions or variables): '_' | letter { '_' | letter | digit }
35  /// Variables names and variable values can be set under quotes "" or '' because variable names might contain spaces.
36  ///   Variable = ident | " ident " | ' ident '
37  /// It is also possible to use functions e.g. log("x1") or real-valued constants e.g. 3.1415 .
38  /// Variable names are case sensitive. Function names are not case sensitive.
39  ///
40  ///
41  /// S             = Expr EOF
42  /// Expr          = ['-' | '+'] Term { '+' Term | '-' Term }
43  /// Term          = Fact { '*' Fact | '/' Fact }
44  /// Fact          = SimpleFact [ '^' SimpleFact ]
45  /// SimpleFact    = '(' Expr ')'
46  ///                 | '{' Expr '}'
47  ///                 | 'LAG' '(' varId ',' ['+' | '-' ] number ')
48  ///                 | funcId '(' ArgList ')'
49  ///                 | VarExpr
50  ///                 | number
51  /// ArgList       = Expr { ',' Expr }
52  /// VarExpr       = varId OptFactorPart
53  /// OptFactorPart = [ ('=' varVal | '[' ['+' | '-' ] number {',' ['+' | '-' ] number } ']' ) ]
54  /// varId         =  ident | ' ident ' | " ident "
55  /// varVal        =  ident | ' ident ' | " ident "
56  /// ident         =  '_' | letter { '_' | letter | digit }
57  /// </summary>
58  public sealed class InfixExpressionParser {
59    private enum TokenType { Operator, Identifier, Number, LeftPar, RightPar, LeftBracket, RightBracket, Comma, Eq, End, NA };
60    private class Token {
61      internal double doubleVal;
62      internal string strVal;
63      internal TokenType TokenType;
64    }
65
66    private class SymbolNameComparer : IEqualityComparer<ISymbol>, IComparer<ISymbol> {
67      public int Compare(ISymbol x, ISymbol y) {
68        return x.Name.CompareTo(y.Name);
69      }
70
71      public bool Equals(ISymbol x, ISymbol y) {
72        return Compare(x, y) == 0;
73      }
74
75      public int GetHashCode(ISymbol obj) {
76        return obj.Name.GetHashCode();
77      }
78    }
79    // format name <-> symbol
80    // the lookup table is also used in the corresponding formatter
81    internal static readonly BidirectionalLookup<string, ISymbol>
82      knownSymbols = new BidirectionalLookup<string, ISymbol>(StringComparer.InvariantCulture, new SymbolNameComparer());
83
84    private Constant constant = new Constant();
85    private Variable variable = new Variable();
86    private BinaryFactorVariable binaryFactorVar = new BinaryFactorVariable();
87    private FactorVariable factorVar = new FactorVariable();
88
89    private ProgramRootSymbol programRootSymbol = new ProgramRootSymbol();
90    private StartSymbol startSymbol = new StartSymbol();
91
92    static InfixExpressionParser() {
93      // populate bidirectional lookup
94      var dict = new Dictionary<string, ISymbol>
95      {
96        { "+", new Addition()},
97        { "/", new Division()},
98        { "*", new Multiplication()},
99        { "-", new Subtraction()},
100        { "^", new Power() },
101        { "EXP", new Exponential()},
102        { "LOG", new Logarithm()},
103        { "POW", new Power()},
104        { "ROOT", new Root()},
105        { "SQR", new Square() },
106        { "SQRT", new SquareRoot() },
107        { "SIN",new Sine()},
108        { "COS", new Cosine()},
109        { "TAN", new Tangent()},
110        { "AIRYA", new AiryA()},
111        { "AIRYB", new AiryB()},
112        { "BESSEL", new Bessel()},
113        { "COSINT", new CosineIntegral()},
114        { "SININT", new SineIntegral()},
115        { "HYPCOSINT", new HyperbolicCosineIntegral()},
116        { "HYPSININT", new HyperbolicSineIntegral()},
117        { "FRESNELSININT", new FresnelSineIntegral()},
118        { "FRESNELCOSINT", new FresnelCosineIntegral()},
119        { "NORM", new Norm()},
120        { "ERF", new Erf()},
121        { "GAMMA", new Gamma()},
122        { "PSI", new Psi()},
123        { "DAWSON", new Dawson()},
124        { "EXPINT", new ExponentialIntegralEi()},
125        { "MEAN", new Average()},
126        { "IF", new IfThenElse()},
127        { "GT", new GreaterThan()},
128        { "LT", new LessThan()},
129        { "AND", new And()},
130        { "OR", new Or()},
131        { "NOT", new Not()},
132        { "XOR", new Xor()},
133        { "DIFF", new Derivative()},
134        { "LAG", new LaggedVariable() },
135      };
136
137
138      foreach (var kvp in dict) {
139        knownSymbols.Add(kvp.Key, kvp.Value);
140      }
141    }
142
143    public ISymbolicExpressionTree Parse(string str) {
144      ISymbolicExpressionTreeNode root = programRootSymbol.CreateTreeNode();
145      ISymbolicExpressionTreeNode start = startSymbol.CreateTreeNode();
146      var allTokens = GetAllTokens(str).ToArray();
147      ISymbolicExpressionTreeNode mainBranch = ParseS(new Queue<Token>(allTokens));
148
149      // only a main branch was given => insert the main branch into the default tree template
150      root.AddSubtree(start);
151      start.AddSubtree(mainBranch);
152      return new SymbolicExpressionTree(root);
153    }
154
155    private IEnumerable<Token> GetAllTokens(string str) {
156      int pos = 0;
157      while (true) {
158        while (pos < str.Length && Char.IsWhiteSpace(str[pos])) pos++;
159        if (pos >= str.Length) {
160          yield return new Token { TokenType = TokenType.End, strVal = "" };
161          yield break;
162        }
163        if (char.IsDigit(str[pos])) {
164          // read number (=> read until white space or operator or comma)
165          var sb = new StringBuilder();
166          sb.Append(str[pos]);
167          pos++;
168          while (pos < str.Length && !char.IsWhiteSpace(str[pos])
169            && (str[pos] != '+' || str[pos - 1] == 'e' || str[pos - 1] == 'E')     // continue reading exponents
170            && (str[pos] != '-' || str[pos - 1] == 'e' || str[pos - 1] == 'E')
171            && str[pos] != '*'
172            && str[pos] != '/'
173            && str[pos] != '^'
174            && str[pos] != ')'
175            && str[pos] != ']'
176            && str[pos] != '}'
177            && str[pos] != ',') {
178            sb.Append(str[pos]);
179            pos++;
180          }
181          double dblVal;
182          if (double.TryParse(sb.ToString(), NumberStyles.Float, CultureInfo.InvariantCulture, out dblVal))
183            yield return new Token { TokenType = TokenType.Number, strVal = sb.ToString(), doubleVal = dblVal };
184          else yield return new Token { TokenType = TokenType.NA, strVal = sb.ToString() };
185        } else if (char.IsLetter(str[pos]) || str[pos] == '_') {
186          // read ident
187          var sb = new StringBuilder();
188          sb.Append(str[pos]);
189          pos++;
190          while (pos < str.Length &&
191            (char.IsLetter(str[pos]) || str[pos] == '_' || char.IsDigit(str[pos]))) {
192            sb.Append(str[pos]);
193            pos++;
194          }
195          yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
196        } else if (str[pos] == '"') {
197          // read to next "
198          pos++;
199          var sb = new StringBuilder();
200          while (pos < str.Length && str[pos] != '"') {
201            sb.Append(str[pos]);
202            pos++;
203          }
204          if (pos < str.Length && str[pos] == '"') {
205            pos++; // skip "
206            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
207          } else
208            yield return new Token { TokenType = TokenType.NA };
209
210        } else if (str[pos] == '\'') {
211          // read to next '
212          pos++;
213          var sb = new StringBuilder();
214          while (pos < str.Length && str[pos] != '\'') {
215            sb.Append(str[pos]);
216            pos++;
217          }
218          if (pos < str.Length && str[pos] == '\'') {
219            pos++; // skip '
220            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
221          } else
222            yield return new Token { TokenType = TokenType.NA };
223        } else if (str[pos] == '+') {
224          pos++;
225          yield return new Token { TokenType = TokenType.Operator, strVal = "+" };
226        } else if (str[pos] == '-') {
227          pos++;
228          yield return new Token { TokenType = TokenType.Operator, strVal = "-" };
229        } else if (str[pos] == '/') {
230          pos++;
231          yield return new Token { TokenType = TokenType.Operator, strVal = "/" };
232        } else if (str[pos] == '*') {
233          pos++;
234          yield return new Token { TokenType = TokenType.Operator, strVal = "*" };
235        } else if (str[pos] == '^') {
236          pos++;
237          yield return new Token { TokenType = TokenType.Operator, strVal = "^" };
238        } else if (str[pos] == '(') {
239          pos++;
240          yield return new Token { TokenType = TokenType.LeftPar, strVal = "(" };
241        } else if (str[pos] == ')') {
242          pos++;
243          yield return new Token { TokenType = TokenType.RightPar, strVal = ")" };
244        } else if (str[pos] == '[') {
245          pos++;
246          yield return new Token { TokenType = TokenType.LeftBracket, strVal = "[" };
247        } else if (str[pos] == ']') {
248          pos++;
249          yield return new Token { TokenType = TokenType.RightBracket, strVal = "]" };
250        } else if (str[pos] == '{') {
251          pos++;
252          yield return new Token { TokenType = TokenType.LeftPar, strVal = "{" };
253        } else if (str[pos] == '}') {
254          pos++;
255          yield return new Token { TokenType = TokenType.RightPar, strVal = "}" };
256        } else if (str[pos] == '=') {
257          pos++;
258          yield return new Token { TokenType = TokenType.Eq, strVal = "=" };
259        } else if (str[pos] == ',') {
260          pos++;
261          yield return new Token { TokenType = TokenType.Comma, strVal = "," };
262        } else {
263          throw new ArgumentException("Invalid character: " + str[pos]);
264        }
265      }
266    }
267    /// S             = Expr EOF
268    private ISymbolicExpressionTreeNode ParseS(Queue<Token> tokens) {
269      var expr = ParseExpr(tokens);
270
271      var endTok = tokens.Dequeue();
272      if (endTok.TokenType != TokenType.End)
273        throw new ArgumentException(string.Format("Expected end of expression (got {0})", endTok.strVal));
274
275      return expr;
276    }
277
278    /// Expr          = ['-' | '+'] Term { '+' Term | '-' Term }
279    private ISymbolicExpressionTreeNode ParseExpr(Queue<Token> tokens) {
280      var next = tokens.Peek();
281      var posTerms = new List<ISymbolicExpressionTreeNode>();
282      var negTerms = new List<ISymbolicExpressionTreeNode>();
283      bool negateFirstTerm = false;
284      if (next.TokenType == TokenType.Operator && (next.strVal == "+" || next.strVal == "-")) {
285        tokens.Dequeue();
286        if (next.strVal == "-")
287          negateFirstTerm = true;
288      }
289      var t = ParseTerm(tokens);
290      if (negateFirstTerm) negTerms.Add(t);
291      else posTerms.Add(t);
292
293      next = tokens.Peek();
294      while (next.strVal == "+" || next.strVal == "-") {
295        switch (next.strVal) {
296          case "+": {
297              tokens.Dequeue();
298              var term = ParseTerm(tokens);
299              posTerms.Add(term);
300              break;
301            }
302          case "-": {
303              tokens.Dequeue();
304              var term = ParseTerm(tokens);
305              negTerms.Add(term);
306              break;
307            }
308        }
309        next = tokens.Peek();
310      }
311
312      var sum = GetSymbol("+").CreateTreeNode();
313      foreach (var posTerm in posTerms) sum.AddSubtree(posTerm);
314      if (negTerms.Any()) {
315        if (negTerms.Count == 1) {
316          var sub = GetSymbol("-").CreateTreeNode();
317          sub.AddSubtree(negTerms.Single());
318          sum.AddSubtree(sub);
319        } else {
320          var sumNeg = GetSymbol("+").CreateTreeNode();
321          foreach (var negTerm in negTerms) sumNeg.AddSubtree(negTerm);
322
323          var constNode = (ConstantTreeNode)constant.CreateTreeNode();
324          constNode.Value = -1.0;
325          var prod = GetSymbol("*").CreateTreeNode();
326          prod.AddSubtree(constNode);
327          prod.AddSubtree(sumNeg);
328
329          sum.AddSubtree(prod);
330        }
331      }
332      if (sum.SubtreeCount == 1) return sum.Subtrees.First();
333      else return sum;
334    }
335
336    private ISymbol GetSymbol(string tok) {
337      var symb = knownSymbols.GetByFirst(tok).FirstOrDefault();
338      if (symb == null) throw new ArgumentException(string.Format("Unknown token {0} found.", tok));
339      return symb;
340    }
341
342    /// Term          = Fact { '*' Fact | '/' Fact }
343    private ISymbolicExpressionTreeNode ParseTerm(Queue<Token> tokens) {
344      var factors = new List<ISymbolicExpressionTreeNode>();
345      var firstFactor = ParseFact(tokens);
346      factors.Add(firstFactor);
347
348      var next = tokens.Peek();
349      while (next.strVal == "*" || next.strVal == "/") {
350        switch (next.strVal) {
351          case "*": {
352              tokens.Dequeue();
353              var fact = ParseFact(tokens);
354              factors.Add(fact);
355              break;
356            }
357          case "/": {
358              tokens.Dequeue();
359              var invFact = ParseFact(tokens);
360              var divNode = GetSymbol("/").CreateTreeNode(); // 1/x
361              divNode.AddSubtree(invFact);
362              factors.Add(divNode);
363              break;
364            }
365        }
366
367        next = tokens.Peek();
368      }
369      if (factors.Count == 1) return factors.First();
370      else {
371        var prod = GetSymbol("*").CreateTreeNode();
372        foreach (var f in factors) prod.AddSubtree(f);
373        return prod;
374      }
375    }
376
377    // Fact = SimpleFact ['^' SimpleFact]
378    private ISymbolicExpressionTreeNode ParseFact(Queue<Token> tokens) {
379      var expr = ParseSimpleFact(tokens);
380      var next = tokens.Peek();
381      if (next.TokenType == TokenType.Operator && next.strVal == "^") {
382        tokens.Dequeue(); // skip;
383
384        var p = GetSymbol("^").CreateTreeNode();
385        p.AddSubtree(expr);
386        p.AddSubtree(ParseSimpleFact(tokens));
387        expr = p;
388      }
389      return expr;
390    }
391
392
393    /// SimpleFact   = '(' Expr ')'
394    ///                 | '{' Expr '}'
395    ///                 | 'LAG' '(' varId ',' ['+' | '-' ] number ')'
396    ///                 | funcId '(' ArgList ')
397    ///                 | VarExpr
398    ///                 | number
399    /// ArgList       = Expr { ',' Expr }
400    /// VarExpr       = varId OptFactorPart
401    /// OptFactorPart = [ ('=' varVal | '[' ['+' | '-' ] number {',' ['+' | '-' ] number } ']' ) ]
402    /// varId         =  ident | ' ident ' | " ident "
403    /// varVal        =  ident | ' ident ' | " ident "
404    /// ident         =  '_' | letter { '_' | letter | digit }
405    private ISymbolicExpressionTreeNode ParseSimpleFact(Queue<Token> tokens) {
406      var next = tokens.Peek();
407      if (next.TokenType == TokenType.LeftPar) {
408        var initPar = tokens.Dequeue(); // match par type
409        var expr = ParseExpr(tokens);
410        var rPar = tokens.Dequeue();
411        if (rPar.TokenType != TokenType.RightPar)
412          throw new ArgumentException("expected closing parenthesis");
413        if (initPar.strVal == "(" && rPar.strVal == "}")
414          throw new ArgumentException("expected closing )");
415        if (initPar.strVal == "{" && rPar.strVal == ")")
416          throw new ArgumentException("expected closing }");
417        return expr;
418      } else if (next.TokenType == TokenType.Identifier) {
419        var idTok = tokens.Dequeue();
420        if (tokens.Peek().TokenType == TokenType.LeftPar) {
421          // function identifier or LAG
422          var funcId = idTok.strVal.ToUpperInvariant();
423
424          var funcNode = GetSymbol(funcId).CreateTreeNode();
425          var lPar = tokens.Dequeue();
426          if (lPar.TokenType != TokenType.LeftPar)
427            throw new ArgumentException("expected (");
428
429          // handle 'lag' specifically
430          if (funcNode.Symbol is LaggedVariable) {
431            var varId = tokens.Dequeue();
432            if (varId.TokenType != TokenType.Identifier) throw new ArgumentException("Identifier expected. Format for lagged variables: \"lag(x, -1)\"");
433            var comma = tokens.Dequeue();
434            if (comma.TokenType != TokenType.Comma) throw new ArgumentException("',' expected, Format for lagged variables: \"lag(x, -1)\"");
435            double sign = 1.0;
436            if (tokens.Peek().strVal == "+" || tokens.Peek().strVal == "-") {
437              // read sign
438              var signTok = tokens.Dequeue();
439              if (signTok.strVal == "-") sign = -1.0;
440            }
441            var lagToken = tokens.Dequeue();
442            if (lagToken.TokenType != TokenType.Number) throw new ArgumentException("Number expected, Format for lagged variables: \"lag(x, -1)\"");
443            if (!lagToken.doubleVal.IsAlmost(Math.Round(lagToken.doubleVal)))
444              throw new ArgumentException("Time lags must be integer values");
445            var laggedVarNode = funcNode as LaggedVariableTreeNode;
446            laggedVarNode.VariableName = varId.strVal;
447            laggedVarNode.Lag = (int)Math.Round(sign * lagToken.doubleVal);
448            laggedVarNode.Weight = 1.0;
449          } else {
450            // functions
451            var args = ParseArgList(tokens);
452            // check number of arguments
453            if (funcNode.Symbol.MinimumArity > args.Length || funcNode.Symbol.MaximumArity < args.Length) {
454              throw new ArgumentException(string.Format("Symbol {0} requires between {1} and  {2} arguments.", funcId,
455                funcNode.Symbol.MinimumArity, funcNode.Symbol.MaximumArity));
456            }
457            foreach (var arg in args) funcNode.AddSubtree(arg);
458          }
459
460          var rPar = tokens.Dequeue();
461          if (rPar.TokenType != TokenType.RightPar)
462            throw new ArgumentException("expected )");
463
464
465          return funcNode;
466        } else {
467          // variable
468          if (tokens.Peek().TokenType == TokenType.Eq) {
469            // binary factor
470            tokens.Dequeue(); // skip Eq
471            var valTok = tokens.Dequeue();
472            if (valTok.TokenType != TokenType.Identifier) throw new ArgumentException("expected identifier");
473            var binFactorNode = (BinaryFactorVariableTreeNode)binaryFactorVar.CreateTreeNode();
474            binFactorNode.Weight = 1.0;
475            binFactorNode.VariableName = idTok.strVal;
476            binFactorNode.VariableValue = valTok.strVal;
477            return binFactorNode;
478          } else if (tokens.Peek().TokenType == TokenType.LeftBracket) {
479            // factor variable
480            var factorVariableNode = (FactorVariableTreeNode)factorVar.CreateTreeNode();
481            factorVariableNode.VariableName = idTok.strVal;
482
483            tokens.Dequeue(); // skip [
484            var weights = new List<double>();
485            // at least one weight is necessary
486            var sign = 1.0;
487            if (tokens.Peek().TokenType == TokenType.Operator) {
488              var opToken = tokens.Dequeue();
489              if (opToken.strVal == "+") sign = 1.0;
490              else if (opToken.strVal == "-") sign = -1.0;
491              else throw new ArgumentException();
492            }
493            if (tokens.Peek().TokenType != TokenType.Number) throw new ArgumentException("number expected");
494            var weightTok = tokens.Dequeue();
495            weights.Add(sign * weightTok.doubleVal);
496            while (tokens.Peek().TokenType == TokenType.Comma) {
497              // skip comma
498              tokens.Dequeue();
499              if (tokens.Peek().TokenType == TokenType.Operator) {
500                var opToken = tokens.Dequeue();
501                if (opToken.strVal == "+") sign = 1.0;
502                else if (opToken.strVal == "-") sign = -1.0;
503                else throw new ArgumentException();
504              }
505              weightTok = tokens.Dequeue();
506              if (weightTok.TokenType != TokenType.Number) throw new ArgumentException("number expected");
507              weights.Add(sign * weightTok.doubleVal);
508            }
509            var rightBracketToken = tokens.Dequeue();
510            if (rightBracketToken.TokenType != TokenType.RightBracket) throw new ArgumentException("closing bracket ] expected");
511            factorVariableNode.Weights = weights.ToArray();
512            return factorVariableNode;
513          } else {
514            // variable
515            var varNode = (VariableTreeNode)variable.CreateTreeNode();
516            varNode.Weight = 1.0;
517            varNode.VariableName = idTok.strVal;
518            return varNode;
519          }
520        }
521      } else if (next.TokenType == TokenType.Number) {
522        var numTok = tokens.Dequeue();
523        var constNode = (ConstantTreeNode)constant.CreateTreeNode();
524        constNode.Value = numTok.doubleVal;
525        return constNode;
526      } else {
527        throw new ArgumentException(string.Format("unexpected token in expression {0}", next.strVal));
528      }
529    }
530
531    // ArgList = Expr { ',' Expr }
532    private ISymbolicExpressionTreeNode[] ParseArgList(Queue<Token> tokens) {
533      var exprList = new List<ISymbolicExpressionTreeNode>();
534      exprList.Add(ParseExpr(tokens));
535      while (tokens.Peek().TokenType != TokenType.RightPar) {
536        var comma = tokens.Dequeue();
537        if (comma.TokenType != TokenType.Comma) throw new ArgumentException("expected ',' ");
538        exprList.Add(ParseExpr(tokens));
539      }
540      return exprList.ToArray();
541    }
542  }
543}
Note: See TracBrowser for help on using the repository browser.