Free cookie consent management tool by TermsFeed Policy Generator

source: branches/2915-AbsoluteSymbol/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Importer/InfixExpressionParser.cs @ 16003

Last change on this file since 16003 was 15944, checked in by gkronber, 7 years ago

#2915 added support for Abs() symbol to tree interpreter and linear interpreter as well as to the infix parser

File size: 20.8 KB
RevLine 
[14024]1#region License Information
2/* HeuristicLab
[15583]3 * Copyright (C) 2002-2018 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
[14024]4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.Linq;
26using System.Text;
27using HeuristicLab.Collections;
[14350]28using HeuristicLab.Common;
[14024]29using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
30
31namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
32  /// <summary>
33  /// Parses mathematical expressions in infix form. E.g. x1 * (3.0 * x2 + x3)
34  /// Identifier format (functions or variables): ( '_' | letter ) { '_' | letter | digit }
[14826]35  /// Variable names and variable values can be enclosed in double quotes "" or single quotes '' because variable names might contain spaces.
36  ///   Variable = ident | " ident " | ' ident '
[14024]37  /// It is also possible to use functions e.g. log("x1") or real-valued constants e.g. 3.1415 .
38  /// Variable names are case sensitive. Function names are not case sensitive.
[14826]39  ///
40  ///
41  /// S             = Expr EOF
42  /// Expr          = ['-' | '+'] Term { '+' Term | '-' Term }
43  /// Term          = Fact { '*' Fact | '/' Fact }
44  /// Fact          = '(' Expr ')'
45  ///                 | 'LAG' '(' varId ',' ['+' | '-' ] number ')'
46  ///                 | funcId '(' ArgList ')'
47  ///                 | VarExpr | number
48  /// ArgList       = Expr { ',' Expr }
49  /// VarExpr       = varId OptFactorPart
50  /// OptFactorPart = [ ('=' varVal | '[' ['+' | '-' ] number {',' ['+' | '-' ] number } ']' ) ]
51  /// varId         =  ident | ' ident ' | " ident "
52  /// varVal        =  ident | ' ident ' | " ident "
53  /// ident         =  ( '_' | letter ) { '_' | letter | digit }
[14024]54  /// </summary>
[14026]55  public sealed class InfixExpressionParser {
/// <summary>
/// Kinds of tokens produced by the tokenizer.
/// End is emitted once when the input is exhausted; NA marks text that could not be
/// tokenized (a malformed number or an unterminated quoted identifier).
/// </summary>
private enum TokenType {
  Operator,
  Identifier,
  Number,
  LeftPar,
  RightPar,
  LeftBracket,
  RightBracket,
  Comma,
  Eq,
  End,
  NA
}
/// <summary>
/// A single lexical token of an infix expression.
/// </summary>
private class Token {
  // kind of the token
  internal TokenType TokenType;
  // text of the token (for quoted identifiers: the text between the quotes)
  internal string strVal;
  // numeric value; only assigned by the tokenizer for Number tokens
  internal double doubleVal;
}
62
/// <summary>
/// Compares symbols solely by their <c>Name</c>.
/// All three members use ordinal string semantics so that two symbols which are
/// considered equal are guaranteed to produce the same hash code (a culture-sensitive
/// comparison may treat strings as equal whose ordinal hash codes differ).
/// </summary>
private class SymbolNameComparer : IEqualityComparer<ISymbol>, IComparer<ISymbol> {
  /// <summary>Orders two symbols by the ordinal order of their names.</summary>
  public int Compare(ISymbol x, ISymbol y) {
    return string.CompareOrdinal(x.Name, y.Name);
  }

  /// <summary>Two symbols are equal if their names are identical character by character.</summary>
  public bool Equals(ISymbol x, ISymbol y) {
    return string.Equals(x.Name, y.Name, StringComparison.Ordinal);
  }

  /// <summary>Hash code of the symbol name; consistent with <see cref="Equals(ISymbol, ISymbol)"/>.</summary>
  public int GetHashCode(ISymbol obj) {
    return obj.Name.GetHashCode();
  }
}
// Lookup table: name used in the infix format <-> symbol.
// The same table is also used by the corresponding formatter.
internal static readonly BidirectionalLookup<string, ISymbol> knownSymbols =
  new BidirectionalLookup<string, ISymbol>(StringComparer.InvariantCulture, new SymbolNameComparer());

// Symbols from which the parser creates constant, variable and factor nodes.
private readonly Constant constant = new Constant();
private readonly Variable variable = new Variable();
private readonly BinaryFactorVariable binaryFactorVar = new BinaryFactorVariable();
private readonly FactorVariable factorVar = new FactorVariable();

// Symbols for the two nodes that Parse() places above the parsed expression.
private readonly ProgramRootSymbol programRootSymbol = new ProgramRootSymbol();
private readonly StartSymbol startSymbol = new StartSymbol();
88
/// <summary>
/// Fills the bidirectional lookup with all operator and function names the parser knows.
/// Function names are registered in upper case.
/// </summary>
static InfixExpressionParser() {
  // arithmetic operators
  knownSymbols.Add("+", new Addition());
  knownSymbols.Add("/", new Division());
  knownSymbols.Add("*", new Multiplication());
  knownSymbols.Add("-", new Subtraction());

  // functions
  knownSymbols.Add("ABS", new Absolute());
  knownSymbols.Add("EXP", new Exponential());
  knownSymbols.Add("LOG", new Logarithm());
  knownSymbols.Add("POW", new Power());
  knownSymbols.Add("ROOT", new Root());
  knownSymbols.Add("SQR", new Square());
  knownSymbols.Add("SQRT", new SquareRoot());
  knownSymbols.Add("SIN", new Sine());
  knownSymbols.Add("COS", new Cosine());
  knownSymbols.Add("TAN", new Tangent());
  knownSymbols.Add("AIRYA", new AiryA());
  knownSymbols.Add("AIRYB", new AiryB());
  knownSymbols.Add("BESSEL", new Bessel());
  knownSymbols.Add("COSINT", new CosineIntegral());
  knownSymbols.Add("SININT", new SineIntegral());
  knownSymbols.Add("HYPCOSINT", new HyperbolicCosineIntegral());
  knownSymbols.Add("HYPSININT", new HyperbolicSineIntegral());
  knownSymbols.Add("FRESNELSININT", new FresnelSineIntegral());
  knownSymbols.Add("FRESNELCOSINT", new FresnelCosineIntegral());
  knownSymbols.Add("NORM", new Norm());
  knownSymbols.Add("ERF", new Erf());
  knownSymbols.Add("GAMMA", new Gamma());
  knownSymbols.Add("PSI", new Psi());
  knownSymbols.Add("DAWSON", new Dawson());
  knownSymbols.Add("EXPINT", new ExponentialIntegralEi());
  knownSymbols.Add("MEAN", new Average());

  // conditional, comparison and boolean symbols
  knownSymbols.Add("IF", new IfThenElse());
  knownSymbols.Add("GT", new GreaterThan());
  knownSymbols.Add("LT", new LessThan());
  knownSymbols.Add("AND", new And());
  knownSymbols.Add("OR", new Or());
  knownSymbols.Add("NOT", new Not());
  knownSymbols.Add("XOR", new Xor());

  // time series symbols
  knownSymbols.Add("DIFF", new Derivative());
  knownSymbols.Add("LAG", new LaggedVariable());
}
139
/// <summary>
/// Parses an expression in infix notation and returns it as a symbolic expression tree.
/// The parsed expression becomes the single branch below a start node, which in turn is
/// the single branch of the program root node.
/// </summary>
/// <param name="str">The expression to parse, e.g. "x1 * (3.0 * x2 + x3)".</param>
/// <returns>A new tree of the form programRoot -> start -> parsed expression.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="str"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when the expression cannot be tokenized or parsed.</exception>
public ISymbolicExpressionTree Parse(string str) {
  // fail with a clear message instead of a NullReferenceException from within the tokenizer
  if (str == null) throw new ArgumentNullException("str");

  ISymbolicExpressionTreeNode root = programRootSymbol.CreateTreeNode();
  ISymbolicExpressionTreeNode start = startSymbol.CreateTreeNode();

  // the queue constructor enumerates the token sequence completely,
  // so tokenizer errors are raised before parsing starts
  var tokens = new Queue<Token>(GetAllTokens(str));
  ISymbolicExpressionTreeNode mainBranch = ParseS(tokens);

  // only a main branch was given => insert the main branch into the default tree template
  root.AddSubtree(start);
  start.AddSubtree(mainBranch);
  return new SymbolicExpressionTree(root);
}
151
/// <summary>
/// Splits the input into tokens. White space between tokens is skipped. The sequence
/// always ends with exactly one End token.
/// Text that starts like a number but cannot be parsed as a double, and quoted
/// identifiers without a closing quote, are reported as NA tokens. NA tokens carry the
/// offending text so that the parser can include it in its error message.
/// </summary>
/// <param name="str">The expression text (must not be null).</param>
/// <exception cref="ArgumentException">Thrown (while enumerating) for a character that cannot start any token.</exception>
private IEnumerable<Token> GetAllTokens(string str) {
  int pos = 0;
  while (true) {
    while (pos < str.Length && char.IsWhiteSpace(str[pos])) pos++;
    if (pos >= str.Length) {
      yield return new Token { TokenType = TokenType.End, strVal = "" };
      yield break;
    }

    char c = str[pos];
    if (char.IsDigit(c)) {
      // number
      int end = FindEndOfNumber(str, pos);
      string text = str.Substring(pos, end - pos);
      pos = end;
      double dblVal;
      if (double.TryParse(text, NumberStyles.Float, CultureInfo.InvariantCulture, out dblVal))
        yield return new Token { TokenType = TokenType.Number, strVal = text, doubleVal = dblVal };
      else
        yield return new Token { TokenType = TokenType.NA, strVal = text };
    } else if (char.IsLetter(c) || c == '_') {
      // unquoted identifier
      int end = FindEndOfIdentifier(str, pos);
      string text = str.Substring(pos, end - pos);
      pos = end;
      yield return new Token { TokenType = TokenType.Identifier, strVal = text };
    } else if (c == '"' || c == '\'') {
      // quoted identifier: everything up to the next occurrence of the same quote character
      int closing = str.IndexOf(c, pos + 1);
      if (closing >= 0) {
        string text = str.Substring(pos + 1, closing - pos - 1);
        pos = closing + 1; // skip closing quote
        yield return new Token { TokenType = TokenType.Identifier, strVal = text };
      } else {
        // unterminated: keep the text (including the opening quote, so that it can
        // never be mistaken for an operator) for the error message of the parser
        string text = str.Substring(pos);
        pos = str.Length;
        yield return new Token { TokenType = TokenType.NA, strVal = text };
      }
    } else {
      // single-character token; throws for characters that are not part of the grammar
      TokenType type = GetSingleCharTokenType(c);
      pos++;
      yield return new Token { TokenType = type, strVal = c.ToString() };
    }
  }
}

/// <summary>
/// Returns the index of the first character after the number starting at <paramref name="start"/>.
/// A number extends until white space, an operator, ')', ']' or ','. A '+' or '-' that
/// directly follows 'e' or 'E' belongs to the exponent and does not end the number.
/// </summary>
private static int FindEndOfNumber(string str, int start) {
  int pos = start + 1;
  while (pos < str.Length) {
    char c = str[pos];
    if (char.IsWhiteSpace(c)) break;
    if (c == '*' || c == '/' || c == ')' || c == ']' || c == ',') break;
    if (c == '+' || c == '-') {
      char prev = str[pos - 1];
      if (prev != 'e' && prev != 'E') break;
    }
    pos++;
  }
  return pos;
}

/// <summary>
/// Returns the index of the first character after the identifier starting at <paramref name="start"/>.
/// After its first character an identifier may contain letters, digits and '_'.
/// </summary>
private static int FindEndOfIdentifier(string str, int start) {
  int pos = start + 1;
  while (pos < str.Length && (char.IsLetter(str[pos]) || str[pos] == '_' || char.IsDigit(str[pos]))) pos++;
  return pos;
}

/// <summary>
/// Maps a character that forms a token on its own to the type of that token.
/// </summary>
/// <exception cref="ArgumentException">Thrown when the character is not a valid single-character token.</exception>
private static TokenType GetSingleCharTokenType(char c) {
  switch (c) {
    case '+':
    case '-':
    case '/':
    case '*':
      return TokenType.Operator;
    case '(': return TokenType.LeftPar;
    case ')': return TokenType.RightPar;
    case '[': return TokenType.LeftBracket;
    case ']': return TokenType.RightBracket;
    case '=': return TokenType.Eq;
    case ',': return TokenType.Comma;
    default:
      throw new ArgumentException("Invalid character: " + c);
  }
}
/// <summary>
/// Parses a complete expression and verifies that nothing but the End token follows it.
/// S             = Expr EOF
/// </summary>
/// <exception cref="ArgumentException">Thrown when tokens remain after the expression.</exception>
private ISymbolicExpressionTreeNode ParseS(Queue<Token> tokens) {
  ISymbolicExpressionTreeNode tree = ParseExpr(tokens);

  Token last = tokens.Dequeue();
  if (last.TokenType == TokenType.End) {
    return tree;
  }
  throw new ArgumentException(string.Format("Expected end of expression (got {0})", last.strVal));
}
[14826]263
/// <summary>
/// Parses a sum of terms.
/// Expr          = ['-' | '+'] Term { '+' Term | '-' Term }
/// Positive terms become direct arguments of an addition node. A single negative term
/// is wrapped in a one-argument subtraction node; several negative terms are summed up
/// and multiplied by the constant -1. If the resulting addition has only one argument,
/// that argument is returned directly.
/// </summary>
private ISymbolicExpressionTreeNode ParseExpr(Queue<Token> tokens) {
  var posTerms = new List<ISymbolicExpressionTreeNode>();
  var negTerms = new List<ISymbolicExpressionTreeNode>();

  // optional sign in front of the first term
  var next = tokens.Peek();
  bool negate = false;
  if (next.TokenType == TokenType.Operator && (next.strVal == "+" || next.strVal == "-")) {
    tokens.Dequeue();
    negate = next.strVal == "-";
  }

  while (true) {
    var term = ParseTerm(tokens);
    if (negate) negTerms.Add(term);
    else posTerms.Add(term);

    // Continue only on a real '+' / '-' operator token. The token type must be checked
    // as well, because a quoted identifier may have the text "+" or "-".
    next = tokens.Peek();
    if (next.TokenType != TokenType.Operator || (next.strVal != "+" && next.strVal != "-")) break;
    tokens.Dequeue();
    negate = next.strVal == "-";
  }

  var sum = GetSymbol("+").CreateTreeNode();
  foreach (var posTerm in posTerms) sum.AddSubtree(posTerm);

  if (negTerms.Count == 1) {
    // single negative term: one-argument subtraction
    var sub = GetSymbol("-").CreateTreeNode();
    sub.AddSubtree(negTerms[0]);
    sum.AddSubtree(sub);
  } else if (negTerms.Count > 1) {
    // several negative terms: -1 * (t1 + t2 + ...)
    var sumNeg = GetSymbol("+").CreateTreeNode();
    foreach (var negTerm in negTerms) sumNeg.AddSubtree(negTerm);

    var constNode = (ConstantTreeNode)constant.CreateTreeNode();
    constNode.Value = -1.0;
    var prod = GetSymbol("*").CreateTreeNode();
    prod.AddSubtree(constNode);
    prod.AddSubtree(sumNeg);

    sum.AddSubtree(prod);
  }

  // no need for an addition node with only a single argument
  if (sum.SubtreeCount == 1) return sum.Subtrees.First();
  return sum;
}
321
/// <summary>
/// Looks up the symbol registered in the lookup table under the given name.
/// </summary>
/// <param name="tok">Operator or (upper case) function name.</param>
/// <exception cref="ArgumentException">Thrown when no symbol is registered for the name.</exception>
private ISymbol GetSymbol(string tok) {
  ISymbol match = knownSymbols.GetByFirst(tok).FirstOrDefault();
  if (match != null) {
    return match;
  }
  throw new ArgumentException(string.Format("Unknown token {0} found.", tok));
}
327
/// <summary>
/// Parses a product of factors.
/// Term          = Fact { '*' Fact | '/' Fact }
/// A factor following '/' is wrapped in a one-argument division node (1/x) and then
/// treated like any other factor of the product. A term consisting of a single factor
/// is returned without a surrounding multiplication node.
/// </summary>
private ISymbolicExpressionTreeNode ParseTerm(Queue<Token> tokens) {
  var factors = new List<ISymbolicExpressionTreeNode>();
  factors.Add(ParseFact(tokens));

  // Continue only on a real '*' / '/' operator token. The token type must be checked
  // as well, because a quoted identifier may have the text "*" or "/".
  var next = tokens.Peek();
  while (next.TokenType == TokenType.Operator && (next.strVal == "*" || next.strVal == "/")) {
    tokens.Dequeue();
    var fact = ParseFact(tokens);
    if (next.strVal == "*") {
      factors.Add(fact);
    } else {
      var divNode = GetSymbol("/").CreateTreeNode(); // 1/x
      divNode.AddSubtree(fact);
      factors.Add(divNode);
    }
    next = tokens.Peek();
  }

  if (factors.Count == 1) return factors[0];

  var prod = GetSymbol("*").CreateTreeNode();
  foreach (var f in factors) prod.AddSubtree(f);
  return prod;
}
362
/// <summary>
/// Parses a single factor.
/// Fact          = '(' Expr ')'
///                 | 'LAG' '(' varId ',' ['+' | '-' ] number ')'
///                 | funcId '(' ArgList ')'
///                 | VarExpr | number
/// ArgList       = Expr { ',' Expr }
/// VarExpr       = varId OptFactorPart
/// OptFactorPart = [ ('=' varVal | '[' ['+' | '-' ] number {',' ['+' | '-' ] number } ']' ) ]
/// varId         =  ident | ' ident ' | " ident "
/// varVal        =  ident | ' ident ' | " ident "
/// ident         =  ( '_' | letter ) { '_' | letter | digit }
/// </summary>
/// <exception cref="ArgumentException">Thrown when the tokens do not form a valid factor.</exception>
private ISymbolicExpressionTreeNode ParseFact(Queue<Token> tokens) {
  var next = tokens.Peek();

  if (next.TokenType == TokenType.LeftPar) {
    // parenthesized expression
    tokens.Dequeue();
    var expr = ParseExpr(tokens);
    var rPar = tokens.Dequeue();
    if (rPar.TokenType != TokenType.RightPar)
      throw new ArgumentException("expected )");
    return expr;
  }

  if (next.TokenType == TokenType.Identifier) {
    var idTok = tokens.Dequeue();
    // the token after the identifier decides what kind of factor this is
    switch (tokens.Peek().TokenType) {
      case TokenType.LeftPar:
        return ParseFunctionCall(idTok, tokens);
      case TokenType.Eq:
        return ParseBinaryFactorVariable(idTok, tokens);
      case TokenType.LeftBracket:
        return ParseFactorVariable(idTok, tokens);
      default: {
          // plain variable
          var varNode = (VariableTreeNode)variable.CreateTreeNode();
          varNode.Weight = 1.0;
          varNode.VariableName = idTok.strVal;
          return varNode;
        }
    }
  }

  if (next.TokenType == TokenType.Number) {
    var numTok = tokens.Dequeue();
    var constNode = (ConstantTreeNode)constant.CreateTreeNode();
    constNode.Value = numTok.doubleVal;
    return constNode;
  }

  throw new ArgumentException(string.Format("unexpected token in expression {0}", next.strVal));
}

/// <summary>
/// Parses "funcId '(' ArgList ')'" or "'LAG' '(' varId ',' ['+' | '-'] number ')'".
/// The identifier has already been consumed; the next token is known to be '('.
/// Function names are matched case-insensitively (converted to upper case for the lookup).
/// </summary>
private ISymbolicExpressionTreeNode ParseFunctionCall(Token idTok, Queue<Token> tokens) {
  var funcId = idTok.strVal.ToUpperInvariant();
  var funcNode = GetSymbol(funcId).CreateTreeNode();
  tokens.Dequeue(); // skip (

  if (funcNode.Symbol is LaggedVariable) {
    // 'lag' takes a variable name and an integer time lag instead of expressions
    var varId = tokens.Dequeue();
    if (varId.TokenType != TokenType.Identifier) throw new ArgumentException("Identifier expected. Format for lagged variables: \"lag(x, -1)\"");
    var comma = tokens.Dequeue();
    if (comma.TokenType != TokenType.Comma) throw new ArgumentException("',' expected, Format for lagged variables: \"lag(x, -1)\"");
    double sign = ReadOptionalSign(tokens);
    var lagToken = tokens.Dequeue();
    if (lagToken.TokenType != TokenType.Number) throw new ArgumentException("Number expected, Format for lagged variables: \"lag(x, -1)\"");
    if (!lagToken.doubleVal.IsAlmost(Math.Round(lagToken.doubleVal)))
      throw new ArgumentException("Time lags must be integer values");
    // direct cast: an unexpected node type fails here instead of with a NullReferenceException below
    var laggedVarNode = (LaggedVariableTreeNode)funcNode;
    laggedVarNode.VariableName = varId.strVal;
    laggedVarNode.Lag = (int)Math.Round(sign * lagToken.doubleVal);
    laggedVarNode.Weight = 1.0;
  } else {
    // regular function: parse the arguments and check their number against the arity of the symbol
    var args = ParseArgList(tokens);
    if (funcNode.Symbol.MinimumArity > args.Length || funcNode.Symbol.MaximumArity < args.Length) {
      throw new ArgumentException(string.Format("Symbol {0} requires between {1} and  {2} arguments.", funcId,
        funcNode.Symbol.MinimumArity, funcNode.Symbol.MaximumArity));
    }
    foreach (var arg in args) funcNode.AddSubtree(arg);
  }

  var rPar = tokens.Dequeue();
  if (rPar.TokenType != TokenType.RightPar)
    throw new ArgumentException("expected )");

  return funcNode;
}

/// <summary>
/// Parses the "'=' varVal" part of a binary factor variable (e.g. x=a).
/// The variable identifier has already been consumed; the next token is known to be '='.
/// </summary>
private ISymbolicExpressionTreeNode ParseBinaryFactorVariable(Token idTok, Queue<Token> tokens) {
  tokens.Dequeue(); // skip =
  var valTok = tokens.Dequeue();
  if (valTok.TokenType != TokenType.Identifier) throw new ArgumentException("expected identifier");
  var binFactorNode = (BinaryFactorVariableTreeNode)binaryFactorVar.CreateTreeNode();
  binFactorNode.Weight = 1.0;
  binFactorNode.VariableName = idTok.strVal;
  binFactorNode.VariableValue = valTok.strVal;
  return binFactorNode;
}

/// <summary>
/// Parses the weight list "'[' ['+' | '-'] number {',' ['+' | '-'] number} ']'" of a factor
/// variable (e.g. x[1, -2.5]). The variable identifier has already been consumed; the
/// next token is known to be '['. At least one weight is required.
/// </summary>
private ISymbolicExpressionTreeNode ParseFactorVariable(Token idTok, Queue<Token> tokens) {
  var factorVariableNode = (FactorVariableTreeNode)factorVar.CreateTreeNode();
  factorVariableNode.VariableName = idTok.strVal;

  tokens.Dequeue(); // skip [
  var weights = new List<double>();
  while (true) {
    // the sign is determined anew for every weight; a '-' must not carry over to the following weights
    double sign = ReadOptionalSign(tokens);
    var weightTok = tokens.Dequeue();
    if (weightTok.TokenType != TokenType.Number) throw new ArgumentException("number expected");
    weights.Add(sign * weightTok.doubleVal);

    if (tokens.Peek().TokenType != TokenType.Comma) break;
    tokens.Dequeue(); // skip comma
  }

  var rightBracketToken = tokens.Dequeue();
  if (rightBracketToken.TokenType != TokenType.RightBracket) throw new ArgumentException("closing bracket ] expected");
  factorVariableNode.Weights = weights.ToArray();
  return factorVariableNode;
}

/// <summary>
/// Consumes an optional '+' or '-' operator token and returns the corresponding sign
/// (-1.0 for '-', otherwise 1.0). Any other token is left in the queue.
/// </summary>
private static double ReadOptionalSign(Queue<Token> tokens) {
  var tok = tokens.Peek();
  if (tok.TokenType != TokenType.Operator) return 1.0;
  if (tok.strVal == "+") {
    tokens.Dequeue();
    return 1.0;
  }
  if (tok.strVal == "-") {
    tokens.Dequeue();
    return -1.0;
  }
  return 1.0;
}
[14347]493
/// <summary>
/// Parses the comma-separated arguments of a function call up to, but not including,
/// the closing parenthesis.
/// ArgList = Expr { ',' Expr }
/// </summary>
/// <exception cref="ArgumentException">Thrown when two arguments are not separated by ','.</exception>
private ISymbolicExpressionTreeNode[] ParseArgList(Queue<Token> tokens) {
  var args = new List<ISymbolicExpressionTreeNode> { ParseExpr(tokens) };
  for (Token tok = tokens.Peek(); tok.TokenType != TokenType.RightPar; tok = tokens.Peek()) {
    Token separator = tokens.Dequeue();
    if (separator.TokenType != TokenType.Comma) {
      throw new ArgumentException("expected ',' ");
    }
    args.Add(ParseExpr(tokens));
  }
  return args.ToArray();
}
[14024]505  }
506}
Note: See TracBrowser for help on using the repository browser.