Free cookie consent management tool by TermsFeed Policy Generator

source: branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Importer/InfixExpressionParser.cs @ 14761

Last change on this file since 14761 was 14761, checked in by gkronber, 7 years ago

#2650 infix formatter also produces factor variable weights, infix parser supports reading of factor variable weights

File size: 20.0 KB
RevLine 
[14024]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.Linq;
26using System.Text;
27using HeuristicLab.Collections;
[14351]28using HeuristicLab.Common;
[14024]29using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
30
31namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
32  /// <summary>
33  /// Parses mathematical expressions in infix form. E.g. x1 * (3.0 * x2 + x3)
34  /// Identifier format (functions or variables): '_' | letter { '_' | letter | digit }
[14251]35  /// Variables names and variable values can be set under quotes "" or '' because variable names might contain spaces.
36  ///   Variable = ident | " ident " | ' ident '
[14024]37  /// It is also possible to use functions e.g. log("x1") or real-valued constants e.g. 3.1415 .
38  /// Variable names are case sensitive. Function names are not case sensitive.
[14251]39  ///
40  ///
[14761]41  /// S             = Expr EOF
42  /// Expr          = ['-' | '+'] Term { '+' Term | '-' Term }
43  /// Term          = Fact { '*' Fact | '/' Fact }
44  /// Fact          = '(' Expr ')'
45  ///                 | 'LAG' '(' varId ',' ['+' | '-' ] number ')'
46  ///                 | funcId '(' ArgList ')'
47  ///                 | VarExpr | number
48  /// ArgList       = Expr { ',' Expr }
49  /// VarExpr       = varId OptFactorPart
50  /// OptFactorPart = [ ('=' varVal | '[' number {',' number } ']' ) ]
51  /// varId         =  ident | ' ident ' | " ident "
52  /// varVal        =  ident | ' ident ' | " ident "
53  /// ident         =  '_' | letter { '_' | letter | digit }
[14024]54  /// </summary>
[14026]55  public sealed class InfixExpressionParser {
[14761]56    private enum TokenType { Operator, Identifier, Number, LeftPar, RightPar, LeftBracket, RightBracket, Comma, Eq, End, NA };
[14024]57    private class Token {
58      internal double doubleVal;
59      internal string strVal;
60      internal TokenType TokenType;
61    }
62
63    private class SymbolNameComparer : IEqualityComparer<ISymbol>, IComparer<ISymbol> {
64      public int Compare(ISymbol x, ISymbol y) {
65        return x.Name.CompareTo(y.Name);
66      }
67
68      public bool Equals(ISymbol x, ISymbol y) {
69        return Compare(x, y) == 0;
70      }
71
72      public int GetHashCode(ISymbol obj) {
73        return obj.Name.GetHashCode();
74      }
75    }
76    // format name <-> symbol
77    // the lookup table is also used in the corresponding formatter
78    internal static readonly BidirectionalLookup<string, ISymbol>
79      knownSymbols = new BidirectionalLookup<string, ISymbol>(StringComparer.InvariantCulture, new SymbolNameComparer());
80
81    private Constant constant = new Constant();
82    private Variable variable = new Variable();
[14251]83    private BinaryFactorVariable binaryFactorVar = new BinaryFactorVariable();
[14761]84    private FactorVariable factorVar = new FactorVariable();
[14024]85
86    private ProgramRootSymbol programRootSymbol = new ProgramRootSymbol();
87    private StartSymbol startSymbol = new StartSymbol();
88
89    static InfixExpressionParser() {
90      // populate bidirectional lookup
91      var dict = new Dictionary<string, ISymbol>
92      {
93        { "+", new Addition()},
94        { "/", new Division()},
95        { "*", new Multiplication()},
96        { "-", new Subtraction()},
97        { "EXP", new Exponential()},
98        { "LOG", new Logarithm()},
99        { "POW", new Power()},
100        { "ROOT", new Root()},
101        { "SQR", new Square() },
102        { "SQRT", new SquareRoot() },
103        { "SIN",new Sine()},
104        { "COS", new Cosine()},
105        { "TAN", new Tangent()},
106        { "AIRYA", new AiryA()},
107        { "AIRYB", new AiryB()},
108        { "BESSEL", new Bessel()},
109        { "COSINT", new CosineIntegral()},
110        { "SININT", new SineIntegral()},
111        { "HYPCOSINT", new HyperbolicCosineIntegral()},
112        { "HYPSININT", new HyperbolicSineIntegral()},
113        { "FRESNELSININT", new FresnelSineIntegral()},
114        { "FRESNELCOSINT", new FresnelCosineIntegral()},
115        { "NORM", new Norm()},
116        { "ERF", new Erf()},
117        { "GAMMA", new Gamma()},
118        { "PSI", new Psi()},
119        { "DAWSON", new Dawson()},
120        { "EXPINT", new ExponentialIntegralEi()},
121        { "MEAN", new Average()},
122        { "IF", new IfThenElse()},
[14351]123        { "GT", new GreaterThan()},
124        { "LT", new LessThan()},
[14024]125        { "AND", new And()},
126        { "OR", new Or()},
127        { "NOT", new Not()},
128        { "XOR", new Xor()},
129        { "DIFF", new Derivative()},
[14351]130        { "LAG", new LaggedVariable() },
[14024]131      };
132
133
[14761]134      foreach(var kvp in dict) {
[14024]135        knownSymbols.Add(kvp.Key, kvp.Value);
136      }
137    }
138
139    public ISymbolicExpressionTree Parse(string str) {
140      ISymbolicExpressionTreeNode root = programRootSymbol.CreateTreeNode();
141      ISymbolicExpressionTreeNode start = startSymbol.CreateTreeNode();
142      var allTokens = GetAllTokens(str).ToArray();
143      ISymbolicExpressionTreeNode mainBranch = ParseS(new Queue<Token>(allTokens));
144
145      // only a main branch was given => insert the main branch into the default tree template
146      root.AddSubtree(start);
147      start.AddSubtree(mainBranch);
148      return new SymbolicExpressionTree(root);
149    }
150
151    private IEnumerable<Token> GetAllTokens(string str) {
152      int pos = 0;
[14761]153      while(true) {
154        while(pos < str.Length && Char.IsWhiteSpace(str[pos])) pos++;
155        if(pos >= str.Length) {
[14024]156          yield return new Token { TokenType = TokenType.End, strVal = "" };
157          yield break;
158        }
[14761]159        if(char.IsDigit(str[pos])) {
[14351]160          // read number (=> read until white space or operator or comma)
[14024]161          var sb = new StringBuilder();
162          sb.Append(str[pos]);
163          pos++;
[14761]164          while(pos < str.Length && !char.IsWhiteSpace(str[pos])
[14251]165            && (str[pos] != '+' || str[pos - 1] == 'e' || str[pos - 1] == 'E')     // continue reading exponents
[14024]166            && (str[pos] != '-' || str[pos - 1] == 'e' || str[pos - 1] == 'E')
[14251]167            && str[pos] != '*'
[14024]168            && str[pos] != '/'
[14351]169            && str[pos] != ')'
[14761]170            && str[pos] != ']'
[14351]171            && str[pos] != ',') {
[14024]172            sb.Append(str[pos]);
173            pos++;
174          }
175          double dblVal;
[14761]176          if(double.TryParse(sb.ToString(), NumberStyles.Float, CultureInfo.InvariantCulture, out dblVal))
[14024]177            yield return new Token { TokenType = TokenType.Number, strVal = sb.ToString(), doubleVal = dblVal };
178          else yield return new Token { TokenType = TokenType.NA, strVal = sb.ToString() };
[14761]179        } else if(char.IsLetter(str[pos]) || str[pos] == '_') {
[14024]180          // read ident
181          var sb = new StringBuilder();
182          sb.Append(str[pos]);
183          pos++;
[14761]184          while(pos < str.Length &&
[14024]185            (char.IsLetter(str[pos]) || str[pos] == '_' || char.IsDigit(str[pos]))) {
186            sb.Append(str[pos]);
187            pos++;
188          }
189          yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
[14761]190        } else if(str[pos] == '"') {
[14024]191          // read to next "
192          pos++;
193          var sb = new StringBuilder();
[14761]194          while(pos < str.Length && str[pos] != '"') {
[14024]195            sb.Append(str[pos]);
196            pos++;
197          }
[14761]198          if(pos < str.Length && str[pos] == '"') {
[14024]199            pos++; // skip "
200            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
201          } else
202            yield return new Token { TokenType = TokenType.NA };
203
[14761]204        } else if(str[pos] == '\'') {
[14024]205          // read to next '
206          pos++;
207          var sb = new StringBuilder();
[14761]208          while(pos < str.Length && str[pos] != '\'') {
[14024]209            sb.Append(str[pos]);
210            pos++;
211          }
[14761]212          if(pos < str.Length && str[pos] == '\'') {
[14024]213            pos++; // skip '
214            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
215          } else
216            yield return new Token { TokenType = TokenType.NA };
[14761]217        } else if(str[pos] == '+') {
[14024]218          pos++;
219          yield return new Token { TokenType = TokenType.Operator, strVal = "+" };
[14761]220        } else if(str[pos] == '-') {
[14024]221          pos++;
222          yield return new Token { TokenType = TokenType.Operator, strVal = "-" };
[14761]223        } else if(str[pos] == '/') {
[14024]224          pos++;
225          yield return new Token { TokenType = TokenType.Operator, strVal = "/" };
[14761]226        } else if(str[pos] == '*') {
[14024]227          pos++;
228          yield return new Token { TokenType = TokenType.Operator, strVal = "*" };
[14761]229        } else if(str[pos] == '(') {
[14024]230          pos++;
231          yield return new Token { TokenType = TokenType.LeftPar, strVal = "(" };
[14761]232        } else if(str[pos] == ')') {
[14024]233          pos++;
234          yield return new Token { TokenType = TokenType.RightPar, strVal = ")" };
[14761]235        } else if(str[pos] == '[') {
[14251]236          pos++;
[14761]237          yield return new Token { TokenType = TokenType.LeftBracket, strVal = "[" };
238        } else if(str[pos] == ']') {
239          pos++;
240          yield return new Token { TokenType = TokenType.RightBracket, strVal = "]" };
241        } else if(str[pos] == '=') {
242          pos++;
[14251]243          yield return new Token { TokenType = TokenType.Eq, strVal = "=" };
[14761]244        } else if(str[pos] == ',') {
[14351]245          pos++;
246          yield return new Token { TokenType = TokenType.Comma, strVal = "," };
[14330]247        } else {
248          throw new ArgumentException("Invalid character: " + str[pos]);
[14024]249        }
250      }
251    }
[14761]252    /// S             = Expr EOF
[14024]253    private ISymbolicExpressionTreeNode ParseS(Queue<Token> tokens) {
254      var expr = ParseExpr(tokens);
255
256      var endTok = tokens.Dequeue();
[14761]257      if(endTok.TokenType != TokenType.End)
[14024]258        throw new ArgumentException(string.Format("Expected end of expression (got {0})", endTok.strVal));
259
260      return expr;
261    }
[14761]262
263    /// Expr          = ['-' | '+'] Term { '+' Term | '-' Term }
[14024]264    private ISymbolicExpressionTreeNode ParseExpr(Queue<Token> tokens) {
265      var next = tokens.Peek();
266      var posTerms = new List<ISymbolicExpressionTreeNode>();
267      var negTerms = new List<ISymbolicExpressionTreeNode>();
268      bool negateFirstTerm = false;
[14761]269      if(next.TokenType == TokenType.Operator && (next.strVal == "+" || next.strVal == "-")) {
[14024]270        tokens.Dequeue();
[14761]271        if(next.strVal == "-")
[14024]272          negateFirstTerm = true;
273      }
274      var t = ParseTerm(tokens);
[14761]275      if(negateFirstTerm) negTerms.Add(t);
[14024]276      else posTerms.Add(t);
277
278      next = tokens.Peek();
[14761]279      while(next.strVal == "+" || next.strVal == "-") {
280        switch(next.strVal) {
[14024]281          case "+": {
282              tokens.Dequeue();
283              var term = ParseTerm(tokens);
284              posTerms.Add(term);
285              break;
286            }
287          case "-": {
288              tokens.Dequeue();
289              var term = ParseTerm(tokens);
290              negTerms.Add(term);
291              break;
292            }
293        }
294        next = tokens.Peek();
295      }
296
297      var sum = GetSymbol("+").CreateTreeNode();
[14761]298      foreach(var posTerm in posTerms) sum.AddSubtree(posTerm);
299      if(negTerms.Any()) {
300        if(negTerms.Count == 1) {
[14024]301          var sub = GetSymbol("-").CreateTreeNode();
302          sub.AddSubtree(negTerms.Single());
303          sum.AddSubtree(sub);
304        } else {
305          var sumNeg = GetSymbol("+").CreateTreeNode();
[14761]306          foreach(var negTerm in negTerms) sumNeg.AddSubtree(negTerm);
[14024]307
308          var constNode = (ConstantTreeNode)constant.CreateTreeNode();
309          constNode.Value = -1.0;
310          var prod = GetSymbol("*").CreateTreeNode();
311          prod.AddSubtree(constNode);
312          prod.AddSubtree(sumNeg);
313
314          sum.AddSubtree(prod);
315        }
316      }
[14761]317      if(sum.SubtreeCount == 1) return sum.Subtrees.First();
[14024]318      else return sum;
319    }
320
321    private ISymbol GetSymbol(string tok) {
322      var symb = knownSymbols.GetByFirst(tok).FirstOrDefault();
[14761]323      if(symb == null) throw new ArgumentException(string.Format("Unknown token {0} found.", tok));
[14024]324      return symb;
325    }
326
[14761]327    /// Term          = Fact { '*' Fact | '/' Fact }
[14024]328    private ISymbolicExpressionTreeNode ParseTerm(Queue<Token> tokens) {
329      var factors = new List<ISymbolicExpressionTreeNode>();
330      var firstFactor = ParseFact(tokens);
331      factors.Add(firstFactor);
332
333      var next = tokens.Peek();
[14761]334      while(next.strVal == "*" || next.strVal == "/") {
335        switch(next.strVal) {
[14024]336          case "*": {
337              tokens.Dequeue();
338              var fact = ParseFact(tokens);
339              factors.Add(fact);
340              break;
341            }
342          case "/": {
343              tokens.Dequeue();
344              var invFact = ParseFact(tokens);
345              var divNode = GetSymbol("/").CreateTreeNode(); // 1/x
346              divNode.AddSubtree(invFact);
347              factors.Add(divNode);
348              break;
349            }
350        }
351
352        next = tokens.Peek();
353      }
[14761]354      if(factors.Count == 1) return factors.First();
[14024]355      else {
356        var prod = GetSymbol("*").CreateTreeNode();
[14761]357        foreach(var f in factors) prod.AddSubtree(f);
[14024]358        return prod;
359      }
360    }
361
[14761]362    /// Fact          = '(' Expr ')'
363    ///                 | 'LAG' '(' varId ',' ['+' | '-' ] number ')'
364    ///                 | funcId '(' ArgList ')'
365    ///                 | VarExpr | number
366    /// ArgList       = Expr { ',' Expr }
367    /// VarExpr       = varId OptFactorPart
368    /// OptFactorPart = [ ('=' varVal | '[' number {',' number } ']' ) ]
369    /// varId         =  ident | ' ident ' | " ident "
370    /// varVal        =  ident | ' ident ' | " ident "
371    /// ident         =  '_' | letter { '_' | letter | digit }
[14024]372    private ISymbolicExpressionTreeNode ParseFact(Queue<Token> tokens) {
373      var next = tokens.Peek();
[14761]374      if(next.TokenType == TokenType.LeftPar) {
[14024]375        tokens.Dequeue();
376        var expr = ParseExpr(tokens);
377        var rPar = tokens.Dequeue();
[14761]378        if(rPar.TokenType != TokenType.RightPar)
[14024]379          throw new ArgumentException("expected )");
380        return expr;
[14761]381      } else if(next.TokenType == TokenType.Identifier) {
[14024]382        var idTok = tokens.Dequeue();
[14761]383        if(tokens.Peek().TokenType == TokenType.LeftPar) {
384          // function identifier or LAG
[14024]385          var funcId = idTok.strVal.ToUpperInvariant();
386
387          var funcNode = GetSymbol(funcId).CreateTreeNode();
388          var lPar = tokens.Dequeue();
[14761]389          if(lPar.TokenType != TokenType.LeftPar)
[14024]390            throw new ArgumentException("expected (");
[14351]391
392          // handle 'lag' specifically
[14761]393          if(funcNode.Symbol is LaggedVariable) {
[14351]394            var varId = tokens.Dequeue();
[14761]395            if(varId.TokenType != TokenType.Identifier) throw new ArgumentException("Identifier expected. Format for lagged variables: \"lag(x, -1)\"");
[14351]396            var comma = tokens.Dequeue();
[14761]397            if(comma.TokenType != TokenType.Comma) throw new ArgumentException("',' expected, Format for lagged variables: \"lag(x, -1)\"");
[14351]398            double sign = 1.0;
[14761]399            if(tokens.Peek().strVal == "+" || tokens.Peek().strVal == "-") {
[14351]400              // read sign
401              var signTok = tokens.Dequeue();
[14761]402              if(signTok.strVal == "-") sign = -1.0;
[14351]403            }
404            var lagToken = tokens.Dequeue();
[14761]405            if(lagToken.TokenType != TokenType.Number) throw new ArgumentException("Number expected, Format for lagged variables: \"lag(x, -1)\"");
406            if(!lagToken.doubleVal.IsAlmost(Math.Round(lagToken.doubleVal)))
[14351]407              throw new ArgumentException("Time lags must be integer values");
408            var laggedVarNode = funcNode as LaggedVariableTreeNode;
409            laggedVarNode.VariableName = varId.strVal;
410            laggedVarNode.Lag = (int)Math.Round(sign * lagToken.doubleVal);
411            laggedVarNode.Weight = 1.0;
412          } else {
413            // functions
414            var args = ParseArgList(tokens);
415            // check number of arguments
[14761]416            if(funcNode.Symbol.MinimumArity > args.Length || funcNode.Symbol.MaximumArity < args.Length) {
[14351]417              throw new ArgumentException(string.Format("Symbol {0} requires between {1} and  {2} arguments.", funcId,
418                funcNode.Symbol.MinimumArity, funcNode.Symbol.MaximumArity));
419            }
[14761]420            foreach(var arg in args) funcNode.AddSubtree(arg);
[14351]421          }
422
[14024]423          var rPar = tokens.Dequeue();
[14761]424          if(rPar.TokenType != TokenType.RightPar)
[14024]425            throw new ArgumentException("expected )");
426
427          return funcNode;
428        } else {
429          // variable
[14761]430          if(tokens.Peek().TokenType == TokenType.Eq) {
[14251]431            // binary factor
432            tokens.Dequeue(); // skip Eq
433            var valTok = tokens.Dequeue();
[14761]434            if(valTok.TokenType != TokenType.Identifier) throw new ArgumentException("expected identifier");
[14251]435            var binFactorNode = (BinaryFactorVariableTreeNode)binaryFactorVar.CreateTreeNode();
436            binFactorNode.Weight = 1.0;
437            binFactorNode.VariableName = idTok.strVal;
438            binFactorNode.VariableValue = valTok.strVal;
439            return binFactorNode;
[14761]440          } else if(tokens.Peek().TokenType == TokenType.LeftBracket) {
441            // factor variable
442            var factorVariableNode = (FactorVariableTreeNode) factorVar.CreateTreeNode();
443            factorVariableNode.VariableName = idTok.strVal;
444
445            tokens.Dequeue(); // skip [
446            var weights = new List<double>();
447            // at least one weight is necessary
448            if(tokens.Peek().TokenType != TokenType.Number) throw new ArgumentException("number expected");
449            var weightTok = tokens.Dequeue();
450            weights.Add(weightTok.doubleVal);
451            while(tokens.Peek().TokenType == TokenType.Comma) {
452              // skip comma
453              tokens.Dequeue();
454              weightTok = tokens.Dequeue();
455              if(weightTok.TokenType != TokenType.Number) throw new ArgumentException("number expected");
456              weights.Add(weightTok.doubleVal);
457            }
458            var rightBracketToken = tokens.Dequeue();
459            if(rightBracketToken.TokenType != TokenType.RightBracket) throw new ArgumentException("closing bracket ] expected");
460            factorVariableNode.Weights = weights.ToArray();
461            return factorVariableNode;
[14251]462          } else {
463            // variable
464            var varNode = (VariableTreeNode)variable.CreateTreeNode();
465            varNode.Weight = 1.0;
466            varNode.VariableName = idTok.strVal;
467            return varNode;
468          }
[14024]469        }
[14761]470      } else if(next.TokenType == TokenType.Number) {
[14024]471        var numTok = tokens.Dequeue();
472        var constNode = (ConstantTreeNode)constant.CreateTreeNode();
473        constNode.Value = numTok.doubleVal;
474        return constNode;
475      } else {
476        throw new ArgumentException(string.Format("unexpected token in expression {0}", next.strVal));
477      }
478    }
[14251]479
[14351]480    // ArgList = Expr { ',' Expr }
481    private ISymbolicExpressionTreeNode[] ParseArgList(Queue<Token> tokens) {
482      var exprList = new List<ISymbolicExpressionTreeNode>();
483      exprList.Add(ParseExpr(tokens));
[14761]484      while(tokens.Peek().TokenType != TokenType.RightPar) {
[14351]485        var comma = tokens.Dequeue();
[14761]486        if(comma.TokenType != TokenType.Comma) throw new ArgumentException("expected ',' ");
[14351]487        exprList.Add(ParseExpr(tokens));
488      }
489      return exprList.ToArray();
490    }
[14024]491  }
492}
Note: See TracBrowser for help on using the repository browser.