Free cookie consent management tool by TermsFeed Policy Generator

source: trunk/sources/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Importer/InfixExpressionParser.cs @ 14419

Last change on this file since 14419 was 14350, checked in by gkronber, 8 years ago

#2687: added support for lagged variables to infix parser and formatter

File size: 16.6 KB
RevLine 
[14024]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.Linq;
26using System.Text;
27using HeuristicLab.Collections;
[14350]28using HeuristicLab.Common;
[14024]29using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
30
31namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
32  /// <summary>
33  /// Parses mathematical expressions in infix form. E.g. x1 * (3.0 * x2 + x3)
34  /// Identifier format (functions or variables): '_' | letter { '_' | letter | digit }
35  /// Variables names can be set under quotes "" or '' because variable names might contain spaces.
36  /// It is also possible to use functions e.g. log("x1") or real-valued constants e.g. 3.1415 .
37  /// Variable names are case sensitive. Function names are not case sensitive.
38  /// </summary>
[14026]39  public sealed class InfixExpressionParser {
[14347]40    private enum TokenType { Operator, Identifier, Number, LeftPar, RightPar, Comma, End, NA };
[14024]41    private class Token {
42      internal double doubleVal;
43      internal string strVal;
44      internal TokenType TokenType;
45    }
46
47    private class SymbolNameComparer : IEqualityComparer<ISymbol>, IComparer<ISymbol> {
48      public int Compare(ISymbol x, ISymbol y) {
49        return x.Name.CompareTo(y.Name);
50      }
51
52      public bool Equals(ISymbol x, ISymbol y) {
53        return Compare(x, y) == 0;
54      }
55
56      public int GetHashCode(ISymbol obj) {
57        return obj.Name.GetHashCode();
58      }
59    }
60    // format name <-> symbol
61    // the lookup table is also used in the corresponding formatter
62    internal static readonly BidirectionalLookup<string, ISymbol>
63      knownSymbols = new BidirectionalLookup<string, ISymbol>(StringComparer.InvariantCulture, new SymbolNameComparer());
64
65    private Constant constant = new Constant();
66    private Variable variable = new Variable();
67
68    private ProgramRootSymbol programRootSymbol = new ProgramRootSymbol();
69    private StartSymbol startSymbol = new StartSymbol();
70
71    static InfixExpressionParser() {
72      // populate bidirectional lookup
73      var dict = new Dictionary<string, ISymbol>
74      {
75        { "+", new Addition()},
76        { "/", new Division()},
77        { "*", new Multiplication()},
78        { "-", new Subtraction()},
79        { "EXP", new Exponential()},
80        { "LOG", new Logarithm()},
81        { "POW", new Power()},
82        { "ROOT", new Root()},
83        { "SQR", new Square() },
84        { "SQRT", new SquareRoot() },
85        { "SIN",new Sine()},
86        { "COS", new Cosine()},
87        { "TAN", new Tangent()},
88        { "AIRYA", new AiryA()},
89        { "AIRYB", new AiryB()},
90        { "BESSEL", new Bessel()},
91        { "COSINT", new CosineIntegral()},
92        { "SININT", new SineIntegral()},
93        { "HYPCOSINT", new HyperbolicCosineIntegral()},
94        { "HYPSININT", new HyperbolicSineIntegral()},
95        { "FRESNELSININT", new FresnelSineIntegral()},
96        { "FRESNELCOSINT", new FresnelCosineIntegral()},
97        { "NORM", new Norm()},
98        { "ERF", new Erf()},
99        { "GAMMA", new Gamma()},
100        { "PSI", new Psi()},
101        { "DAWSON", new Dawson()},
102        { "EXPINT", new ExponentialIntegralEi()},
103        { "MEAN", new Average()},
104        { "IF", new IfThenElse()},
[14347]105        { "GT", new GreaterThan()},
106        { "LT", new LessThan()},
[14024]107        { "AND", new And()},
108        { "OR", new Or()},
109        { "NOT", new Not()},
110        { "XOR", new Xor()},
111        { "DIFF", new Derivative()},
[14350]112        { "LAG", new LaggedVariable() },
[14024]113      };
114
115
116      foreach (var kvp in dict) {
117        knownSymbols.Add(kvp.Key, kvp.Value);
118      }
119    }
120
121    public ISymbolicExpressionTree Parse(string str) {
122      ISymbolicExpressionTreeNode root = programRootSymbol.CreateTreeNode();
123      ISymbolicExpressionTreeNode start = startSymbol.CreateTreeNode();
124      var allTokens = GetAllTokens(str).ToArray();
125      ISymbolicExpressionTreeNode mainBranch = ParseS(new Queue<Token>(allTokens));
126
127      // only a main branch was given => insert the main branch into the default tree template
128      root.AddSubtree(start);
129      start.AddSubtree(mainBranch);
130      return new SymbolicExpressionTree(root);
131    }
132
133    private IEnumerable<Token> GetAllTokens(string str) {
134      int pos = 0;
135      while (true) {
136        while (pos < str.Length && Char.IsWhiteSpace(str[pos])) pos++;
137        if (pos >= str.Length) {
138          yield return new Token { TokenType = TokenType.End, strVal = "" };
139          yield break;
140        }
141        if (char.IsDigit(str[pos])) {
[14347]142          // read number (=> read until white space or operator or comma)
[14024]143          var sb = new StringBuilder();
144          sb.Append(str[pos]);
145          pos++;
146          while (pos < str.Length && !char.IsWhiteSpace(str[pos])
[14319]147            && (str[pos] != '+' || str[pos - 1] == 'e' || str[pos - 1] == 'E')     // continue reading exponents
[14024]148            && (str[pos] != '-' || str[pos - 1] == 'e' || str[pos - 1] == 'E')
[14319]149            && str[pos] != '*'
[14024]150            && str[pos] != '/'
[14347]151            && str[pos] != ')'
152            && str[pos] != ',') {
[14024]153            sb.Append(str[pos]);
154            pos++;
155          }
156          double dblVal;
157          if (double.TryParse(sb.ToString(), NumberStyles.Float, CultureInfo.InvariantCulture, out dblVal))
158            yield return new Token { TokenType = TokenType.Number, strVal = sb.ToString(), doubleVal = dblVal };
159          else yield return new Token { TokenType = TokenType.NA, strVal = sb.ToString() };
160        } else if (char.IsLetter(str[pos]) || str[pos] == '_') {
161          // read ident
162          var sb = new StringBuilder();
163          sb.Append(str[pos]);
164          pos++;
165          while (pos < str.Length &&
166            (char.IsLetter(str[pos]) || str[pos] == '_' || char.IsDigit(str[pos]))) {
167            sb.Append(str[pos]);
168            pos++;
169          }
170          yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
171        } else if (str[pos] == '"') {
172          // read to next "
173          pos++;
174          var sb = new StringBuilder();
175          while (pos < str.Length && str[pos] != '"') {
176            sb.Append(str[pos]);
177            pos++;
178          }
179          if (pos < str.Length && str[pos] == '"') {
180            pos++; // skip "
181            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
182          } else
183            yield return new Token { TokenType = TokenType.NA };
184
185        } else if (str[pos] == '\'') {
186          // read to next '
187          pos++;
188          var sb = new StringBuilder();
189          while (pos < str.Length && str[pos] != '\'') {
190            sb.Append(str[pos]);
191            pos++;
192          }
193          if (pos < str.Length && str[pos] == '\'') {
194            pos++; // skip '
195            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
196          } else
197            yield return new Token { TokenType = TokenType.NA };
198        } else if (str[pos] == '+') {
199          pos++;
200          yield return new Token { TokenType = TokenType.Operator, strVal = "+" };
201        } else if (str[pos] == '-') {
202          pos++;
203          yield return new Token { TokenType = TokenType.Operator, strVal = "-" };
204        } else if (str[pos] == '/') {
205          pos++;
206          yield return new Token { TokenType = TokenType.Operator, strVal = "/" };
207        } else if (str[pos] == '*') {
208          pos++;
209          yield return new Token { TokenType = TokenType.Operator, strVal = "*" };
210        } else if (str[pos] == '(') {
211          pos++;
212          yield return new Token { TokenType = TokenType.LeftPar, strVal = "(" };
213        } else if (str[pos] == ')') {
214          pos++;
215          yield return new Token { TokenType = TokenType.RightPar, strVal = ")" };
[14347]216        } else if (str[pos] == ',') {
217          pos++;
218          yield return new Token { TokenType = TokenType.Comma, strVal = "," };
[14319]219        } else {
220          throw new ArgumentException("Invalid character: " + str[pos]);
[14024]221        }
222      }
223    }
224
[14347]225    // S       = Expr EOF
226    // Expr    = ['-' | '+'] Term { '+' Term | '-' Term }
227    // Term    = Fact { '*' Fact | '/' Fact }
228    // Fact    = '(' Expr ')' | funcId '(' ArgList ')' | varId | number
229    // ArgList = Expr { ',' Expr }
[14024]230    private ISymbolicExpressionTreeNode ParseS(Queue<Token> tokens) {
231      var expr = ParseExpr(tokens);
232
233      var endTok = tokens.Dequeue();
234      if (endTok.TokenType != TokenType.End)
235        throw new ArgumentException(string.Format("Expected end of expression (got {0})", endTok.strVal));
236
237      return expr;
238    }
239    private ISymbolicExpressionTreeNode ParseExpr(Queue<Token> tokens) {
240      var next = tokens.Peek();
241      var posTerms = new List<ISymbolicExpressionTreeNode>();
242      var negTerms = new List<ISymbolicExpressionTreeNode>();
243      bool negateFirstTerm = false;
244      if (next.TokenType == TokenType.Operator && (next.strVal == "+" || next.strVal == "-")) {
245        tokens.Dequeue();
246        if (next.strVal == "-")
247          negateFirstTerm = true;
248      }
249      var t = ParseTerm(tokens);
250      if (negateFirstTerm) negTerms.Add(t);
251      else posTerms.Add(t);
252
253      next = tokens.Peek();
254      while (next.strVal == "+" || next.strVal == "-") {
255        switch (next.strVal) {
256          case "+": {
257              tokens.Dequeue();
258              var term = ParseTerm(tokens);
259              posTerms.Add(term);
260              break;
261            }
262          case "-": {
263              tokens.Dequeue();
264              var term = ParseTerm(tokens);
265              negTerms.Add(term);
266              break;
267            }
268        }
269        next = tokens.Peek();
270      }
271
272      var sum = GetSymbol("+").CreateTreeNode();
273      foreach (var posTerm in posTerms) sum.AddSubtree(posTerm);
274      if (negTerms.Any()) {
275        if (negTerms.Count == 1) {
276          var sub = GetSymbol("-").CreateTreeNode();
277          sub.AddSubtree(negTerms.Single());
278          sum.AddSubtree(sub);
279        } else {
280          var sumNeg = GetSymbol("+").CreateTreeNode();
281          foreach (var negTerm in negTerms) sumNeg.AddSubtree(negTerm);
282
283          var constNode = (ConstantTreeNode)constant.CreateTreeNode();
284          constNode.Value = -1.0;
285          var prod = GetSymbol("*").CreateTreeNode();
286          prod.AddSubtree(constNode);
287          prod.AddSubtree(sumNeg);
288
289          sum.AddSubtree(prod);
290        }
291      }
292      if (sum.SubtreeCount == 1) return sum.Subtrees.First();
293      else return sum;
294    }
295
296    private ISymbol GetSymbol(string tok) {
297      var symb = knownSymbols.GetByFirst(tok).FirstOrDefault();
298      if (symb == null) throw new ArgumentException(string.Format("Unknown token {0} found.", tok));
299      return symb;
300    }
301
302    // Term = Fact { '*' Fact | '/' Fact }
303    private ISymbolicExpressionTreeNode ParseTerm(Queue<Token> tokens) {
304      var factors = new List<ISymbolicExpressionTreeNode>();
305      var firstFactor = ParseFact(tokens);
306      factors.Add(firstFactor);
307
308      var next = tokens.Peek();
309      while (next.strVal == "*" || next.strVal == "/") {
310        switch (next.strVal) {
311          case "*": {
312              tokens.Dequeue();
313              var fact = ParseFact(tokens);
314              factors.Add(fact);
315              break;
316            }
317          case "/": {
318              tokens.Dequeue();
319              var invFact = ParseFact(tokens);
320              var divNode = GetSymbol("/").CreateTreeNode(); // 1/x
321              divNode.AddSubtree(invFact);
322              factors.Add(divNode);
323              break;
324            }
325        }
326
327        next = tokens.Peek();
328      }
329      if (factors.Count == 1) return factors.First();
330      else {
331        var prod = GetSymbol("*").CreateTreeNode();
332        foreach (var f in factors) prod.AddSubtree(f);
333        return prod;
334      }
335    }
336
[14350]337    // Fact = '(' Expr ')' | 'LAG' '(' varId ',' ['+' | '-'] number ')' | funcId '(' ArgList ')' | varId | number
[14024]338    private ISymbolicExpressionTreeNode ParseFact(Queue<Token> tokens) {
339      var next = tokens.Peek();
340      if (next.TokenType == TokenType.LeftPar) {
341        tokens.Dequeue();
342        var expr = ParseExpr(tokens);
343        var rPar = tokens.Dequeue();
344        if (rPar.TokenType != TokenType.RightPar)
345          throw new ArgumentException("expected )");
346        return expr;
347      } else if (next.TokenType == TokenType.Identifier) {
348        var idTok = tokens.Dequeue();
349        if (tokens.Peek().TokenType == TokenType.LeftPar) {
350          // function identifier
351          var funcId = idTok.strVal.ToUpperInvariant();
352
353          var funcNode = GetSymbol(funcId).CreateTreeNode();
354          var lPar = tokens.Dequeue();
355          if (lPar.TokenType != TokenType.LeftPar)
356            throw new ArgumentException("expected (");
[14347]357
[14350]358          // handle 'lag' specifically
359          if (funcNode.Symbol is LaggedVariable) {
360            var varId = tokens.Dequeue();
361            if (varId.TokenType != TokenType.Identifier) throw new ArgumentException("Identifier expected. Format for lagged variables: \"lag(x, -1)\"");
362            var comma = tokens.Dequeue();
363            if (comma.TokenType != TokenType.Comma) throw new ArgumentException("',' expected, Format for lagged variables: \"lag(x, -1)\"");
364            double sign = 1.0;
365            if (tokens.Peek().strVal == "+" || tokens.Peek().strVal == "-") {
366              // read sign
367              var signTok = tokens.Dequeue();
368              if (signTok.strVal == "-") sign = -1.0;
369            }
370            var lagToken = tokens.Dequeue();
371            if (lagToken.TokenType != TokenType.Number) throw new ArgumentException("Number expected, Format for lagged variables: \"lag(x, -1)\"");
372            if (!lagToken.doubleVal.IsAlmost(Math.Round(lagToken.doubleVal)))
373              throw new ArgumentException("Time lags must be integer values");
374            var laggedVarNode = funcNode as LaggedVariableTreeNode;
375            laggedVarNode.VariableName = varId.strVal;
376            laggedVarNode.Lag = (int)Math.Round(sign * lagToken.doubleVal);
377            laggedVarNode.Weight = 1.0;
378          } else {
379            // functions
380            var args = ParseArgList(tokens);
381            // check number of arguments
382            if (funcNode.Symbol.MinimumArity > args.Length || funcNode.Symbol.MaximumArity < args.Length) {
383              throw new ArgumentException(string.Format("Symbol {0} requires between {1} and  {2} arguments.", funcId,
384                funcNode.Symbol.MinimumArity, funcNode.Symbol.MaximumArity));
385            }
386            foreach (var arg in args) funcNode.AddSubtree(arg);
387          }
[14347]388
[14024]389          var rPar = tokens.Dequeue();
390          if (rPar.TokenType != TokenType.RightPar)
391            throw new ArgumentException("expected )");
392
393          return funcNode;
394        } else {
395          // variable
396          var varNode = (VariableTreeNode)variable.CreateTreeNode();
397          varNode.Weight = 1.0;
398          varNode.VariableName = idTok.strVal;
399          return varNode;
400        }
401      } else if (next.TokenType == TokenType.Number) {
402        var numTok = tokens.Dequeue();
403        var constNode = (ConstantTreeNode)constant.CreateTreeNode();
404        constNode.Value = numTok.doubleVal;
405        return constNode;
406      } else {
407        throw new ArgumentException(string.Format("unexpected token in expression {0}", next.strVal));
408      }
409    }
[14347]410
411    // ArgList = Expr { ',' Expr }
412    private ISymbolicExpressionTreeNode[] ParseArgList(Queue<Token> tokens) {
413      var exprList = new List<ISymbolicExpressionTreeNode>();
414      exprList.Add(ParseExpr(tokens));
415      while (tokens.Peek().TokenType != TokenType.RightPar) {
416        var comma = tokens.Dequeue();
417        if (comma.TokenType != TokenType.Comma) throw new ArgumentException("expected ',' ");
418        exprList.Add(ParseExpr(tokens));
419      }
420      return exprList.ToArray();
421    }
[14024]422  }
423}
Note: See TracBrowser for help on using the repository browser.