Free cookie consent management tool by TermsFeed Policy Generator

source: branches/PersistenceReintegration/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Importer/InfixExpressionParser.cs @ 15018

Last change on this file since 15018 was 14929, checked in by gkronber, 8 years ago

#2520 fixed unit tests for new persistence: loading & storing all samples

File size: 20.9 KB
RevLine 
[14024]1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.Linq;
26using System.Text;
27using HeuristicLab.Collections;
[14350]28using HeuristicLab.Common;
[14024]29using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
[14929]30using HeuristicLab.Persistence;
[14024]31
32namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
33  /// <summary>
34  /// Parses mathematical expressions in infix form. E.g. x1 * (3.0 * x2 + x3)
35  /// Identifier format (functions or variables): '_' | letter { '_' | letter | digit }
[14826]36  /// Variables names and variable values can be set under quotes "" or '' because variable names might contain spaces.
37  ///   Variable = ident | " ident " | ' ident '
[14024]38  /// It is also possible to use functions e.g. log("x1") or real-valued constants e.g. 3.1415 .
39  /// Variable names are case sensitive. Function names are not case sensitive.
[14826]40  ///
41  ///
42  /// S             = Expr EOF
43  /// Expr          = ['-' | '+'] Term { '+' Term | '-' Term }
44  /// Term          = Fact { '*' Fact | '/' Fact }
45  /// Fact          = '(' Expr ')'
46  ///                 | 'LAG' '(' varId ',' ['+' | '-' ] number ')'
47  ///                 | funcId '(' ArgList ')'
48  ///                 | VarExpr | number
49  /// ArgList       = Expr { ',' Expr }
50  /// VarExpr       = varId OptFactorPart
51  /// OptFactorPart = [ ('=' varVal | '[' ['+' | '-' ] number {',' ['+' | '-' ] number } ']' ) ]
52  /// varId         =  ident | ' ident ' | " ident "
53  /// varVal        =  ident | ' ident ' | " ident "
54  /// ident         =  '_' | letter { '_' | letter | digit }
[14024]55  /// </summary>
[14026]56  public sealed class InfixExpressionParser {
[14929]57    [StorableType("2923f64d-5d2b-4723-bf00-e2bd1c809515")]
[14826]58    private enum TokenType { Operator, Identifier, Number, LeftPar, RightPar, LeftBracket, RightBracket, Comma, Eq, End, NA };
[14024]59    private class Token {
60      internal double doubleVal;
61      internal string strVal;
62      internal TokenType TokenType;
63    }
64
65    private class SymbolNameComparer : IEqualityComparer<ISymbol>, IComparer<ISymbol> {
66      public int Compare(ISymbol x, ISymbol y) {
67        return x.Name.CompareTo(y.Name);
68      }
69
70      public bool Equals(ISymbol x, ISymbol y) {
71        return Compare(x, y) == 0;
72      }
73
74      public int GetHashCode(ISymbol obj) {
75        return obj.Name.GetHashCode();
76      }
77    }
78    // format name <-> symbol
79    // the lookup table is also used in the corresponding formatter
80    internal static readonly BidirectionalLookup<string, ISymbol>
81      knownSymbols = new BidirectionalLookup<string, ISymbol>(StringComparer.InvariantCulture, new SymbolNameComparer());
82
83    private Constant constant = new Constant();
84    private Variable variable = new Variable();
[14826]85    private BinaryFactorVariable binaryFactorVar = new BinaryFactorVariable();
86    private FactorVariable factorVar = new FactorVariable();
[14024]87
88    private ProgramRootSymbol programRootSymbol = new ProgramRootSymbol();
89    private StartSymbol startSymbol = new StartSymbol();
90
91    static InfixExpressionParser() {
92      // populate bidirectional lookup
93      var dict = new Dictionary<string, ISymbol>
94      {
95        { "+", new Addition()},
96        { "/", new Division()},
97        { "*", new Multiplication()},
98        { "-", new Subtraction()},
99        { "EXP", new Exponential()},
100        { "LOG", new Logarithm()},
101        { "POW", new Power()},
102        { "ROOT", new Root()},
103        { "SQR", new Square() },
104        { "SQRT", new SquareRoot() },
105        { "SIN",new Sine()},
106        { "COS", new Cosine()},
107        { "TAN", new Tangent()},
108        { "AIRYA", new AiryA()},
109        { "AIRYB", new AiryB()},
110        { "BESSEL", new Bessel()},
111        { "COSINT", new CosineIntegral()},
112        { "SININT", new SineIntegral()},
113        { "HYPCOSINT", new HyperbolicCosineIntegral()},
114        { "HYPSININT", new HyperbolicSineIntegral()},
115        { "FRESNELSININT", new FresnelSineIntegral()},
116        { "FRESNELCOSINT", new FresnelCosineIntegral()},
117        { "NORM", new Norm()},
118        { "ERF", new Erf()},
119        { "GAMMA", new Gamma()},
120        { "PSI", new Psi()},
121        { "DAWSON", new Dawson()},
122        { "EXPINT", new ExponentialIntegralEi()},
123        { "MEAN", new Average()},
124        { "IF", new IfThenElse()},
[14347]125        { "GT", new GreaterThan()},
126        { "LT", new LessThan()},
[14024]127        { "AND", new And()},
128        { "OR", new Or()},
129        { "NOT", new Not()},
130        { "XOR", new Xor()},
131        { "DIFF", new Derivative()},
[14350]132        { "LAG", new LaggedVariable() },
[14024]133      };
134
135
136      foreach (var kvp in dict) {
137        knownSymbols.Add(kvp.Key, kvp.Value);
138      }
139    }
140
141    public ISymbolicExpressionTree Parse(string str) {
142      ISymbolicExpressionTreeNode root = programRootSymbol.CreateTreeNode();
143      ISymbolicExpressionTreeNode start = startSymbol.CreateTreeNode();
144      var allTokens = GetAllTokens(str).ToArray();
145      ISymbolicExpressionTreeNode mainBranch = ParseS(new Queue<Token>(allTokens));
146
147      // only a main branch was given => insert the main branch into the default tree template
148      root.AddSubtree(start);
149      start.AddSubtree(mainBranch);
150      return new SymbolicExpressionTree(root);
151    }
152
153    private IEnumerable<Token> GetAllTokens(string str) {
154      int pos = 0;
155      while (true) {
156        while (pos < str.Length && Char.IsWhiteSpace(str[pos])) pos++;
157        if (pos >= str.Length) {
158          yield return new Token { TokenType = TokenType.End, strVal = "" };
159          yield break;
160        }
161        if (char.IsDigit(str[pos])) {
[14347]162          // read number (=> read until white space or operator or comma)
[14024]163          var sb = new StringBuilder();
164          sb.Append(str[pos]);
165          pos++;
166          while (pos < str.Length && !char.IsWhiteSpace(str[pos])
[14319]167            && (str[pos] != '+' || str[pos - 1] == 'e' || str[pos - 1] == 'E')     // continue reading exponents
[14024]168            && (str[pos] != '-' || str[pos - 1] == 'e' || str[pos - 1] == 'E')
[14319]169            && str[pos] != '*'
[14024]170            && str[pos] != '/'
[14347]171            && str[pos] != ')'
[14826]172            && str[pos] != ']'
[14347]173            && str[pos] != ',') {
[14024]174            sb.Append(str[pos]);
175            pos++;
176          }
177          double dblVal;
178          if (double.TryParse(sb.ToString(), NumberStyles.Float, CultureInfo.InvariantCulture, out dblVal))
179            yield return new Token { TokenType = TokenType.Number, strVal = sb.ToString(), doubleVal = dblVal };
180          else yield return new Token { TokenType = TokenType.NA, strVal = sb.ToString() };
181        } else if (char.IsLetter(str[pos]) || str[pos] == '_') {
182          // read ident
183          var sb = new StringBuilder();
184          sb.Append(str[pos]);
185          pos++;
186          while (pos < str.Length &&
187            (char.IsLetter(str[pos]) || str[pos] == '_' || char.IsDigit(str[pos]))) {
188            sb.Append(str[pos]);
189            pos++;
190          }
191          yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
192        } else if (str[pos] == '"') {
193          // read to next "
194          pos++;
195          var sb = new StringBuilder();
196          while (pos < str.Length && str[pos] != '"') {
197            sb.Append(str[pos]);
198            pos++;
199          }
200          if (pos < str.Length && str[pos] == '"') {
201            pos++; // skip "
202            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
203          } else
204            yield return new Token { TokenType = TokenType.NA };
205
206        } else if (str[pos] == '\'') {
207          // read to next '
208          pos++;
209          var sb = new StringBuilder();
210          while (pos < str.Length && str[pos] != '\'') {
211            sb.Append(str[pos]);
212            pos++;
213          }
214          if (pos < str.Length && str[pos] == '\'') {
215            pos++; // skip '
216            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
217          } else
218            yield return new Token { TokenType = TokenType.NA };
219        } else if (str[pos] == '+') {
220          pos++;
221          yield return new Token { TokenType = TokenType.Operator, strVal = "+" };
222        } else if (str[pos] == '-') {
223          pos++;
224          yield return new Token { TokenType = TokenType.Operator, strVal = "-" };
225        } else if (str[pos] == '/') {
226          pos++;
227          yield return new Token { TokenType = TokenType.Operator, strVal = "/" };
228        } else if (str[pos] == '*') {
229          pos++;
230          yield return new Token { TokenType = TokenType.Operator, strVal = "*" };
231        } else if (str[pos] == '(') {
232          pos++;
233          yield return new Token { TokenType = TokenType.LeftPar, strVal = "(" };
234        } else if (str[pos] == ')') {
235          pos++;
236          yield return new Token { TokenType = TokenType.RightPar, strVal = ")" };
[14826]237        } else if (str[pos] == '[') {
238          pos++;
239          yield return new Token { TokenType = TokenType.LeftBracket, strVal = "[" };
240        } else if (str[pos] == ']') {
241          pos++;
242          yield return new Token { TokenType = TokenType.RightBracket, strVal = "]" };
243        } else if (str[pos] == '=') {
244          pos++;
245          yield return new Token { TokenType = TokenType.Eq, strVal = "=" };
[14347]246        } else if (str[pos] == ',') {
247          pos++;
248          yield return new Token { TokenType = TokenType.Comma, strVal = "," };
[14319]249        } else {
250          throw new ArgumentException("Invalid character: " + str[pos]);
[14024]251        }
252      }
253    }
[14826]254    /// S             = Expr EOF
[14024]255    private ISymbolicExpressionTreeNode ParseS(Queue<Token> tokens) {
256      var expr = ParseExpr(tokens);
257
258      var endTok = tokens.Dequeue();
259      if (endTok.TokenType != TokenType.End)
260        throw new ArgumentException(string.Format("Expected end of expression (got {0})", endTok.strVal));
261
262      return expr;
263    }
[14826]264
265    /// Expr          = ['-' | '+'] Term { '+' Term | '-' Term }
[14024]266    private ISymbolicExpressionTreeNode ParseExpr(Queue<Token> tokens) {
267      var next = tokens.Peek();
268      var posTerms = new List<ISymbolicExpressionTreeNode>();
269      var negTerms = new List<ISymbolicExpressionTreeNode>();
270      bool negateFirstTerm = false;
271      if (next.TokenType == TokenType.Operator && (next.strVal == "+" || next.strVal == "-")) {
272        tokens.Dequeue();
273        if (next.strVal == "-")
274          negateFirstTerm = true;
275      }
276      var t = ParseTerm(tokens);
277      if (negateFirstTerm) negTerms.Add(t);
278      else posTerms.Add(t);
279
280      next = tokens.Peek();
281      while (next.strVal == "+" || next.strVal == "-") {
282        switch (next.strVal) {
283          case "+": {
284              tokens.Dequeue();
285              var term = ParseTerm(tokens);
286              posTerms.Add(term);
287              break;
288            }
289          case "-": {
290              tokens.Dequeue();
291              var term = ParseTerm(tokens);
292              negTerms.Add(term);
293              break;
294            }
295        }
296        next = tokens.Peek();
297      }
298
299      var sum = GetSymbol("+").CreateTreeNode();
300      foreach (var posTerm in posTerms) sum.AddSubtree(posTerm);
301      if (negTerms.Any()) {
302        if (negTerms.Count == 1) {
303          var sub = GetSymbol("-").CreateTreeNode();
304          sub.AddSubtree(negTerms.Single());
305          sum.AddSubtree(sub);
306        } else {
307          var sumNeg = GetSymbol("+").CreateTreeNode();
308          foreach (var negTerm in negTerms) sumNeg.AddSubtree(negTerm);
309
310          var constNode = (ConstantTreeNode)constant.CreateTreeNode();
311          constNode.Value = -1.0;
312          var prod = GetSymbol("*").CreateTreeNode();
313          prod.AddSubtree(constNode);
314          prod.AddSubtree(sumNeg);
315
316          sum.AddSubtree(prod);
317        }
318      }
319      if (sum.SubtreeCount == 1) return sum.Subtrees.First();
320      else return sum;
321    }
322
323    private ISymbol GetSymbol(string tok) {
324      var symb = knownSymbols.GetByFirst(tok).FirstOrDefault();
325      if (symb == null) throw new ArgumentException(string.Format("Unknown token {0} found.", tok));
326      return symb;
327    }
328
[14826]329    /// Term          = Fact { '*' Fact | '/' Fact }
[14024]330    private ISymbolicExpressionTreeNode ParseTerm(Queue<Token> tokens) {
331      var factors = new List<ISymbolicExpressionTreeNode>();
332      var firstFactor = ParseFact(tokens);
333      factors.Add(firstFactor);
334
335      var next = tokens.Peek();
336      while (next.strVal == "*" || next.strVal == "/") {
337        switch (next.strVal) {
338          case "*": {
339              tokens.Dequeue();
340              var fact = ParseFact(tokens);
341              factors.Add(fact);
342              break;
343            }
344          case "/": {
345              tokens.Dequeue();
346              var invFact = ParseFact(tokens);
347              var divNode = GetSymbol("/").CreateTreeNode(); // 1/x
348              divNode.AddSubtree(invFact);
349              factors.Add(divNode);
350              break;
351            }
352        }
353
354        next = tokens.Peek();
355      }
356      if (factors.Count == 1) return factors.First();
357      else {
358        var prod = GetSymbol("*").CreateTreeNode();
359        foreach (var f in factors) prod.AddSubtree(f);
360        return prod;
361      }
362    }
363
[14826]364    /// Fact          = '(' Expr ')'
365    ///                 | 'LAG' '(' varId ',' ['+' | '-' ] number ')'
366    ///                 | funcId '(' ArgList ')'
367    ///                 | VarExpr | number
368    /// ArgList       = Expr { ',' Expr }
369    /// VarExpr       = varId OptFactorPart
370    /// OptFactorPart = [ ('=' varVal | '[' ['+' | '-' ] number {',' ['+' | '-' ] number } ']' ) ]
371    /// varId         =  ident | ' ident ' | " ident "
372    /// varVal        =  ident | ' ident ' | " ident "
373    /// ident         =  '_' | letter { '_' | letter | digit }
[14024]374    private ISymbolicExpressionTreeNode ParseFact(Queue<Token> tokens) {
375      var next = tokens.Peek();
376      if (next.TokenType == TokenType.LeftPar) {
377        tokens.Dequeue();
378        var expr = ParseExpr(tokens);
379        var rPar = tokens.Dequeue();
380        if (rPar.TokenType != TokenType.RightPar)
381          throw new ArgumentException("expected )");
382        return expr;
383      } else if (next.TokenType == TokenType.Identifier) {
384        var idTok = tokens.Dequeue();
385        if (tokens.Peek().TokenType == TokenType.LeftPar) {
[14826]386          // function identifier or LAG
[14024]387          var funcId = idTok.strVal.ToUpperInvariant();
388
389          var funcNode = GetSymbol(funcId).CreateTreeNode();
390          var lPar = tokens.Dequeue();
391          if (lPar.TokenType != TokenType.LeftPar)
392            throw new ArgumentException("expected (");
[14347]393
[14350]394          // handle 'lag' specifically
395          if (funcNode.Symbol is LaggedVariable) {
396            var varId = tokens.Dequeue();
397            if (varId.TokenType != TokenType.Identifier) throw new ArgumentException("Identifier expected. Format for lagged variables: \"lag(x, -1)\"");
398            var comma = tokens.Dequeue();
399            if (comma.TokenType != TokenType.Comma) throw new ArgumentException("',' expected, Format for lagged variables: \"lag(x, -1)\"");
400            double sign = 1.0;
401            if (tokens.Peek().strVal == "+" || tokens.Peek().strVal == "-") {
402              // read sign
403              var signTok = tokens.Dequeue();
404              if (signTok.strVal == "-") sign = -1.0;
405            }
406            var lagToken = tokens.Dequeue();
407            if (lagToken.TokenType != TokenType.Number) throw new ArgumentException("Number expected, Format for lagged variables: \"lag(x, -1)\"");
408            if (!lagToken.doubleVal.IsAlmost(Math.Round(lagToken.doubleVal)))
409              throw new ArgumentException("Time lags must be integer values");
410            var laggedVarNode = funcNode as LaggedVariableTreeNode;
411            laggedVarNode.VariableName = varId.strVal;
412            laggedVarNode.Lag = (int)Math.Round(sign * lagToken.doubleVal);
413            laggedVarNode.Weight = 1.0;
414          } else {
415            // functions
416            var args = ParseArgList(tokens);
417            // check number of arguments
418            if (funcNode.Symbol.MinimumArity > args.Length || funcNode.Symbol.MaximumArity < args.Length) {
419              throw new ArgumentException(string.Format("Symbol {0} requires between {1} and  {2} arguments.", funcId,
420                funcNode.Symbol.MinimumArity, funcNode.Symbol.MaximumArity));
421            }
422            foreach (var arg in args) funcNode.AddSubtree(arg);
423          }
[14347]424
[14024]425          var rPar = tokens.Dequeue();
426          if (rPar.TokenType != TokenType.RightPar)
427            throw new ArgumentException("expected )");
428
429          return funcNode;
430        } else {
431          // variable
[14826]432          if (tokens.Peek().TokenType == TokenType.Eq) {
433            // binary factor
434            tokens.Dequeue(); // skip Eq
435            var valTok = tokens.Dequeue();
436            if (valTok.TokenType != TokenType.Identifier) throw new ArgumentException("expected identifier");
437            var binFactorNode = (BinaryFactorVariableTreeNode)binaryFactorVar.CreateTreeNode();
438            binFactorNode.Weight = 1.0;
439            binFactorNode.VariableName = idTok.strVal;
440            binFactorNode.VariableValue = valTok.strVal;
441            return binFactorNode;
442          } else if (tokens.Peek().TokenType == TokenType.LeftBracket) {
443            // factor variable
444            var factorVariableNode = (FactorVariableTreeNode)factorVar.CreateTreeNode();
445            factorVariableNode.VariableName = idTok.strVal;
446
447            tokens.Dequeue(); // skip [
448            var weights = new List<double>();
449            // at least one weight is necessary
450            var sign = 1.0;
451            if (tokens.Peek().TokenType == TokenType.Operator) {
452              var opToken = tokens.Dequeue();
453              if (opToken.strVal == "+") sign = 1.0;
454              else if (opToken.strVal == "-") sign = -1.0;
455              else throw new ArgumentException();
456            }
457            if (tokens.Peek().TokenType != TokenType.Number) throw new ArgumentException("number expected");
458            var weightTok = tokens.Dequeue();
459            weights.Add(sign * weightTok.doubleVal);
460            while (tokens.Peek().TokenType == TokenType.Comma) {
461              // skip comma
462              tokens.Dequeue();
463              if (tokens.Peek().TokenType == TokenType.Operator) {
464                var opToken = tokens.Dequeue();
465                if (opToken.strVal == "+") sign = 1.0;
466                else if (opToken.strVal == "-") sign = -1.0;
467                else throw new ArgumentException();
468              }
469              weightTok = tokens.Dequeue();
470              if (weightTok.TokenType != TokenType.Number) throw new ArgumentException("number expected");
471              weights.Add(sign * weightTok.doubleVal);
472            }
473            var rightBracketToken = tokens.Dequeue();
474            if (rightBracketToken.TokenType != TokenType.RightBracket) throw new ArgumentException("closing bracket ] expected");
475            factorVariableNode.Weights = weights.ToArray();
476            return factorVariableNode;
477          } else {
478            // variable
479            var varNode = (VariableTreeNode)variable.CreateTreeNode();
480            varNode.Weight = 1.0;
481            varNode.VariableName = idTok.strVal;
482            return varNode;
483          }
[14024]484        }
485      } else if (next.TokenType == TokenType.Number) {
486        var numTok = tokens.Dequeue();
487        var constNode = (ConstantTreeNode)constant.CreateTreeNode();
488        constNode.Value = numTok.doubleVal;
489        return constNode;
490      } else {
491        throw new ArgumentException(string.Format("unexpected token in expression {0}", next.strVal));
492      }
493    }
[14347]494
495    // ArgList = Expr { ',' Expr }
496    private ISymbolicExpressionTreeNode[] ParseArgList(Queue<Token> tokens) {
497      var exprList = new List<ISymbolicExpressionTreeNode>();
498      exprList.Add(ParseExpr(tokens));
499      while (tokens.Peek().TokenType != TokenType.RightPar) {
500        var comma = tokens.Dequeue();
501        if (comma.TokenType != TokenType.Comma) throw new ArgumentException("expected ',' ");
502        exprList.Add(ParseExpr(tokens));
503      }
504      return exprList.ToArray();
505    }
[14024]506  }
507}
Note: See TracBrowser for help on using the repository browser.