source: branches/symbreg-factors-2650/HeuristicLab.Problems.DataAnalysis.Symbolic/3.4/Importer/InfixExpressionParser.cs @ 14351

Last change on this file since 14351 was 14351, checked in by gkronber, 6 years ago

#2650: merged r14332:14350 from trunk to branch

File size: 17.9 KB
Line 
1#region License Information
2/* HeuristicLab
3 * Copyright (C) 2002-2016 Heuristic and Evolutionary Algorithms Laboratory (HEAL)
4 *
5 * This file is part of HeuristicLab.
6 *
7 * HeuristicLab is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * HeuristicLab is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with HeuristicLab. If not, see <http://www.gnu.org/licenses/>.
19 */
20#endregion
21
22using System;
23using System.Collections.Generic;
24using System.Globalization;
25using System.Linq;
26using System.Text;
27using HeuristicLab.Collections;
28using HeuristicLab.Common;
29using HeuristicLab.Encodings.SymbolicExpressionTreeEncoding;
30
31namespace HeuristicLab.Problems.DataAnalysis.Symbolic {
32  /// <summary>
33  /// Parses mathematical expressions in infix form. E.g. x1 * (3.0 * x2 + x3)
34  /// Identifier format (functions or variables): '_' | letter { '_' | letter | digit }
35  /// Variables names and variable values can be set under quotes "" or '' because variable names might contain spaces.
36  ///   Variable = ident | " ident " | ' ident '
37  /// It is also possible to use functions e.g. log("x1") or real-valued constants e.g. 3.1415 .
38  /// Variable names are case sensitive. Function names are not case sensitive.
39  ///
40  ///
41  /// S       = Expr EOF
42  /// Expr    = ['-' | '+'] Term { '+' Term | '-' Term }
43  /// Term    = Fact { '*' Fact | '/' Fact }
44  /// Fact    = '(' Expr ')' | funcId '(' Expr ')' | VarExpr | number
45  /// VarExpr = varId [ '=' varVal]
46  /// varId   =  ident | ' ident ' | " ident "
47  /// varVal  =  ident | ' ident ' | " ident "
48  /// ident   =  '_' | letter { '_' | letter | digit }
49  /// </summary>
50  public sealed class InfixExpressionParser {
51    private enum TokenType { Operator, Identifier, Number, LeftPar, RightPar, Comma, Eq, End, NA };
52    private class Token {
53      internal double doubleVal;
54      internal string strVal;
55      internal TokenType TokenType;
56    }
57
58    private class SymbolNameComparer : IEqualityComparer<ISymbol>, IComparer<ISymbol> {
59      public int Compare(ISymbol x, ISymbol y) {
60        return x.Name.CompareTo(y.Name);
61      }
62
63      public bool Equals(ISymbol x, ISymbol y) {
64        return Compare(x, y) == 0;
65      }
66
67      public int GetHashCode(ISymbol obj) {
68        return obj.Name.GetHashCode();
69      }
70    }
71    // format name <-> symbol
72    // the lookup table is also used in the corresponding formatter
73    internal static readonly BidirectionalLookup<string, ISymbol>
74      knownSymbols = new BidirectionalLookup<string, ISymbol>(StringComparer.InvariantCulture, new SymbolNameComparer());
75
76    private Constant constant = new Constant();
77    private Variable variable = new Variable();
78    private BinaryFactorVariable binaryFactorVar = new BinaryFactorVariable();
79
80    private ProgramRootSymbol programRootSymbol = new ProgramRootSymbol();
81    private StartSymbol startSymbol = new StartSymbol();
82
83    static InfixExpressionParser() {
84      // populate bidirectional lookup
85      var dict = new Dictionary<string, ISymbol>
86      {
87        { "+", new Addition()},
88        { "/", new Division()},
89        { "*", new Multiplication()},
90        { "-", new Subtraction()},
91        { "EXP", new Exponential()},
92        { "LOG", new Logarithm()},
93        { "POW", new Power()},
94        { "ROOT", new Root()},
95        { "SQR", new Square() },
96        { "SQRT", new SquareRoot() },
97        { "SIN",new Sine()},
98        { "COS", new Cosine()},
99        { "TAN", new Tangent()},
100        { "AIRYA", new AiryA()},
101        { "AIRYB", new AiryB()},
102        { "BESSEL", new Bessel()},
103        { "COSINT", new CosineIntegral()},
104        { "SININT", new SineIntegral()},
105        { "HYPCOSINT", new HyperbolicCosineIntegral()},
106        { "HYPSININT", new HyperbolicSineIntegral()},
107        { "FRESNELSININT", new FresnelSineIntegral()},
108        { "FRESNELCOSINT", new FresnelCosineIntegral()},
109        { "NORM", new Norm()},
110        { "ERF", new Erf()},
111        { "GAMMA", new Gamma()},
112        { "PSI", new Psi()},
113        { "DAWSON", new Dawson()},
114        { "EXPINT", new ExponentialIntegralEi()},
115        { "MEAN", new Average()},
116        { "IF", new IfThenElse()},
117        { "GT", new GreaterThan()},
118        { "LT", new LessThan()},
119        { "AND", new And()},
120        { "OR", new Or()},
121        { "NOT", new Not()},
122        { "XOR", new Xor()},
123        { "DIFF", new Derivative()},
124        { "LAG", new LaggedVariable() },
125      };
126
127
128      foreach (var kvp in dict) {
129        knownSymbols.Add(kvp.Key, kvp.Value);
130      }
131    }
132
133    public ISymbolicExpressionTree Parse(string str) {
134      ISymbolicExpressionTreeNode root = programRootSymbol.CreateTreeNode();
135      ISymbolicExpressionTreeNode start = startSymbol.CreateTreeNode();
136      var allTokens = GetAllTokens(str).ToArray();
137      ISymbolicExpressionTreeNode mainBranch = ParseS(new Queue<Token>(allTokens));
138
139      // only a main branch was given => insert the main branch into the default tree template
140      root.AddSubtree(start);
141      start.AddSubtree(mainBranch);
142      return new SymbolicExpressionTree(root);
143    }
144
145    private IEnumerable<Token> GetAllTokens(string str) {
146      int pos = 0;
147      while (true) {
148        while (pos < str.Length && Char.IsWhiteSpace(str[pos])) pos++;
149        if (pos >= str.Length) {
150          yield return new Token { TokenType = TokenType.End, strVal = "" };
151          yield break;
152        }
153        if (char.IsDigit(str[pos])) {
154          // read number (=> read until white space or operator or comma)
155          var sb = new StringBuilder();
156          sb.Append(str[pos]);
157          pos++;
158          while (pos < str.Length && !char.IsWhiteSpace(str[pos])
159            && (str[pos] != '+' || str[pos - 1] == 'e' || str[pos - 1] == 'E')     // continue reading exponents
160            && (str[pos] != '-' || str[pos - 1] == 'e' || str[pos - 1] == 'E')
161            && str[pos] != '*'
162            && str[pos] != '/'
163            && str[pos] != ')'
164            && str[pos] != ',') {
165            sb.Append(str[pos]);
166            pos++;
167          }
168          double dblVal;
169          if (double.TryParse(sb.ToString(), NumberStyles.Float, CultureInfo.InvariantCulture, out dblVal))
170            yield return new Token { TokenType = TokenType.Number, strVal = sb.ToString(), doubleVal = dblVal };
171          else yield return new Token { TokenType = TokenType.NA, strVal = sb.ToString() };
172        } else if (char.IsLetter(str[pos]) || str[pos] == '_') {
173          // read ident
174          var sb = new StringBuilder();
175          sb.Append(str[pos]);
176          pos++;
177          while (pos < str.Length &&
178            (char.IsLetter(str[pos]) || str[pos] == '_' || char.IsDigit(str[pos]))) {
179            sb.Append(str[pos]);
180            pos++;
181          }
182          yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
183        } else if (str[pos] == '"') {
184          // read to next "
185          pos++;
186          var sb = new StringBuilder();
187          while (pos < str.Length && str[pos] != '"') {
188            sb.Append(str[pos]);
189            pos++;
190          }
191          if (pos < str.Length && str[pos] == '"') {
192            pos++; // skip "
193            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
194          } else
195            yield return new Token { TokenType = TokenType.NA };
196
197        } else if (str[pos] == '\'') {
198          // read to next '
199          pos++;
200          var sb = new StringBuilder();
201          while (pos < str.Length && str[pos] != '\'') {
202            sb.Append(str[pos]);
203            pos++;
204          }
205          if (pos < str.Length && str[pos] == '\'') {
206            pos++; // skip '
207            yield return new Token { TokenType = TokenType.Identifier, strVal = sb.ToString() };
208          } else
209            yield return new Token { TokenType = TokenType.NA };
210        } else if (str[pos] == '+') {
211          pos++;
212          yield return new Token { TokenType = TokenType.Operator, strVal = "+" };
213        } else if (str[pos] == '-') {
214          pos++;
215          yield return new Token { TokenType = TokenType.Operator, strVal = "-" };
216        } else if (str[pos] == '/') {
217          pos++;
218          yield return new Token { TokenType = TokenType.Operator, strVal = "/" };
219        } else if (str[pos] == '*') {
220          pos++;
221          yield return new Token { TokenType = TokenType.Operator, strVal = "*" };
222        } else if (str[pos] == '(') {
223          pos++;
224          yield return new Token { TokenType = TokenType.LeftPar, strVal = "(" };
225        } else if (str[pos] == ')') {
226          pos++;
227          yield return new Token { TokenType = TokenType.RightPar, strVal = ")" };
228        } else if (str[pos] == '=') {
229          pos++;
230          yield return new Token { TokenType = TokenType.Eq, strVal = "=" };
231        } else if (str[pos] == ',') {
232          pos++;
233          yield return new Token { TokenType = TokenType.Comma, strVal = "," };
234        } else {
235          throw new ArgumentException("Invalid character: " + str[pos]);
236        }
237      }
238    }
239
240    // S       = Expr EOF
241    // Expr    = ['-' | '+'] Term { '+' Term | '-' Term }
242    // Term    = Fact { '*' Fact | '/' Fact }
243    // Fact    = '(' Expr ')' | funcId '(' ArgList ')' | varId | number
244    // ArgList = Expr { ',' Expr }
245    private ISymbolicExpressionTreeNode ParseS(Queue<Token> tokens) {
246      var expr = ParseExpr(tokens);
247
248      var endTok = tokens.Dequeue();
249      if (endTok.TokenType != TokenType.End)
250        throw new ArgumentException(string.Format("Expected end of expression (got {0})", endTok.strVal));
251
252      return expr;
253    }
254    private ISymbolicExpressionTreeNode ParseExpr(Queue<Token> tokens) {
255      var next = tokens.Peek();
256      var posTerms = new List<ISymbolicExpressionTreeNode>();
257      var negTerms = new List<ISymbolicExpressionTreeNode>();
258      bool negateFirstTerm = false;
259      if (next.TokenType == TokenType.Operator && (next.strVal == "+" || next.strVal == "-")) {
260        tokens.Dequeue();
261        if (next.strVal == "-")
262          negateFirstTerm = true;
263      }
264      var t = ParseTerm(tokens);
265      if (negateFirstTerm) negTerms.Add(t);
266      else posTerms.Add(t);
267
268      next = tokens.Peek();
269      while (next.strVal == "+" || next.strVal == "-") {
270        switch (next.strVal) {
271          case "+": {
272              tokens.Dequeue();
273              var term = ParseTerm(tokens);
274              posTerms.Add(term);
275              break;
276            }
277          case "-": {
278              tokens.Dequeue();
279              var term = ParseTerm(tokens);
280              negTerms.Add(term);
281              break;
282            }
283        }
284        next = tokens.Peek();
285      }
286
287      var sum = GetSymbol("+").CreateTreeNode();
288      foreach (var posTerm in posTerms) sum.AddSubtree(posTerm);
289      if (negTerms.Any()) {
290        if (negTerms.Count == 1) {
291          var sub = GetSymbol("-").CreateTreeNode();
292          sub.AddSubtree(negTerms.Single());
293          sum.AddSubtree(sub);
294        } else {
295          var sumNeg = GetSymbol("+").CreateTreeNode();
296          foreach (var negTerm in negTerms) sumNeg.AddSubtree(negTerm);
297
298          var constNode = (ConstantTreeNode)constant.CreateTreeNode();
299          constNode.Value = -1.0;
300          var prod = GetSymbol("*").CreateTreeNode();
301          prod.AddSubtree(constNode);
302          prod.AddSubtree(sumNeg);
303
304          sum.AddSubtree(prod);
305        }
306      }
307      if (sum.SubtreeCount == 1) return sum.Subtrees.First();
308      else return sum;
309    }
310
311    private ISymbol GetSymbol(string tok) {
312      var symb = knownSymbols.GetByFirst(tok).FirstOrDefault();
313      if (symb == null) throw new ArgumentException(string.Format("Unknown token {0} found.", tok));
314      return symb;
315    }
316
317    // Term = Fact { '*' Fact | '/' Fact }
318    private ISymbolicExpressionTreeNode ParseTerm(Queue<Token> tokens) {
319      var factors = new List<ISymbolicExpressionTreeNode>();
320      var firstFactor = ParseFact(tokens);
321      factors.Add(firstFactor);
322
323      var next = tokens.Peek();
324      while (next.strVal == "*" || next.strVal == "/") {
325        switch (next.strVal) {
326          case "*": {
327              tokens.Dequeue();
328              var fact = ParseFact(tokens);
329              factors.Add(fact);
330              break;
331            }
332          case "/": {
333              tokens.Dequeue();
334              var invFact = ParseFact(tokens);
335              var divNode = GetSymbol("/").CreateTreeNode(); // 1/x
336              divNode.AddSubtree(invFact);
337              factors.Add(divNode);
338              break;
339            }
340        }
341
342        next = tokens.Peek();
343      }
344      if (factors.Count == 1) return factors.First();
345      else {
346        var prod = GetSymbol("*").CreateTreeNode();
347        foreach (var f in factors) prod.AddSubtree(f);
348        return prod;
349      }
350    }
351
352    // Fact = '(' Expr ')' | 'LAG' '(' varId ',' ['+' | '-' ] number ')' | funcId '(' Expr ')' | varId [ = valId ] | number
353    private ISymbolicExpressionTreeNode ParseFact(Queue<Token> tokens) {
354      var next = tokens.Peek();
355      if (next.TokenType == TokenType.LeftPar) {
356        tokens.Dequeue();
357        var expr = ParseExpr(tokens);
358        var rPar = tokens.Dequeue();
359        if (rPar.TokenType != TokenType.RightPar)
360          throw new ArgumentException("expected )");
361        return expr;
362      } else if (next.TokenType == TokenType.Identifier) {
363        var idTok = tokens.Dequeue();
364        if (tokens.Peek().TokenType == TokenType.LeftPar) {
365          // function identifier
366          var funcId = idTok.strVal.ToUpperInvariant();
367
368          var funcNode = GetSymbol(funcId).CreateTreeNode();
369          var lPar = tokens.Dequeue();
370          if (lPar.TokenType != TokenType.LeftPar)
371            throw new ArgumentException("expected (");
372
373          // handle 'lag' specifically
374          if (funcNode.Symbol is LaggedVariable) {
375            var varId = tokens.Dequeue();
376            if (varId.TokenType != TokenType.Identifier) throw new ArgumentException("Identifier expected. Format for lagged variables: \"lag(x, -1)\"");
377            var comma = tokens.Dequeue();
378            if (comma.TokenType != TokenType.Comma) throw new ArgumentException("',' expected, Format for lagged variables: \"lag(x, -1)\"");
379            double sign = 1.0;
380            if (tokens.Peek().strVal == "+" || tokens.Peek().strVal == "-") {
381              // read sign
382              var signTok = tokens.Dequeue();
383              if (signTok.strVal == "-") sign = -1.0;
384            }
385            var lagToken = tokens.Dequeue();
386            if (lagToken.TokenType != TokenType.Number) throw new ArgumentException("Number expected, Format for lagged variables: \"lag(x, -1)\"");
387            if (!lagToken.doubleVal.IsAlmost(Math.Round(lagToken.doubleVal)))
388              throw new ArgumentException("Time lags must be integer values");
389            var laggedVarNode = funcNode as LaggedVariableTreeNode;
390            laggedVarNode.VariableName = varId.strVal;
391            laggedVarNode.Lag = (int)Math.Round(sign * lagToken.doubleVal);
392            laggedVarNode.Weight = 1.0;
393          } else {
394            // functions
395            var args = ParseArgList(tokens);
396            // check number of arguments
397            if (funcNode.Symbol.MinimumArity > args.Length || funcNode.Symbol.MaximumArity < args.Length) {
398              throw new ArgumentException(string.Format("Symbol {0} requires between {1} and  {2} arguments.", funcId,
399                funcNode.Symbol.MinimumArity, funcNode.Symbol.MaximumArity));
400            }
401            foreach (var arg in args) funcNode.AddSubtree(arg);
402          }
403
404          var rPar = tokens.Dequeue();
405          if (rPar.TokenType != TokenType.RightPar)
406            throw new ArgumentException("expected )");
407
408          return funcNode;
409        } else {
410          // variable
411          if (tokens.Peek().TokenType == TokenType.Eq) {
412            // binary factor
413            tokens.Dequeue(); // skip Eq
414            var valTok = tokens.Dequeue();
415            if (valTok.TokenType != TokenType.Identifier) throw new ArgumentException("expected identifier");
416            var binFactorNode = (BinaryFactorVariableTreeNode)binaryFactorVar.CreateTreeNode();
417            binFactorNode.Weight = 1.0;
418            binFactorNode.VariableName = idTok.strVal;
419            binFactorNode.VariableValue = valTok.strVal;
420            return binFactorNode;
421          } else {
422            // variable
423            var varNode = (VariableTreeNode)variable.CreateTreeNode();
424            varNode.Weight = 1.0;
425            varNode.VariableName = idTok.strVal;
426            return varNode;
427          }
428        }
429      } else if (next.TokenType == TokenType.Number) {
430        var numTok = tokens.Dequeue();
431        var constNode = (ConstantTreeNode)constant.CreateTreeNode();
432        constNode.Value = numTok.doubleVal;
433        return constNode;
434      } else {
435        throw new ArgumentException(string.Format("unexpected token in expression {0}", next.strVal));
436      }
437    }
438
439    // ArgList = Expr { ',' Expr }
440    private ISymbolicExpressionTreeNode[] ParseArgList(Queue<Token> tokens) {
441      var exprList = new List<ISymbolicExpressionTreeNode>();
442      exprList.Add(ParseExpr(tokens));
443      while (tokens.Peek().TokenType != TokenType.RightPar) {
444        var comma = tokens.Dequeue();
445        if (comma.TokenType != TokenType.Comma) throw new ArgumentException("expected ',' ");
446        exprList.Add(ParseExpr(tokens));
447      }
448      return exprList.ToArray();
449    }
450  }
451}
Note: See TracBrowser for help on using the repository browser.