Free cookie consent management tool by TermsFeed Policy Generator

source: branches/HiveHiveEngine/HeuristicLab.ExtLibs/HeuristicLab.ProtobufCS/0.9.1/ProtobufCS/src/ProtocolBuffers/TextTokenizer.cs @ 9674

Last change on this file since 9674 was 3857, checked in by abeham, 15 years ago

#866

  • Added protobuf-csharp-port project source to ExtLibs
File size: 14.1 KB
Line 
1#region Copyright notice and license
2// Protocol Buffers - Google's data interchange format
3// Copyright 2008 Google Inc.  All rights reserved.
4// http://github.com/jskeet/dotnet-protobufs/
5// Original C++/Java/Python code:
6// http://code.google.com/p/protobuf/
7//
8// Redistribution and use in source and binary forms, with or without
9// modification, are permitted provided that the following conditions are
10// met:
11//
12//     * Redistributions of source code must retain the above copyright
13// notice, this list of conditions and the following disclaimer.
14//     * Redistributions in binary form must reproduce the above
15// copyright notice, this list of conditions and the following disclaimer
16// in the documentation and/or other materials provided with the
17// distribution.
18//     * Neither the name of Google Inc. nor the names of its
19// contributors may be used to endorse or promote products derived from
20// this software without specific prior written permission.
21//
22// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33#endregion
34
35using System;
36using System.Globalization;
37using System.Text.RegularExpressions;
38
39namespace Google.ProtocolBuffers {
40  /// <summary>
41  /// Represents a stream of tokens parsed from a string.
42  /// </summary>
43  internal sealed class TextTokenizer {
/// <summary>
/// The complete input being tokenized.
/// </summary>
private readonly string text;

/// <summary>
/// The token most recently read by NextToken; it is the empty string once
/// the end of the input has been reached.
/// </summary>
private string currentToken;

/// <summary>
/// Index into the text at which the next regex match is attempted.
/// </summary>
private int matchPos;

/// <summary>
/// Index into the text at which the current token starts.
/// </summary>
private int pos;

/// <summary>
/// Zero-based line number of the current token.
/// </summary>
private int line;

/// <summary>
/// Zero-based column number of the current token.
/// </summary>
private int column;

/// <summary>
/// Zero-based line number of the token that preceded the current one.
/// </summary>
private int previousLine;

/// <summary>
/// Zero-based column number of the token that preceded the current one.
/// </summary>
private int previousColumn;
74
// Note: atomic groups are used to mimic Java's possessive quantifiers in both of these regexes.
// Every alternative is anchored with \G so that a match can only start exactly at the
// position handed to Regex.Match.

/// <summary>
/// Matches a run of whitespace and '#' comments (each comment runs to the end of its line).
/// </summary>
internal static readonly Regex WhitespaceAndCommentPattern = new Regex("\\G(?>(\\s|(#.*$))+)",
    SilverlightCompatibility.CompiledRegexWhereAvailable | RegexOptions.Multiline);

/// <summary>
/// Matches a single identifier, number or quoted string token. A string may end either at
/// its closing quote or at the end of the line; an unterminated string is still returned as
/// one token so the consumer can report the missing quote.
/// </summary>
private static readonly Regex TokenPattern = new Regex(
  "\\G[a-zA-Z_](?>[0-9a-zA-Z_+-]*)|" +              // an identifier
  "\\G[0-9+-](?>[0-9a-zA-Z_.+-]*)|" +                  // a number
  "\\G\"(?>([^\"\\\n\\\\]|\\\\.)*)(\"|\\\\?$)|" +    // a double-quoted string
  // The body of a single-quoted string must stop at an unescaped single quote (not at a
  // double quote); otherwise 'a' 'b' is read as one token and 'a"b' cannot be read at all.
  "\\G\'(?>([^\'\\\n\\\\]|\\\\.)*)(\'|\\\\?$)",      // a single-quoted string
  SilverlightCompatibility.CompiledRegexWhereAvailable | RegexOptions.Multiline);

// The case-insensitive patterns below use CultureInvariant so that matching does not depend
// on the casing rules of the current culture (for example Turkish, where 'I' and 'i' are
// not a case pair and "INF" would otherwise fail to match).

/// <summary>
/// Matches the textual spellings of positive or negative infinity accepted for doubles.
/// </summary>
private static readonly Regex DoubleInfinity = new Regex("^-?inf(inity)?$",
  SilverlightCompatibility.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

/// <summary>
/// Matches the textual spellings of positive or negative infinity accepted for floats,
/// which additionally allow a trailing 'f'.
/// </summary>
private static readonly Regex FloatInfinity = new Regex("^-?inf(inity)?f?$",
  SilverlightCompatibility.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

/// <summary>
/// Matches the textual spellings of NaN accepted for floats ("nan" or "nanf").
/// </summary>
private static readonly Regex FloatNan = new Regex("^nanf?$",
  SilverlightCompatibility.CompiledRegexWhereAvailable | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
91
/// <summary>
/// Creates a tokenizer over the given text and positions it on the first
/// token, after skipping any leading whitespace and comments.
/// </summary>
/// <param name="text">The text to split into tokens.</param>
public TextTokenizer(string text) {
  this.text = text;
  SkipWhitespace();
  NextToken();
}
98
/// <summary>
/// True once the current token is empty, which is how the tokenizer
/// represents the end of the input.
/// </summary>
public bool AtEnd {
  get {
    int remaining = currentToken.Length;
    return remaining == 0;
  }
}
105
/// <summary>
/// Moves on to the next token: remembers where the current token was,
/// brings the line/column counters up to the start of the new token, reads
/// the token and finally skips the whitespace that follows it.
/// </summary>
public void NextToken() {
  previousLine = line;
  previousColumn = column;

  // Walk over everything consumed since the last call so that line and
  // column describe the position at which the new token starts.
  for (; pos < matchPos; ++pos) {
    if (text[pos] != '\n') {
      ++column;
      continue;
    }
    ++line;
    column = 0;
  }

  // End of input: the empty token is the end marker.
  if (matchPos == text.Length) {
    currentToken = "";
    return;
  }

  // Anything the token pattern does not recognise is returned one character at a time.
  Match tokenMatch = TokenPattern.Match(text, matchPos);
  if (tokenMatch.Success) {
    currentToken = tokenMatch.Value;
    matchPos += tokenMatch.Length;
  } else {
    currentToken = text[matchPos].ToString();
    ++matchPos;
  }

  SkipWhitespace();
}
142
/// <summary>
/// Advances matchPos past any whitespace and comments found at the current
/// match position, so that it points at the start of the next token.
/// </summary>
private void SkipWhitespace() {
  Match skipped = WhitespaceAndCommentPattern.Match(text, matchPos);
  if (!skipped.Success) {
    return;
  }
  matchPos += skipped.Length;
}
152
/// <summary>
/// Consumes the current token if it is exactly equal to the given one.
/// </summary>
/// <param name="token">The token text to compare against.</param>
/// <returns>
/// True if the token matched and was consumed; false if it did not match,
/// in which case the tokenizer is left untouched.
/// </returns>
public bool TryConsume(string token) {
  if (currentToken != token) {
    return false;
  }
  NextToken();
  return true;
}
164
/// <summary>
/// Consumes the current token, which must be exactly equal to the given one.
/// </summary>
/// <param name="token">The token text that is required at this position.</param>
/// <exception cref="FormatException">The current token is a different one.</exception>
public void Consume(string token) {
  if (TryConsume(token)) {
    return;
  }
  throw CreateFormatException("Expected \"" + token + "\".");
}
179
/// <summary>
/// Reports whether the current token looks like an integer, without
/// consuming it. Only the first character is inspected: it must be an
/// ASCII digit or a '+' or '-' sign.
/// </summary>
public bool LookingAtInteger() {
  if (currentToken.Length == 0) {
    return false;
  }

  char first = currentToken[0];
  if (first == '+' || first == '-') {
    return true;
  }
  return first >= '0' && first <= '9';
}
191
/// <summary>
/// If the next token is an identifier, consume it and return its value.
/// Otherwise, throw a FormatException.
/// </summary>
/// <remarks>
/// A token counts as an identifier when it is non-empty and consists only of
/// ASCII letters, ASCII digits, underscores and dots.
/// </remarks>
/// <returns>The text of the consumed identifier.</returns>
/// <exception cref="FormatException">
/// The input is exhausted or the current token contains another character.
/// </exception>
public string ConsumeIdentifier() {
  // At the end of the input the current token is empty. The character check
  // below would accept it vacuously, so reject it explicitly.
  if (currentToken.Length == 0) {
    throw CreateFormatException("Expected identifier.");
  }

  foreach (char c in currentToken) {
    bool allowed = ('a' <= c && c <= 'z') ||
                   ('A' <= c && c <= 'Z') ||
                   ('0' <= c && c <= '9') ||
                   (c == '_') || (c == '.');
    if (!allowed) {
      throw CreateFormatException("Expected identifier.");
    }
  }

  string result = currentToken;
  NextToken();
  return result;
}
212
/// <summary>
/// Consumes the current token as a 32-bit signed integer, as parsed by
/// TextFormat.ParseInt32.
/// </summary>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token could not be parsed.</exception>
public int ConsumeInt32() {
  int parsed;
  try {
    parsed = TextFormat.ParseInt32(currentToken);
  } catch (FormatException parseError) {
    throw CreateIntegerParseException(parseError);
  }
  NextToken();
  return parsed;
}
226
/// <summary>
/// Consumes the current token as a 32-bit unsigned integer, as parsed by
/// TextFormat.ParseUInt32.
/// </summary>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token could not be parsed.</exception>
public uint ConsumeUInt32() {
  uint parsed;
  try {
    parsed = TextFormat.ParseUInt32(currentToken);
  } catch (FormatException parseError) {
    throw CreateIntegerParseException(parseError);
  }
  NextToken();
  return parsed;
}
240
/// <summary>
/// Consumes the current token as a 64-bit signed integer, as parsed by
/// TextFormat.ParseInt64.
/// </summary>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token could not be parsed.</exception>
public long ConsumeInt64() {
  long parsed;
  try {
    parsed = TextFormat.ParseInt64(currentToken);
  } catch (FormatException parseError) {
    throw CreateIntegerParseException(parseError);
  }
  NextToken();
  return parsed;
}
254
/// <summary>
/// Consumes the current token as a 64-bit unsigned integer, as parsed by
/// TextFormat.ParseUInt64.
/// </summary>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token could not be parsed.</exception>
public ulong ConsumeUInt64() {
  ulong parsed;
  try {
    parsed = TextFormat.ParseUInt64(currentToken);
  } catch (FormatException parseError) {
    throw CreateIntegerParseException(parseError);
  }
  NextToken();
  return parsed;
}
268
/// <summary>
/// If the next token is a double, consume it and return its value.
/// Otherwise, throw a FormatException.
/// </summary>
/// <remarks>
/// Besides numbers in invariant-culture notation, this accepts the infinity
/// spellings matched by the DoubleInfinity pattern and "nan" in any letter case.
/// </remarks>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token could not be parsed as a double.</exception>
public double ConsumeDouble() {
  // Infinity and NaN are handled here rather than left to double.Parse(),
  // which is not relied on to accept "inf", "infinity" or "nan".
  if (DoubleInfinity.IsMatch(currentToken)) {
    // A match guarantees a non-empty token whose only possible prefix is '-'.
    // Comparing the character avoids a culture-sensitive StartsWith(string).
    bool negative = currentToken[0] == '-';
    NextToken();
    return negative ? double.NegativeInfinity : double.PositiveInfinity;
  }
  // Tokens are machine-readable text, so compare ordinally rather than linguistically.
  if (currentToken.Equals("nan", StringComparison.OrdinalIgnoreCase)) {
    NextToken();
    return Double.NaN;
  }

  try {
    double result = double.Parse(currentToken, CultureInfo.InvariantCulture);
    NextToken();
    return result;
  } catch (FormatException e) {
    throw CreateFloatParseException(e);
  } catch (OverflowException e) {
    throw CreateFloatParseException(e);
  }
}
296
/// <summary>
/// If the next token is a float, consume it and return its value.
/// Otherwise, throw a FormatException.
/// </summary>
/// <remarks>
/// Besides numbers in invariant-culture notation, this accepts the spellings
/// matched by the FloatInfinity and FloatNan patterns, and a single trailing
/// 'f' suffix on a number.
/// </remarks>
/// <returns>The parsed value.</returns>
/// <exception cref="FormatException">The token could not be parsed as a float.</exception>
public float ConsumeFloat() {
  // Infinity and NaN are handled here rather than left to float.Parse(),
  // which is not relied on to accept "inf", "infinity" or "nan".
  if (FloatInfinity.IsMatch(currentToken)) {
    // A match guarantees a non-empty token whose only possible prefix is '-'.
    bool negative = currentToken[0] == '-';
    NextToken();
    return negative ? float.NegativeInfinity : float.PositiveInfinity;
  }
  if (FloatNan.IsMatch(currentToken)) {
    NextToken();
    return float.NaN;
  }

  // Strip at most ONE trailing 'f' (so "1.5f" is accepted but "1.5ff" is not), and do it
  // on a local copy: the current token must stay intact if parsing fails below.
  string number = currentToken;
  int lastIndex = number.Length - 1;
  if (lastIndex >= 0 && number[lastIndex] == 'f') {
    number = number.Substring(0, lastIndex);
  }

  try {
    float result = float.Parse(number, CultureInfo.InvariantCulture);
    NextToken();
    return result;
  } catch (FormatException e) {
    throw CreateFloatParseException(e);
  } catch (OverflowException e) {
    throw CreateFloatParseException(e);
  }
}
328
/// <summary>
/// Consumes the current token as a Boolean. Only the exact tokens "true"
/// and "false" are accepted.
/// </summary>
/// <returns>The value denoted by the consumed token.</returns>
/// <exception cref="FormatException">The token is neither "true" nor "false".</exception>
public bool ConsumeBoolean() {
  bool value;
  switch (currentToken) {
    case "true":
      value = true;
      break;
    case "false":
      value = false;
      break;
    default:
      throw CreateFormatException("Expected \"true\" or \"false\".");
  }
  NextToken();
  return value;
}
344
/// <summary>
/// Consumes the current token as a quoted string and returns its unescaped
/// content, obtained by decoding the bytes from ConsumeByteString as UTF-8.
/// </summary>
/// <exception cref="FormatException">The token is not a valid string.</exception>
public string ConsumeString() {
  ByteString bytes = ConsumeByteString();
  return bytes.ToStringUtf8();
}
352
/// <summary>
/// Consumes the current token as a quoted string and returns its content,
/// unescaped by TextFormat.UnescapeBytes, as a ByteString.
/// </summary>
/// <returns>The unescaped bytes between the quotes.</returns>
/// <exception cref="FormatException">
/// The token does not start with a quote character, does not end with the
/// same quote character, or contains an invalid escape.
/// </exception>
public ByteString ConsumeByteString() {
  bool startsWithQuote = currentToken.Length > 0 &&
                         (currentToken[0] == '\"' || currentToken[0] == '\'');
  if (!startsWithQuote) {
    throw CreateFormatException("Expected string.");
  }

  // The token must close with the same quote character it opened with.
  char quote = currentToken[0];
  int lastIndex = currentToken.Length - 1;
  if (lastIndex < 1 || currentToken[lastIndex] != quote) {
    throw CreateFormatException("String missing ending quote.");
  }

  ByteString unescaped;
  try {
    // Everything strictly between the opening and closing quote.
    unescaped = TextFormat.UnescapeBytes(currentToken.Substring(1, lastIndex - 1));
  } catch (FormatException escapeError) {
    throw CreateFormatException(escapeError.Message);
  }
  NextToken();
  return unescaped;
}
377
/// <summary>
/// Builds (but does not throw) a FormatException whose message is prefixed
/// with the position of the current token.
/// </summary>
/// <param name="description">The text to place after the position prefix.</param>
/// <returns>The exception, ready to be thrown by the caller.</returns>
public FormatException CreateFormatException(string description) {
  // Positions are stored zero-based; report them one-based, as people generally expect.
  int reportedLine = line + 1;
  int reportedColumn = column + 1;
  return new FormatException(reportedLine + ":" + reportedColumn + ": " + description);
}
386
/// <summary>
/// Builds (but does not throw) a FormatException whose message is prefixed
/// with the position of the token before the current one.
/// </summary>
/// <param name="description">The text to place after the position prefix.</param>
/// <returns>The exception, ready to be thrown by the caller.</returns>
public FormatException CreateFormatExceptionPreviousToken(string description) {
  // Positions are stored zero-based; report them one-based, as people generally expect.
  int reportedLine = previousLine + 1;
  int reportedColumn = previousColumn + 1;
  return new FormatException(reportedLine + ":" + reportedColumn + ": " + description);
}
395
/// <summary>
/// Wraps the message of an integer parsing failure in a FormatException
/// that carries the position of the current token.
/// </summary>
/// <param name="e">The exception raised while parsing the integer.</param>
private FormatException CreateIntegerParseException(FormatException e) {
  string description = "Couldn't parse integer: " + e.Message;
  return CreateFormatException(description);
}
403
/// <summary>
/// Wraps the message of a float or double parsing failure in a
/// FormatException that carries the position of the current token.
/// </summary>
/// <param name="e">The exception raised while parsing the number.</param>
private FormatException CreateFloatParseException(Exception e) {
  string description = "Couldn't parse number: " + e.Message;
  return CreateFormatException(description);
}
411  }
412}
Note: See TracBrowser for help on using the repository browser.