1 | package ec.util; |
---|
2 | import java.util.regex.*; |
---|
3 | |
---|
/*
 * Lexer.java
 *
 * Created: Sun Dec 5 11:33:43 EST 2010
 * By: Sean Luke
 *
 */
---|
11 | |
---|
/**
 * A simple String tokenizer.  You provide Lexer with a String or other
 * CharSequence as input, plus an array of regular expressions.  Each time you call
 * nextToken(...), the Lexer tries the regular expressions, in order, at the current
 * position in the input; the first one that matches there is the winner, its text
 * is returned, and the position is advanced just past it.
 *
 * <p>The regular expressions are compiled with the default flags (in particular,
 * not DOTALL).
 *
 * <p>Note that a regular expression which can match the empty string will succeed
 * without advancing the position, so repeated calls would then return the same
 * empty token.
 */

public class Lexer
{
    /** An index which indicates that no further tokens were found.  This could be due to the end of the string or due to a bad
        string.  You'll need to check the index to determine for sure.*/
    public static final int FAILURE = -1;

    /** The text being tokenized. */
    CharSequence input;
    /** Offset into the input at which the next token must start. */
    int position = 0;
    /** One matcher per regular expression, each bound to the input. */
    Matcher[] matchers;
    /** The regular expressions, in priority order (a private copy of the caller's array). */
    String[] regexps;
    /** Index of the regular expression which matched the most recent token, or FAILURE. */
    int matchingIndex = FAILURE;

    /**
     * Builds a Lexer for the given input with the provided regular expressions.  The regular expressions
     * will be checked in order against the input, and the first one which matches will be assumed to be the token.
     *
     * @param input the text to tokenize
     * @param regexps the regular expressions to try, in priority order
     * @throws java.util.regex.PatternSyntaxException if one of the regular expressions is malformed
     */
    public Lexer(CharSequence input, String[] regexps)
    {
        // Copy the array so that later changes made by the caller cannot make
        // getMatchingRule() disagree with the matchers compiled below.
        this.regexps = regexps.clone();
        this.input = input;
        matchers = new Matcher[this.regexps.length];
        for (int i = 0; i < this.regexps.length; i++)
            matchers[i] = Pattern.compile(this.regexps[i]).matcher(input);      // not DOTALL
    }

    /**
     * Returns the next token as a string, or null if no regular expression matches at the
     * current position.  On success the position moves just past the token; on failure the
     * position is left unchanged and the matching index becomes FAILURE.
     *
     * @param trim if true, the token is first trimmed of leading and trailing whitespace
     * @return the matched text, or null if nothing matched
     */
    public String nextToken(boolean trim)
    {
        for (int i = 0; i < matchers.length; i++)
        {
            Matcher matcher = matchers[i];
            // Restrict the matcher to the unread remainder of the input; lookingAt()
            // requires the match to begin exactly at the start of that region.
            matcher.region(position, input.length());
            if (matcher.lookingAt())
            {
                position = matcher.end();
                matchingIndex = i;
                String token = matcher.group();
                return trim ? token.trim() : token;
            }
        }
        // we failed
        matchingIndex = FAILURE;
        return null;
    }

    /**
     * Returns the next token as a string.  The string is first trimmed of whitespace.
     *
     * @return the trimmed matched text, or null if nothing matched
     */
    public String nextToken() { return nextToken(true); }


    /**
     * Returns the index of the regular expression which matched the most recent token.
     *
     * @return the index into the regular expression array, or FAILURE if the most recent
     *         call to nextToken(...) found nothing (or no call has been made yet)
     */
    public int getMatchingIndex()
    {
        return matchingIndex;
    }

    /**
     * Returns the regular expression which matched the most recent token.
     *
     * @return the regular expression, or null if the matching index is FAILURE
     */
    public String getMatchingRule()
    {
        if (matchingIndex == FAILURE) return null;
        return regexps[matchingIndex];
    }

    /**
     * Returns the position in the String just beyond the most recent token.
     *
     * @return the offset at which the next token will be sought; 0 before any token has matched
     */
    public int getMatchingPosition()
    {
        return position;
    }

}
---|