tokenizer.rb
1 |
%w{token}.each{|library| require library}
|
---|---|
2 |
|
3 |
module Yapser

  # Splits a text into Token objects up front (in the constructor) and then
  # serves them through a movable cursor (get / peek / backup / []).
  # Every Token handed to a caller is a deep copy, so callers cannot alter
  # the tokenizer's own token list.
  class Tokenizer

    # Current Token index, zero-based.
    attr_accessor :position

    # Highest Token index reached since the last time that maxPosition was set.
    attr_accessor :maxPosition

    # Token kinds produced by the default pattern table are:
    #   :true, :false, :nil, :include, :exclude, :number, :integer,
    #   :ordinal, :name, :string, :dot, :in, :leftbrace, :rightbrace,
    #   :leftbracket, :rightbracket, :leftparen, :rightparen, :comma,
    #   :is and :error.
    #
    # Tokens are in the form of Token objects.
    # EOD is indicated by a nil response.
    #
    # The token returned is the first Token of the maximum string length
    # recognized in the @@patterns variable.
    #
    # The @@patterns variable contains an Array of two element Arrays.
    # In the two element arrays, the first element is the token kind
    # and the second element is the recognizing regular expression.
    # A Token kind value of false means that the matched text is ignored
    # (whitespace, "#" comments and "//" comments).
    # The :error Token kind at the end is there to find bad characters.
    #
    # N.B., the numeric portion of this taken with \Z added at the right
    # end of the pattern is present in the parser for the prettyPrint
    # routine's use (for a requirement to print numbers unquoted).
    @@patterns = [
      [false,         %r{\A\s\s*}],
      [false,         %r{\A#.*$}],
      [false,         %r{\A//.*$}],
      [:true,         %r{\Atrue((?=[^A-Za-z0-9_])|(?=\Z))}],
      [:false,        %r{\Afalse((?=[^A-Za-z0-9_])|(?=\Z))}],
      [:nil,          %r{\Anil((?=[^A-Za-z0-9_])|(?=\Z))}],
      [:nil,          %r{\Anull((?=[^A-Za-z0-9_])|(?=\Z))}],
      [:include,      %r{\A\$remember((?=[^A-Za-z0-9_])|(?=\Z))}],
      [:exclude,      %r{\A\$forget((?=[^A-Za-z0-9_])|(?=\Z))}],
      [:number,       %r{\A[-+]?infinity((?=[^A-Za-z0-9_])|(?=\Z))}],
      [:number,       %r{\A[-+]?[0-9]+\.[0-9]*[Ee][-+]?[0-9]{1,3}
                         ((?=[^0-9])|(?=\Z))}x],
      [:number,       %r{\A[-+]?\.[0-9]+[Ee][-+]?[0-9]{1,3}
                         ((?=[^0-9])|(?=\Z))}x],
      [:number,       %r{\A[-+]?[0-9]+[Ee][-+]?[0-9]{1,3}
                         ((?=[^0-9])|(?=\Z))}x],
      [:number,       %r{\A[-+]?[0-9]+\.[0-9]*}],
      [:number,       %r{\A[-+]?\.[0-9]+}],
      [:integer,      %r{\A[-+][0-9][0-9]*}],
      [:ordinal,      %r{\A[0-9][0-9]*}],
      [:name,         %r{\A[A-Za-z_][A-Za-z0-9_]*}],
      [:string,       %r{\A"([^"\\]|\\["\\])*"}],
      [:dot,          %r{\A\.}],
      [:in,           %r{\A::}],
      [:leftbrace,    %r{\A\{}],
      [:rightbrace,   %r{\A\}}],
      [:leftbracket,  %r{\A\[}],
      [:rightbracket, %r{\A\]}],
      [:leftparen,    %r{\A\(}],
      [:rightparen,   %r{\A\)}],
      [:comma,        %r{\A\,}],
      [:is,           %r{\A\:}],
      [:error,        %r{\A.}]
    ]

    # Tokenizes +text+ immediately.
    #
    # text     - the String to split into tokens. Anything that is not a
    #            String yields a single :error Token carrying the value.
    # patterns - optional replacement for the default pattern table, in the
    #            same [kind, regexp] pair format as @@patterns.
    def initialize(text, patterns = nil)
      @line = 1
      @ordinal = 0
      @position = 0
      @maxPosition = 0
      @text = text
      @patterns = patterns || @@patterns
      @tokens = []
      if text.is_a?(String)
        tokenize
      else
        @tokens = [Token.new(:error, text, @line, @ordinal)]
      end
    end

    # Records the current position as the high-water mark.
    def setMaxPosition
      @maxPosition = @position
    end

    # Moves the cursor back by +amount+ tokens. The result is not clamped;
    # while the cursor is negative, get and peek report no token (nil).
    def backup(amount = 1)
      @position -= amount
    end

    # Returns a copy of the Token under the cursor and advances the cursor,
    # raising maxPosition first when the cursor has passed it.
    # Returns nil, and leaves the cursor alone, when there is no such Token.
    def get
      token = look do
        setMaxPosition if @maxPosition < @position
        @position += 1
      end
      copy(token)
    end

    # Returns a copy of the Token under the cursor without moving it,
    # or nil when there is no such Token.
    def peek
      copy(look { })
    end

    # Returns a copy of the Token at index +location+, or nil when the
    # index is negative or past the last Token.
    def [](location) avail(location) end

    # Number of Tokens produced from the text.
    def numberOfTokens
      @tokens.size
    end

    # Returns a deep copy of the whole Token list.
    def copyOfTokens
      copy(@tokens)
    end

    # Returns the Token list reduced to [kind, text] pairs; each text is a
    # duplicate of the Token's own String.
    def abstract
      @tokens.map { |token| [token.kind, token.text.dup] }
    end

    private

    # Deep copy by way of a Marshal round trip.
    def copy(thing) Marshal.load(Marshal.dump(thing)) end

    # Bounds-checked Token lookup used by []. Checking the upper bound here
    # keeps out-of-range indexes from ever reaching Array#[].
    def avail(location)
      return nil unless location >= 0 && location < @tokens.length
      copy(@tokens[location])
    end

    # Returns the Token under the cursor (not a copy) after running the
    # given block, or nil without running the block when the cursor is
    # outside the Token list. A negative cursor counts as outside, so it
    # cannot wrap around to the end of the list.
    def look
      return nil unless @position >= 0 && @position < @tokens.length
      token = @tokens[@position]
      yield
      token
    end

    # Consumes @text from the front, one longest match at a time, until it
    # is empty. If nothing can be consumed, the Token list is replaced by a
    # single :error Token holding the remaining text.
    def tokenize
      until @text.empty?
        kind, lexeme = longestMatch
        if lexeme.nil? || lexeme.empty?
          # Either no pattern matched or the best match was zero-length.
          # A zero-length match would never shorten @text, so it is treated
          # as unrecognized input rather than looping forever.
          @tokens = [Token.new(:error, @text, @line, @ordinal + 1)]
          @text = ""
        else
          consume(kind, lexeme)
        end
      end
    end

    # Tries every pattern against the front of @text and returns
    # [kind, matched text] for the longest match; on a tie the pattern
    # listed first wins. Returns [nil, nil] when no pattern matches.
    def longestMatch
      bestKind = nil
      bestText = nil
      @patterns.each do |kind, regEx|
        next unless @text =~ regEx
        candidate = Regexp.last_match(0)
        if bestText.nil? || bestText.length < candidate.length
          bestKind = kind
          bestText = candidate
        end
      end
      [bestKind, bestText]
    end

    # Removes +lexeme+ from the front of @text, records a Token for it
    # unless its kind is false, and updates the line / ordinal counters.
    def consume(kind, lexeme)
      @text = @text[lexeme.length..-1]
      if kind
        @ordinal += 1
        @tokens.push(Token.new(kind, lexeme, @line, @ordinal))
      end
      # A Token keeps the line it started on; the counters move afterwards.
      newlines = lexeme.count("\n")
      if newlines > 0
        @line += newlines
        @ordinal = 0
      end
    end
  end

end
|