# Project
#
# General
#
# Profile
#
# tokenizer.rb
#
# The yapser tokenizer - Randolph Herber, 08/18/2011 02:25 PM

 
1
%w{token}.each{|library| require library}
2

    
3
module Yapser

  # Splits a text into Token objects once, at construction time, and then
  # serves those tokens through a movable cursor (get / peek / backup).
  #
  # Every token handed out is a deep copy, so callers cannot alter the
  # tokenizer's own token list.
  class Tokenizer

    # Current Token index, zero-based.
    attr_accessor :position

    # Highest Token index reached since last time that maxPosition was set.
    attr_accessor :maxPosition

    # Token kinds produced by the default pattern table below are:
    #   :true, :false, :nil, :include, :exclude, :number, :integer,
    #   :ordinal, :name, :string, :dot, :in, :leftbrace, :rightbrace,
    #   :leftbracket, :rightbracket, :leftparen, :rightparen, :comma,
    #   :is and :error.

    # Tokens are in the form of Token objects
    # EOD is indicated by a nil response.

    # The token returned is the first Token of the maximum string length
    # recognized in the @@patterns variable.

    # The @@patterns variable contains an Array of two element Arrays.
    # In the two element arrays, the first element is the token type
    # and the second element is the recognizing regular expression.
    # A Token kind value of false means that that text is to be ignored.
    # The :error Token kind at the end is to find bad characters.

    # N.B., the numeric portion of this taken with \Z added at the right
    # end of the pattern is present in the parser for the prettyPrint
    # routine's use (for a requirement to print numbers unquoted).

    @@patterns = [
        [false,          %r{\A\s\s*}],
        [false,          %r{\A#.*$}],
        [false,          %r{\A//.*$}],
        [:true,          %r{\Atrue((?=[^A-Za-z0-9_])|(?=\Z))}],
        [:false,         %r{\Afalse((?=[^A-Za-z0-9_])|(?=\Z))}],
        [:nil,           %r{\Anil((?=[^A-Za-z0-9_])|(?=\Z))}],
        [:nil,           %r{\Anull((?=[^A-Za-z0-9_])|(?=\Z))}],
        [:include,       %r{\A\$remember((?=[^A-Za-z0-9_])|(?=\Z))}],
        [:exclude,       %r{\A\$forget((?=[^A-Za-z0-9_])|(?=\Z))}],
        [:number,        %r{\A[-+]?infinity((?=[^A-Za-z0-9_])|(?=\Z))}],
        [:number,        %r{\A[-+]?[0-9]+\.[0-9]*[Ee][-+]?[0-9]{1,3}
                            ((?=[^0-9])|(?=\Z))}x],
        [:number,        %r{\A[-+]?\.[0-9]+[Ee][-+]?[0-9]{1,3}
                            ((?=[^0-9])|(?=\Z))}x],
        [:number,        %r{\A[-+]?[0-9]+[Ee][-+]?[0-9]{1,3}
                            ((?=[^0-9])|(?=\Z))}x],
        [:number,        %r{\A[-+]?[0-9]+\.[0-9]*}],
        [:number,        %r{\A[-+]?\.[0-9]+}],
        [:integer,       %r{\A[-+][0-9][0-9]*}],
        [:ordinal,       %r{\A[0-9][0-9]*}],
        [:name,          %r{\A[A-Za-z_][A-Za-z0-9_]*}],
        [:string,        %r{\A"([^"\\]|\\["\\])*"}],
        [:dot,           %r{\A\.}],
        [:in,            %r{\A::}],
        [:leftbrace,     %r{\A\{}],
        [:rightbrace,    %r{\A\}}],
        [:leftbracket,   %r{\A\[}],
        [:rightbracket,  %r{\A\]}],
        [:leftparen,     %r{\A\(}],
        [:rightparen,    %r{\A\)}],
        [:comma,         %r{\A\,}],
        [:is,            %r{\A\:}],
        [:error,         %r{\A.}]
      ]

    # Tokenizes +text+ immediately.
    #
    # text::     the String to split into tokens (String subclasses are
    #            accepted).  Any other object yields a single :error Token
    #            that carries the object as its text.
    # patterns:: optional replacement for the default pattern table, in the
    #            same [kind, regexp] layout.
    def initialize(text,patterns=nil)
      @line = 1
      @ordinal = 0
      @position = 0
      @maxPosition = 0
      @text = text
      @patterns = patterns||@@patterns
      if text.is_a?(String)
        @tokens = []
        tokenize
      else
        @tokens = [Token.new(:error,text,@line,@ordinal)]
      end
    end

    # Records the current position as the highest position reached.
    def setMaxPosition
      @maxPosition = @position
    end

    # Moves the cursor back by +amount+ tokens.  The cursor stops at the
    # first token: a negative position would make the token lookup wrap
    # around and hand out tokens from the end of the list.
    def backup(amount=1)
      @position = [@position - amount, 0].max
    end

    # Returns a copy of the current Token and advances the cursor, or
    # returns nil (leaving the cursor alone) when no tokens remain.
    def get
      copy(look {
        # maxPosition is raised before the cursor moves, so it ends up
        # holding the index of a token that has actually been fetched.
        setMaxPosition if @maxPosition < @position
        @position += 1
      })
    end

    # Returns a copy of the current Token without moving the cursor, or
    # nil when no tokens remain.
    def peek
      copy(look { })
    end

    # Returns a copy of the Token at +location+, or nil when +location+ is
    # negative or does not address a token.
    def [](location) avail(location) end

    # Number of tokens held (ignored text is not counted).
    def numberOfTokens
      @tokens.size
    end

    # Returns a deep copy of the whole token list.
    def copyOfTokens
      copy(@tokens)
    end

    # Returns the tokens reduced to [kind, text] pairs; each text is a
    # duplicate of the token's text.
    def abstract()
      @tokens.map{|token| [token.kind, token.text.dup]}
    end

    private

    # Consumes @text from the front until it is empty, appending a Token
    # for every match whose kind is not false.
    def tokenize
      until @text.empty?
        kind, lexeme = longestMatch
        if lexeme.nil?
          # Nothing matched: the remaining text becomes the sole token, an
          # :error Token, replacing whatever had been collected so far.
          @tokens = [Token.new(:error, @text, @line, @ordinal + 1)]
          @text = ""
        else
          @text = @text[lexeme.length..-1]
          if kind
            @ordinal += 1
            @tokens.push(Token.new(kind, lexeme, @line, @ordinal))
          end
          # Line accounting happens after the Token is built, so a token is
          # stamped with the line on which it starts.
          newlines = lexeme.count("\n")
          if newlines > 0
            @line += newlines
            @ordinal = 0
          end
        end
      end
    end

    # Returns [kind, text] for the longest match at the front of @text; on
    # a tie the earliest pattern in the table wins.  Returns [nil, nil]
    # when nothing matches.  Zero-length matches are ignored because they
    # would consume no text and the scan would never finish.
    def longestMatch
      bestKind = nil
      bestText = nil
      @patterns.each do |kind, regEx|
        found = regEx.match(@text)
        next if found.nil? || found[0].empty?
        if bestText.nil? || bestText.length < found[0].length
          bestKind = kind
          bestText = found[0]
        end
      end
      [bestKind, bestText]
    end

    # Deep copy by a Marshal round trip.
    def copy(thing) Marshal.load(Marshal.dump(thing)) end

    # Fetches a copy of the Token at +location+; nil for a negative
    # location or when the lookup itself raises.
    def avail(location)
      token = nil
      if location >= 0
        token = begin
          @tokens[location]
        rescue StandardError
          nil
        end
      end
      copy(token)
    end

    # Returns the Token under the cursor after running the block, or nil
    # (without running the block) when the cursor is past the last token.
    def look(&block)
      return nil unless @position < @tokens.length
      result = @tokens[@position]
      block.call
      result
    end
  end

end