tokenizer.cls source

/*----------------------------------------------------------------------------*/
/*                                                                            */
/*   Copyright (c) 2004-2009 William  Data  Systems Ltd. and Geoff Stevens.   */
/*   All rights reserved.                                                     */
/*                                                                            */
/*   This program and the  accompanying  materials are made available under   */
/*   the terms of the  Common  Public  License  v1.0 which accompanies this   */
/*   distribution. A  copy  is  also  available  at  the following address:   */
/*   http://www.opensource.org/licenses/cpl1.0.php                            */
/*                                                                            */
/*   Redistribution and use in  source  and  binary  forms, with or without   */
/*   modification, are  permitted  provided  that  the following conditions   */
/*   are met:                                                                 */
/*                                                                            */
/*   Redistributions  of  source  code  must  retain  the  above  copyright   */
/*   notice, this list of conditions and the following disclaimer.            */
/*                                                                            */
/*   Redistributions in  binary  form  must  reproduce  the above copyright   */
/*   notice, this list of  conditions  and  the following disclaimer in the   */
/*   documentation and/or other materials provided with the distribution.     */
/*                                                                            */
/*   Neither the name or trademarks  of  William Data Systems nor the names   */
/*   of its  contributors  may  be  used  to  endorse  or  promote products   */
/*   derived from this software without specific prior written permission.    */
/*                                                                            */
/*   DISCLAIMER                                                               */
/*                                                                            */
/*   THIS SOFTWARE IS PROVIDED  BY  THE  COPYRIGHT HOLDERS AND CONTRIBUTORS   */
/*   "AS IS" AND  ANY  EXPRESS  OR  IMPLIED  WARRANTIES, INCLUDING, BUT NOT   */
/*   LIMITED TO, THE IMPLIED WARRANTIES  OF MERCHANTABILITY AND FITNESS FOR   */
/*   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN  NO EVENT SHALL THE COPYRIGHT   */
/*   OWNER OR CONTRIBUTORS BE LIABLE  FOR ANY DIRECT, INDIRECT, INCIDENTAL,   */
/*   SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT   */
/*   LIMITED TO, PROCUREMENT OF SUBSTITUTE  GOODS OR SERVICES; LOSS OF USE,   */
/*   DATA, OR PROFITS; OR BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY   */
/*   THEORY OF LIABILITY, WHETHER  IN  CONTRACT,  STRICT LIABILITY, OR TORT   */
/*   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN  ANY WAY OUT OF THE USE   */
/*   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.     */
/*                                                                            */
/*----------------------------------------------------------------------------*/

-- flyweight token
::CLASS token public
::ATTRIBUTE symbol   -- the text of the token
::ATTRIBUTE type     -- token type
::ATTRIBUTE index    -- position in list
::ATTRIBUTE user     -- for user classification

-- @param symbol - string representation of the tokenized symbol
-- @param type - type of the tokenized symbol
-- @param index - index of the tokenized symbol in a tokenlist
::METHOD init
  use strict arg symbol, type, index
  self~symbol = symbol
  self~type = type
  self~index = index
  self~user = .nil
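
-- Example: a token is a plain value object, so it can be built and inspected
-- directly (an illustrative sketch; the values shown are arbitrary):
--
--   t = .token~new('count', 'symbol', 1)
--   say t~symbol t~type t~index        -- count symbol 1
--   t~user = 'identifier'              -- free-form user classification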

-- list of tokens
-- retains state during navigation
::CLASS tokenlist public
::METHOD current ATTRIBUTE -- current token in navigation
::METHOD tokens ATTRIBUTE  -- queue of tokens

-- @param tokens - queue of token objects
::METHOD init
  use strict arg tokens
  self~tokens = tokens

-- get token by index
-- @param index - index of token to retrieve
-- @return a token
::METHOD gettoken
  expose tokens
  use arg index
  return tokens~at(index)

-- get first token
-- @return first token
::METHOD getfirst
  expose tokens current
  current = 1
  return tokens~at(current)

-- get next token
-- @return next token
::METHOD getnext
  expose tokens current
  current = current + 1
  return tokens~at(current)

-- get next non blank token
-- @return next non blank token or .nil
::METHOD getnextnonblank
  expose tokens current
  current = current + 1
  do j = current to tokens~items
    if tokens~at(j)~symbol~words > 0
      then do
             current = j
             return tokens~at(j)
           end
  end
  current = 0
  return .nil
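
-- Example: walking a tokenlist with the stateful navigation methods (an
-- illustrative sketch; 'tl' stands for any tokenlist built by a tokenizer):
--
--   t = tl~getfirst
--   do while t <> .nil
--     say t~index t~type t~symbol
--     t = tl~getnextnonblank           -- use getnext to visit blanks as well
--   end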


-- return a tokenlist of subtokens
-- note this is a *new* list, not part
-- of the tokenlist called
-- @param starttoke - index of first token to return
-- @param endtoke - index of last token to return
-- @return - tokenlist of tokens
::METHOD subtokens
  expose tokens
  use arg starttoke, endtoke
  subtokens = .queue~new
  do j = starttoke to endtoke
    subtokens~queue(tokens~at(j))
  end
  return .tokenlist~new(subtokens)

-- return list of tokens
-- note this is the list in the tokenlist called
-- @return - list of tokens
::METHOD gettokens
  expose tokens
  return tokens

-- glue token symbols together
-- @param starttoke - index of first token to glue
-- @param endtoke - index of last token to glue
-- @return - string of the assembled token symbols
::METHOD detokenize
  expose tokens
  use arg starttoke, endtoke

  flat = ''
  do j = starttoke to endtoke
    flat = flat || tokens~at(j)~symbol
  end
  return flat
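
-- Example: slicing and reassembling (an illustrative sketch; assume the first
-- three tokens of 'tl' carry the symbols 'a', '=' and 'b'):
--
--   say tl~detokenize(1, 3)            -- a=b
--   sub = tl~subtokens(2, 3)           -- new tokenlist holding '=' and 'b'
--   say sub~getfirst~symbol            -- =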

-- abstract tokenizer
-- Tokenizer lookahead sizes are calculated in the setup
-- method, for use in the toke method. This is thus an LL(n)
-- tokenizer, where n is programmable.
::CLASS tokenizer public

::ATTRIBUTE delims       -- directory mapping each delimiter to its name
::ATTRIBUTE lookaheads   -- array of lookahead lengths, descending

-- process a list of arrays of delimiter/name pairs
-- into a list of lookahead lengths sorted into descending
-- order, and a directory of delimiters
-- indexed from delimiter to name
-- @param tokelist - list of two-element arrays
-- @return two-element array of
-- @return [1] sorted array of lookahead lengths
-- @return [2] directory from delimiter to name
::METHOD setup CLASS
  use arg tokelist

  lookaheads = ''
  delims = .directory~new
  la = .queue~new
  lal = ''
  do i over tokelist
   delims[i[1]] = i[2]            -- map delimiter text to its name
   l = i[1]~length
   if lal~wordpos(l) = 0          -- record each distinct delimiter length once
     then do
            lal  = lal l
            la~queue(l+1)         -- queue l+1: parsing to column l+1 yields l characters
          end
  end

  lookaheads = la~makeArray~stableSortWith(.DescendingComparator~new)

  return .array~of(lookaheads, delims)
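
-- Example: the expected shape of tokelist (an illustrative sketch; the names
-- 'blank', 'comma' and 'equality' are arbitrary labels for this example):
--
--   tokelist = .array~of(.array~of(' ', 'blank'),  -
--                        .array~of(',', 'comma'),  -
--                        .array~of('==', 'equality'))
--   parts = .tokenizer~setup(tokelist)
--   say parts[1][1] parts[1][2]        -- 3 2   (delimiter lengths + 1, descending)
--   say parts[2]['==']                 -- equality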

-- initialize tokenizer
-- @param tokelist - list of two-element arrays
::METHOD init
  use arg tokelist
  array = self~class~setup(tokelist)
  self~lookaheads = array[1]
  self~delims = array[2]


-- tokenize a string
-- @param line - string to tokenize
-- @return tokenlist
::METHOD toke
  use arg line

  tkl = .tokenlist~new(.queue~new)

  token = ''
  index = 1                      -- current character position in the original line
  start = 1                      -- position where the accumulated symbol began

  do label top while line <> ''

    do i over self~lookaheads    -- longest lookaheads first
      parse var line 1 lah =(i) rest      -- lah is the first i-1 characters
      if self~delims~hasIndex(lah)
        then do
               if token <> ''    -- flush any accumulated symbol token
                 then do
                        tkl~tokens~queue(.token~new(token, 'symbol', start))
                        token = ''
                      end
               line = rest
               tkl~tokens~queue(.token~new(lah, self~delims[lah], index))
               index = index + lah~length
               start = index
               iterate top
             end
    end

    parse var line 1 c 2 line    -- no delimiter: accumulate one character
    token = token || c
    index = index + 1

  end

  if token <> ''                 -- flush any trailing symbol token
    then tkl~tokens~queue(.token~new(token, 'symbol', start))

  return tkl
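
-- Example: end-to-end use (an illustrative sketch; the delimiter names
-- 'blank' and 'assign' are arbitrary labels for this example):
--
--   tkr = .tokenizer~new(.array~of(.array~of(' ', 'blank'), -
--                                  .array~of('=', 'assign')))
--   tkl = tkr~toke('total = 41')
--   t = tkl~getfirst
--   do while t <> .nil
--     say t~index t~type t~symbol
--     t = tkl~getnext
--   end
--
-- listing, in order: the symbol 'total' at position 1, a blank, the '='
-- delimiter, another blank, and the symbol '41' at position 9.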
