import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringBufferInputStream;
import java.util.Locale;


/////////////////////////////////////////////////////////////////////////
//
//  A nearly complete rewrite of Java's StreamTokenizer.  The purpose
//  being that I did not like the way numbers were parsed.  First, you
//  cannot turn the parsing of numbers off (looks like you can, but you
//  can't -- trust me!)  Anyway, I wanted to parse numbers expressed
//  in different number bases (hex, oct, binary, and decimal).
//
//  As you might guess, this is not as fast as THE StreamTokenizer,
//  but it's still pretty fast.
//
//  If you've ever wondered "how they did that," this is one way of
//  doing it!  You're welcome to extend it so that it is a full
//  implementation of the original.
//
//           (c) 1997 Brett Blatchley, all rights reserved.  
//                 See MicMac TERMS OF USE for details.
//
//////////////////////////////////////////////////////////////////////

/**
 A stream-oriented lexical analyzer like StreamTokenizer, except that numbers
 of the form often seen in assemblers (0FFh, 377o, 1111111b) are parsed
 correctly and number parsing can be switched off.
 <p>
 Input is read one byte at a time; every byte value 0..255 has a character
 type that decides how nextToken() treats it.  By default bytes 0..32 and
 128..255 are ordinary (single-character tokens), except that space, tab
 and backspace are whitespace; all other bytes (33..127) are word
 characters.  The end-of-line character is always '\n'.
 <p>
 Line numbers returned by lineno() start at 0.
 */

public class Tokenizer extends Object
{
   /** Token type: the input is exhausted. */
   public  static final int TT_EOF    =  -1;

   /** Token type: end of line (only returned after eolIsSignificant(true)). */
   public  static final int TT_EOL    =  -2;

   /** Token type: a number; its value is in nval. */
   public  static final int TT_NUMBER =  -3;

   /** Token type: a word or a quoted string; its text is in sval. */
   public  static final int TT_WORD   =  -4;

   /** Marker for "no token": initial ttype and empty push-back slot. */
   public  static final int TT_EMPTY  = -10;


   ///////////////////////////////////////////////////////////////////
   //  Constructor
   ///////////////////////////////////////////////////////////////////

   /**
    Create a new Tokenizer on the given stream.  The stream is wrapped in
    a BufferedInputStream and is closed when end of input is reached.
    @param is Input stream.
    */

   public Tokenizer( InputStream is)
   {
      this.is = new BufferedInputStream( is);

      // Everything not touched here keeps the array default 0 == WORD_C.
      setChars( 0,   ' ', ORDINARY_C);
      setChars( 128, 255, ORDINARY_C);

      // NOTE(review): '\r' is left ordinary while '\b' is whitespace;
      // possibly '\r' was intended -- confirm before changing.
      ctype[' ']  = WHITESP_C;
      ctype['\t'] = WHITESP_C;
      ctype['\b'] = WHITESP_C;
      ctype['\n'] = NEWLINE_C;

      lineno = 0;
      eolCh  = '\n';

      // Defaults are assigned directly instead of through the public
      // setters so the constructor never calls an overridable method.
      eolSignificant = false;
      lowerCase      = false;
      parseNum       = true;
      slashComments  = false;
      starComments   = false;
   }


   ///////////////////////////////////////////////////////////////////
   ///////////////////////////////////////////////////////////////////
   //  Public Members
   ///////////////////////////////////////////////////////////////////
   ///////////////////////////////////////////////////////////////////

   /**
    Type of the current token: one of the TT_ constants, or the byte value
    of an ordinary character.  TT_EMPTY until the first token is read.
    */
   public int    ttype = TT_EMPTY;

   /** Text of the current token when ttype is TT_WORD, otherwise null. */
   public String sval;

   /** Value of the current token when ttype is TT_NUMBER, otherwise 0. */
   public int    nval;


   ///////////////////////////////////////////////////////////////////
   //  Set the comment character
   ///////////////////////////////////////////////////////////////////

   /**
    Designate the given character as one which starts a comment.  The
    comment runs up to (not including) the next end of line.
    @param ch Byte value 0..255; values outside that range are ignored.
    */

   public void commentChar(int ch)
   {
      if ( setCharType( ch, COMMENT_C))
         commentCh = ch;
   }


   ///////////////////////////////////////////////////////////////////
   //  Set true if EOL is a significant character
   ///////////////////////////////////////////////////////////////////

   /**
    Set to true if you want end of lines to be significant to the lexer,
    i.e. returned as TT_EOL tokens instead of being skipped.
    */

   public void eolIsSignificant( boolean b)
   {
      eolSignificant = b;
   }


   ///////////////////////////////////////////////////////////////////
   //  Return the current line number
   ///////////////////////////////////////////////////////////////////

   /**
    Return the current line number, i.e. the number of end-of-line
    characters consumed so far (the first line is line 0).
    @return Line number.
    */

   public int lineno()
   {
      return lineno;
   }


   ///////////////////////////////////////////////////////////////////
   //  Convert returned token to lowercase
   ///////////////////////////////////////////////////////////////////

   /**
    Set to true if you want automatic conversion of WORDs to lowercase.
    Quoted strings are never converted.
    */

   public void lowerCaseMode( boolean b)
   {
      lowerCase = b;
   }


   ///////////////////////////////////////////////////////////////////
   //  Return the next available token.  Return a push-back one if
   //  that is available.
   ///////////////////////////////////////////////////////////////////

   /**
    Return the next token from input and store its description in
    ttype, sval and nval.  A token pushed back with pushBack() is
    returned first.
    <p>
    For every freshly scanned token sval is null and nval is 0 unless
    the token type says otherwise (TT_WORD sets sval, TT_NUMBER sets nval).
    @return TT_WORD, TT_NUMBER, TT_EOL, the byte value of an ordinary
            character, or TT_EOF once the input is exhausted (the stream
            is closed at that point and every later call returns TT_EOF).
    */

   public int nextToken()
   {
      if ( pushed_ttype != TT_EMPTY) {  // Pushback?
         ttype = pushed_ttype;
         sval  = pushed_sval;
         nval  = pushed_nval;

         pushed_ttype = TT_EMPTY;

         return ttype;
      }

      sval = null;
      nval = 0;

      while (true) {
         int c = readc();

         if ( c == TT_EOF) {
            closeInput();
            return ttype = TT_EOF;
         }

         // The end-of-line test comes before the type lookup so that
         // '\n' ends a line whatever type the caller has given it.
         if ( c == eolCh) {
            lineno++;

            if ( eolSignificant)
               return ttype = TT_EOL;

            continue;
         }

         switch ( ctype[c]) {
            case ORDINARY_C:
               return ttype = c;

            case WORD_C:
               return readWord( c);

            case QUOTE_C:           // opening delimiter already consumed
               return readQuoted();

            case COMMENT_C:
               skipComment();
               break;

            default:
               // WHITESP_C is skipped.  So is a '/' typed COMMENTSLASH_C
               // or COMMENTSTAR_C: those comment styles are not
               // implemented, so only the '/' itself is discarded.
               break;
         }
      }
   }


   ///////////////////////////////////////////////////////////////////
   //  Declare a given character to be "Ordinary."  That is, it is
   //  returned as a separate token when encountered.
   ///////////////////////////////////////////////////////////////////

   /**
    Designate the given character as an 'ordinary' character (one which is
    its own token value. Operator symbols are typical examples.)
    @param ch Byte value 0..255; values outside that range are ignored.
    */

   public void ordinaryChar( int ch)
   {
      setCharType( ch, ORDINARY_C);
   }


   ///////////////////////////////////////////////////////////////////
   //  Declare a range of characters to be "Ordinary" in the above
   //  sense.
   ///////////////////////////////////////////////////////////////////

   /**
    Designate a range of characters to be 'ordinary'.  The bounds may be
    given in either order and are clipped to 0..255.
    @param lo Low-end ASCII value.
    @param hi High-end ASCII value.
    */

   public void ordinaryChars( int lo, int hi)
   {
      setChars( lo, hi, ORDINARY_C);
   }


   ///////////////////////////////////////////////////////////////////
   //  Turn number parsing on or off.
   ///////////////////////////////////////////////////////////////////

   /**
    Set to true if you want numbers to be parsed and returned as NUMBERS.
    When false, numeric text is returned as TT_WORD.
    */

   public void parseNumbers( boolean b)
   {
      parseNum = b;
   }


   ///////////////////////////////////////////////////////////////////
   //  Push a token back onto the input.  Very useful for pattern
   //  matching!
   ///////////////////////////////////////////////////////////////////

   /**
    Push the current token back onto the input, so the next call to
    nextToken() returns it again.  Only one token is remembered.  Calling
    this before any token has been read has no effect.
    */

   public void pushBack()
   {
      pushed_ttype = ttype;
      pushed_sval  = sval;
      pushed_nval  = nval;
   }


   ///////////////////////////////////////////////////////////////////
   //  Define a character as one used to delimit quoted text.  There
   //  can be more than one character defined.  Usually ' and " will
   //  be defined, but it could be something else altogether.
   ///////////////////////////////////////////////////////////////////

   /**
    Designate the given character as a 'quoted-text' delimiter.  Quoted
    text is returned as a TT_WORD token without the delimiters.
    @param ch Byte value 0..255; values outside that range are ignored.
    */

   public void quoteChar( int ch)
   {
      if ( setCharType( ch, QUOTE_C))
         quoteCh = ch;
   }


   ///////////////////////////////////////////////////////////////////
   //  Undeclares all quote, comment, word and whitespace characters.
   ///////////////////////////////////////////////////////////////////

   /**
    Redesignate all characters as 'ordinary,' thus resetting the grammar.
    */

   public void resetSyntax()
   {
      setChars( 0, ctype.length - 1, ORDINARY_C);
   }


   ///////////////////////////////////////////////////////////////////
   //  The different styles of slash comments were never implemented.
   ///////////////////////////////////////////////////////////////////

   /**
    Set to true to enable slash-slash style comments.
    (Slash-slash style comments not implemented in Tokenizer: while
    enabled, every '/' is silently discarded by nextToken().)  Passing
    false makes '/' a word character, replacing any type previously
    given to it, e.g. by commentChar('/').
    */

   public void slashSlashComments( boolean b)
   {
      slashComments = b;
      ctype['/'] = (byte) (b ? COMMENTSLASH_C : WORD_C);
   }


   /**
    Set to true to enable slash-star style comments.
    (Slash-star style comments not implemented in Tokenizer: while
    enabled, every '/' is silently discarded by nextToken().)  Passing
    false makes '/' a word character, replacing any type previously
    given to it, e.g. by commentChar('/').
    */

   public void slashStarComments( boolean b)
   {
      starComments = b;
      ctype['/'] = (byte) (b ? COMMENTSTAR_C : WORD_C);
   }


   /**
    Returns the current line number as the toString form of Tokenizer.
    @return Line number in text form.
    */

   public String toString()
   {
      return "Tokenizer on line "+lineno();
   }


   ///////////////////////////////////////////////////////////////////
   //  Declare a range of characters to be whitespace characters.
   ///////////////////////////////////////////////////////////////////

   /**
    Designate the given range of characters as whitespace characters.
    The bounds may be given in either order and are clipped to 0..255.
    @param lo Low-end ASCII value.
    @param hi High-end ASCII value.
    */

   public void whitespaceChars( int lo, int hi)
   {
      setChars( lo, hi, WHITESP_C);
   }


   ///////////////////////////////////////////////////////////////////
   //  Declare a range of characters to be included in word tokens.
   ///////////////////////////////////////////////////////////////////

   /**
    Designate the given range of characters to be included in WORDS.
    The bounds may be given in either order and are clipped to 0..255.
    @param lo Low-end ASCII value.
    @param hi High-end ASCII value.
    */

   public void wordChars( int lo, int hi)
   {
      setChars( lo, hi, WORD_C);
   }


   ///////////////////////////////////////////////////////////////////
   //  Test the Tokenizer from the command line
   ///////////////////////////////////////////////////////////////////

   /**
    Test the functioning of the Tokenizer class.<br>
    usage: <b><tt>java Tokenizer</tt></b>
    */

   public static void main( String[] args)
   {
      String s = "   Now is\n  THE 101b time //Hello! Brett = \"cool\nman\" ' \\'Right\\' ? '\n1 2 3";

      // Printed with System.out like the rest of this demo, so running
      // it needs nothing beyond the JDK.
      System.out.println("s("+s+")");

      Tokenizer t = new Tokenizer(
         new StringBufferInputStream(s));

      t.eolIsSignificant(true);

      int tt;

      t.quoteChar('"');    // No quoted strings by default
      t.quoteChar('\'');
      t.commentChar('/');  // No comments by default

      t.ordinaryChar('=');
      t.lowerCaseMode( true);

      // Read one token and push it back to show that it is seen again.
      t.nextToken();
      System.out.println("("+t.sval+")");
      t.pushBack();

      while ( (tt = t.nextToken()) != TT_EOF) {
         switch ( tt) {
         case TT_WORD:
            System.out.println(t.lineno()+": TT_WORD \""+t.sval+"\""); break;

         case TT_NUMBER:
            System.out.println(t.lineno()+": TT_NUMBER "+t.nval); break;

         case TT_EOL:
            System.out.println("Newline"); break;

         default:
            System.out.println(t.lineno()+": ORDINARY '"+(char)t.ttype+"'");
         }
      }
   }


   ///////////////////////////////////////////////////////////////////
   ///////////////////////////////////////////////////////////////////
   //  Private Members
   ///////////////////////////////////////////////////////////////////
   ///////////////////////////////////////////////////////////////////

   // Character types stored in ctype[].  WORD_C must stay 0 because a
   // freshly allocated ctype array relies on it as the default type.
   private static final int WORD_C         = 0;
   private static final int ORDINARY_C     = 1;
   private static final int COMMENT_C      = 2;
   private static final int COMMENTSLASH_C = 3;
   private static final int COMMENTSTAR_C  = 4;
   private static final int WHITESP_C      = 5;
   private static final int NEWLINE_C      = 6;
   private static final int QUOTE_C        = 7;

   // One-token push-back slot; pushed_ttype == TT_EMPTY means "empty".
   private int    pushed_ttype = TT_EMPTY;
   private String pushed_sval;
   private int    pushed_nval;

   // One-character push-back slot; TT_EMPTY means "empty".  A pushed
   // back TT_EOF (-1) is a legal value.
   private int    pushed_c = TT_EMPTY;

   private BufferedInputStream is;
   private boolean             closed;      // true once EOF closed the stream
   private byte[]              ctype = new byte[256];
   private int                 commentCh;   // last comment char registered
   private int                 quoteCh;     // last quote char registered
   private int                 eolCh;
   private int                 lineno;
   private boolean             eolSignificant;
   private boolean             lowerCase;
   private boolean             parseNum;
   private boolean             slashComments;
   private boolean             starComments;


   ///////////////////////////////////////////////////////////////////
   //  Scan the rest of a word whose first character has already been
   //  read, and classify it as a number or a word.
   ///////////////////////////////////////////////////////////////////

   private int readWord( int first)
   {
      StringBuffer word = new StringBuffer();
      int c = first;

      do {
         word.append( (char) c);
         c = readc();
      } while ( c != TT_EOF && ctype[c] == WORD_C);

      // The terminator (possibly EOF) belongs to the next token.
      pushbackc( c);

      String text = word.toString();

      if ( parseNum && isConvertableNumber( text)) {
         try {
            nval = parseInt( text);
            return ttype = TT_NUMBER;
         }
         catch (NumberFormatException notANumber) {
            // Looks numeric but is not ("12abc", "-", out of int range):
            // hand the text to the caller as a word instead of
            // silently reporting the number 0.
         }
      }

      sval = lowerCase ? text.toLowerCase( Locale.ENGLISH) : text;

      return ttype = TT_WORD;
   }


   ///////////////////////////////////////////////////////////////////
   //  Scan quoted text whose opening delimiter has already been read.
   //  A backslash takes the following character literally.  The text
   //  ends at ANY character typed as a quote character, or at EOF.
   ///////////////////////////////////////////////////////////////////

   // NOTE(review): the closing delimiter need not match the opening
   // one, so "it's" ends at the apostrophe -- confirm this is intended.

   private int readQuoted()
   {
      StringBuffer text = new StringBuffer();
      boolean escaped = false;

      while (true) {
         int c = readc();

         if ( c == TT_EOF) {         // unterminated: return what we have
            pushbackc( c);
            break;
         }

         if ( c == eolCh)            // keep lineno() right across lines
            lineno++;

         if ( escaped) {
            text.append( (char) c);
            escaped = false;
         }
         else
         if ( c == '\\')
            escaped = true;
         else
         if ( ctype[c] == QUOTE_C)
            break;
         else
            text.append( (char) c);
      }

      sval = text.toString();

      return ttype = TT_WORD;
   }


   ///////////////////////////////////////////////////////////////////
   //  Discard input up to, but not including, the next end of line
   //  (or EOF), so the caller still sees the line end.
   ///////////////////////////////////////////////////////////////////

   private void skipComment()
   {
      int c;

      do {
         c = readc();
      } while ( c != TT_EOF && c != eolCh);

      pushbackc( c);
   }


   ///////////////////////////////////////////////////////////////////
   //  Set the type of one character.  Returns false, changing
   //  nothing, when ch is not a byte value 0..255.
   ///////////////////////////////////////////////////////////////////

   private boolean setCharType( int ch, int type)
   {
      if ( ch < 0 || ch >= ctype.length)
         return false;

      ctype[ch] = (byte) type;
      return true;
   }


   ///////////////////////////////////////////////////////////////////
   //  Set a range of chars in the char table to be a certain type
   //  of char.  Bounds may come in either order and are clipped to
   //  the table.  Used by other methods.
   ///////////////////////////////////////////////////////////////////

   private void setChars( int lo, int hi, int type)
   {
      if ( lo > hi) {
         int t = lo; lo = hi; hi = t;
      }

      if ( lo < 0)
         lo = 0;

      if ( hi >= ctype.length)
         hi = ctype.length - 1;

      byte b = (byte) type;

      for (int i = lo; i <= hi; i++)
         ctype[i] = b;
   }


   ///////////////////////////////////////////////////////////////////
   //  Read a single character, taking the pushback into account.
   //  Returns TT_EOF at end of input; a read error is treated as
   //  end of input as well.
   ///////////////////////////////////////////////////////////////////

   private int readc()
   {
      if ( pushed_c != TT_EMPTY) {
         int c = pushed_c;
         pushed_c = TT_EMPTY;
         return c;
      }

      if ( closed)
         return TT_EOF;

      try {
         return is.read();
      }
      catch (IOException e) { return TT_EOF; }
   }


   ///////////////////////////////////////////////////////////////////
   //  Pushback a single character. Used by the scanning methods.
   ///////////////////////////////////////////////////////////////////

   private void pushbackc( int c)
   {
      pushed_c = c;
   }


   ///////////////////////////////////////////////////////////////////
   //  Close the input once; later reads report TT_EOF.
   ///////////////////////////////////////////////////////////////////

   private void closeInput()
   {
      if ( closed)
         return;

      closed = true;

      try {
         is.close();
      }
      catch (IOException ignored) {
         // Nothing useful can be done about a failed close at EOF.
      }
   }


   ///////////////////////////////////////////////////////////////////
   //  Given a string, convert it to an integer based on the radix
   //  flag on the end (if there is one).
   //
   //  Exa:   0FFh, 377o, 1111111b = 255.  Note that each of these
   //  begins with a conventional digit.  Decimal numbers have no
   //  radix flag suffix.  This is the way numbers are parsed
   //  throughout MicMac.
   //
   //  Throws NumberFormatException when the text is not a valid
   //  int in the indicated base.
   ///////////////////////////////////////////////////////////////////

   private static int parseInt(String v)
   {
      int len = v.length();

      if ( len == 0)
         throw new NumberFormatException( "empty number");

      String digits = v.substring( 0, len - 1);

      switch ( v.charAt( len - 1)) {
         case 'h':
         case 'H': return Integer.parseInt( digits, 16);

         case 'o':
         case 'O': return Integer.parseInt( digits, 8);

         case 'b':
         case 'B': return Integer.parseInt( digits, 2);

         default:  return Integer.parseInt( v);
      }
   }


   ///////////////////////////////////////////////////////////////////
   //  Could this string be a number?  Cheap test on the first
   //  character only (digit or sign); parseInt() has the final say.
   ///////////////////////////////////////////////////////////////////

   private static boolean isConvertableNumber( String n)
   {
      if ( n.length() == 0)
         return false;

      char c = n.charAt(0);

      return (c >= '0' && c <= '9') || c == '-' || c == '+';
   }
}


