// JavaTokenizer.java
import java.io.*;
import java.util.*;
// Class JavaTokenizer
// ------------------------------------------------------------------
/**
 * Converts a JavaLexemes object into a stream of JavaToken
 * objects, which may be retrieved via the Iterator interface.
 * The task of this class is made more difficult by the fact that
 * the implementation of JavaLexemes being used does not deal
 * with embedded periods, which may be the decimal point in
 * floating-point numbers, or separators in identifiers.
 *
 * One token of lookahead is always held in thisToken, so hasNext()
 * is a plain null test.  next() returns null once the input is
 * exhausted (it does not throw NoSuchElementException); callers
 * should test hasNext() first.  remove() is not supported.
 *
 * @author C. Vickery
 * @version 1.0 - Spring 2000
 */
public class JavaTokenizer implements Iterator,
                                      JavaLexemeTypes,
                                      JavaTokenTypes,
                                      JavaKeywords
{
  // Instance Variables
  // ================================================================
  /** Source of lexemes. */
  private JavaLexemes jl;
  /** Text of the lexeme that will be turned into the next token. */
  private String thisLexemeSval = null;
  /** Hash code of thisLexemeSval, used for keyword/operator lookup. */
  private int thisLexemeHashCode = 0;
  /** Type of the pending lexeme; LT_EOF once the input is exhausted. */
  private int thisLexemeType = LT_NOTHING;
  /** Lookahead token that the next call to next() will return. */
  private JavaToken thisToken = null;
  /** Remaining parts of a lexeme that contained embedded periods. */
  private StringTokenizer compoundLexeme =
                                    new StringTokenizer( "" );

  // Constructor
  // ----------------------------------------------------------------
  /**
   * Creates a tokenizer over the given lexeme source and primes the
   * one-token lookahead.
   *
   * @param input the lexeme source to read from
   * @throws IOException if reading the first lexeme fails
   */
  public JavaTokenizer( JavaLexemes input ) throws IOException
  {
    jl = input;
    findNextLexeme();
    constructNextToken();
  }

  // Method constructNextToken()
  // ----------------------------------------------------------------
  /**
   * Constructs the next token from the lexeme stream. May produce
   * more than one token from lexemes that have embedded periods.
   * Leaves the next token in thisToken, or leaves thisToken equal
   * to null if there are no more tokens.
   *
   * @throws IOException if reading the following lexeme fails
   */
  private void constructNextToken() throws IOException
  {
    // Parts of a dotted lexeme are delivered before anything new
    // is looked at.
    if ( compoundLexeme.hasMoreTokens() )
    {
      emitCompoundPart();
      return;
    }

    // Convert a lexeme into a token.
    switch ( thisLexemeType )
    {
      case LT_EOF:
        thisToken = null;
        return;

      case LT_LITERALSTR:
        thisToken = new JavaToken( JTT_String, thisLexemeSval );
        findNextLexeme();
        break;

      case LT_LITERALCHR:
        thisToken = new JavaToken( JTT_Character,
                                   thisLexemeSval,
                                   lexemeToChar( thisLexemeSval )
                                 );
        findNextLexeme();
        break;

      case LT_LEXEME:
        // Keyword, operator, separator, or literal?
        thisToken = classifySimpleLexeme();
        if ( thisToken != null )
        {
          findNextLexeme();
          break;
        }
        // If there is a period in the string, it is really
        // multiple lexemes, which have to be handled specially.
        // The numeric checks above run first so that "1.5" stays
        // one floating-point token.
        if ( thisLexemeSval.indexOf( '.' ) != -1 )
        {
          compoundLexeme =
                  new StringTokenizer( thisLexemeSval, ".", true );
          emitCompoundPart();
          break;
        }
        // Nothing else, so it's just an identifier.
        thisToken = new JavaToken( JTT_Identifier, thisLexemeSval );
        findNextLexeme();
        break;

      default:
        System.err.println( "Bad Switch in constructNextToken!" );
        // End the stream; leaving thisToken unchanged would make
        // next() hand out the same token forever.
        thisToken = null;
        break;
    }
  }

  // Method emitCompoundPart()
  // ----------------------------------------------------------------
  /**
   * Takes the next part of the current dotted lexeme and leaves the
   * corresponding token in thisToken.  A "." becomes a separator; a
   * part found in the keyword table (for example "this", "super"
   * or "class") becomes a keyword; anything else is an identifier.
   * The next lexeme is fetched only after the last part is used.
   *
   * @throws IOException if reading the following lexeme fails
   */
  private void emitCompoundPart() throws IOException
  {
    String part = compoundLexeme.nextToken();
    if ( part.equals( "." ) )
    {
      thisToken = new JavaToken( JTT_Separator, part );
    }
    else if ( isKeyword( part.hashCode() ) )
    {
      thisToken = new JavaToken( JTT_Keyword, part );
    }
    else
    {
      thisToken = new JavaToken( JTT_Identifier, part );
    }
    if ( ! compoundLexeme.hasMoreTokens() )
    {
      findNextLexeme();
    }
  }

  // Method classifySimpleLexeme()
  // ----------------------------------------------------------------
  /**
   * Tries to classify the pending LT_LEXEME as a keyword, operator,
   * separator, boolean/null literal, or numeric literal, in that
   * order.
   *
   * @return the token, or null if the lexeme is none of these (it
   *         is then either a dotted name or a plain identifier)
   */
  private JavaToken classifySimpleLexeme()
  {
    String text = thisLexemeSval;

    if ( isKeyword( thisLexemeHashCode ) )
    {
      return new JavaToken( JTT_Keyword, text );
    }
    if ( isOperator( thisLexemeHashCode ) )
    {
      return new JavaToken( JTT_Operator, text );
    }
    if ( isSeparator( text ) )
    {
      return new JavaToken( JTT_Separator, text );
    }
    if ( text.equals( "true" ) )
    {
      return new JavaToken( JTT_True, text );
    }
    if ( text.equals( "false" ) )
    {
      return new JavaToken( JTT_False, text );
    }
    if ( text.equals( "null" ) )
    {
      // NOTE(review): the null literal is reported as a keyword;
      // confirm whether JavaTokenTypes has a dedicated type.
      return new JavaToken( JTT_Keyword, text );
    }

    // Integer literal (decimal, hex, octal, optional L suffix)?
    try
    {
      long lVal = parseIntegerLiteral( text );
      return new JavaToken( JTT_FixedPoint, text, lVal );
    }
    catch ( NumberFormatException notAnInteger )
    {
      // Not an integer literal; try the remaining categories.
    }

    // Floating-point literal?  Double.parseDouble also accepts the
    // words "NaN" and "Infinity", which are ordinary identifiers in
    // Java source, so only try text that starts like a number.
    if ( startsLikeNumber( text ) )
    {
      try
      {
        double dVal = Double.parseDouble( text );
        return new JavaToken( JTT_FloatingPoint, text, dVal );
      }
      catch ( NumberFormatException notADouble )
      {
        // Not a floating-point literal either.
      }
    }
    return null;
  }

  // Method parseIntegerLiteral()
  // ----------------------------------------------------------------
  /**
   * Parses a Java integer literal: decimal, hexadecimal ("0x1F"),
   * or octal ("017"), with an optional trailing 'L' or 'l'.  Text
   * with a leading zero that is not all octal digits (e.g. "08") is
   * parsed as decimal.  Values that do not fit in a signed long
   * are rejected.
   *
   * @param text the candidate literal
   * @return the literal's value
   * @throws NumberFormatException if text is not an integer literal
   */
  private static long parseIntegerLiteral( String text )
  {
    String digits = text;
    int length = digits.length();
    if ( length > 1 )
    {
      char last = digits.charAt( length - 1 );
      if ( last == 'L' || last == 'l' )
      {
        digits = digits.substring( 0, length - 1 );
      }
    }

    if ( digits.length() > 2 && digits.charAt( 0 ) == '0' &&
         ( digits.charAt( 1 ) == 'x' || digits.charAt( 1 ) == 'X' )
       )
    {
      String hex = digits.substring( 2 );
      // Long.parseLong would accept a sign here ("0x-5"); a
      // literal cannot have one after the prefix.
      if ( Character.digit( hex.charAt( 0 ), 16 ) < 0 )
      {
        throw new NumberFormatException( text );
      }
      return Long.parseLong( hex, 16 );
    }
    if ( isOctalLiteral( digits ) )
    {
      return Long.parseLong( digits, 8 );
    }
    return Long.parseLong( digits );
  }

  /**
   * Tells whether digits is a leading '0' followed by one or more
   * further characters, all of them octal digits.
   */
  private static boolean isOctalLiteral( String digits )
  {
    if ( digits.length() < 2 || digits.charAt( 0 ) != '0' )
    {
      return false;
    }
    for ( int i = 1; i < digits.length(); i++ )
    {
      char c = digits.charAt( i );
      if ( c < '0' || c > '7' )
      {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells whether text begins, after an optional sign, with a
   * decimal digit or a period.
   */
  private static boolean startsLikeNumber( String text )
  {
    int first = 0;
    if ( text.length() > 0 &&
         ( text.charAt( 0 ) == '+' || text.charAt( 0 ) == '-' )
       )
    {
      first = 1;
    }
    if ( first >= text.length() )
    {
      return false;
    }
    char c = text.charAt( first );
    return c == '.' || ( c >= '0' && c <= '9' );
  }

  // Method findNextLexeme()
  // ----------------------------------------------------------------
  /**
   * Skips over non-lexeme items returned by JavaLexemes (comments,
   * end of line markers, and errors), and saves the next actual
   * lexeme's String value and type in thisLexemeSval and
   * thisLexemeType, respectively. Leaves thisLexemeType set to
   * LT_EOF at end of input; thisLexemeSval and thisLexemeHashCode
   * are then left unchanged.
   *
   * @throws IOException if the lexeme source fails to read
   */
  private void findNextLexeme() throws IOException
  {
    thisLexemeType = jl.nextLexeme();
    while ( thisLexemeType != LT_LEXEME &&
            thisLexemeType != LT_LITERALSTR &&
            thisLexemeType != LT_LITERALCHR
          )
    {
      if ( thisLexemeType == LT_EOF )
      {
        return;
      }
      thisLexemeType = jl.nextLexeme();
    }
    thisLexemeSval = jl.theLexeme;
    thisLexemeHashCode = thisLexemeSval.hashCode();
  }

  // Method lexemeToChar()
  // ----------------------------------------------------------------
  /**
   * Converts a character literal lexeme into a character value.
   * The lexeme is taken to be the literal's text without the
   * surrounding quotes (the code indexes it from position 0).
   * Handles simple characters, the standard escapes
   * (\b \t \n \f \r \" \' \\), Unicode escapes (one or more 'u's
   * followed by hex digits), and octal escapes.  A backslash
   * followed by anything else is parsed as a decimal number.
   * Malformed literals are reported on System.err and yield
   * '\u0000'.
   *
   * @param lexeme the character literal's text
   * @return the character the literal denotes
   */
  private char lexemeToChar( String lexeme )
  {
    if ( lexeme == null || lexeme.length() == 0 )
    {
      System.err.println( "Invalid Character Literal" );
      return '\u0000';
    }

    // Simple character literal
    if ( lexeme.charAt( 0 ) != '\\' )
    {
      return lexeme.charAt( 0 );
    }

    // A lone backslash has nothing to escape.
    if ( lexeme.length() < 2 )
    {
      System.err.println( "Invalid Character Literal" );
      return '\u0000';
    }

    char selector = lexeme.charAt( 1 );

    // Single-character escapes
    if ( lexeme.length() == 2 )
    {
      switch ( selector )
      {
        case 'b':  return '\b';
        case 't':  return '\t';
        case 'n':  return '\n';
        case 'f':  return '\f';
        case 'r':  return '\r';
        case '"':  return '"';
        case '\'': return '\'';
        case '\\': return '\\';
        default:   break;
      }
    }

    try
    {
      // Unicode character literal; any number of 'u's is allowed.
      if ( selector == 'u' )
      {
        int start = 2;
        while ( start < lexeme.length() &&
                lexeme.charAt( start ) == 'u' )
        {
          start++;
        }
        return (char) Integer.parseInt( lexeme.substring( start ),
                                        16 );
      }
      // Octal character literal: every digit after the backslash
      // is octal, with or without a leading zero.
      if ( selector >= '0' && selector <= '7' )
      {
        return (char) Integer.parseInt( lexeme.substring( 1 ), 8 );
      }
      // Decimal character literal
      return (char) Integer.parseInt( lexeme.substring( 1 ) );
    }
    catch ( NumberFormatException nfe )
    {
      System.err.println( "Invalid Character Literal" );
      return '\u0000';
    }
  }

  // Testers
  // ================================================================
  // Method isKeyword()
  // ----------------------------------------------------------------
  /**
   * Tells whether hash appears in JavaKeywords.keywordHashCodes.
   * Only hash codes are compared, so a lexeme whose hash collides
   * with an entry in the table is also reported as a keyword.
   */
  private boolean isKeyword( int hash )
  {
    int[] t = JavaKeywords.keywordHashCodes;
    for ( int i = 0; i < t.length; i++ )
    {
      if ( t[i] == hash )
      {
        return true;
      }
    }
    return false;
  }

  // Method isOperator()
  // ----------------------------------------------------------------
  /**
   * Tells whether hash appears in JavaOperators.operatorHashCodes.
   * Only hash codes are compared, as in isKeyword().
   */
  private boolean isOperator( int hash )
  {
    int[] t = JavaOperators.operatorHashCodes;
    for ( int i = 0; i < t.length; i++ )
    {
      if ( t[i] == hash )
      {
        return true;
      }
    }
    return false;
  }

  // Method isSeparator()
  // ----------------------------------------------------------------
  /**
   * Tells whether lexeme is exactly one of the single-character
   * separators ( ) { } [ ] ; , .
   */
  private boolean isSeparator( String lexeme )
  {
    String seps = "(){}[];,.";
    return lexeme.length() == 1 &&
           seps.indexOf( lexeme.charAt( 0 ) ) != -1;
  }

  // Iterator Interface
  // =================================================================
  // Method hasNext()
  // -----------------------------------------------------------------
  /**
   * Tells whether another token is available.
   */
  public boolean hasNext()
  {
    return thisToken != null;
  }

  // Method next()
  // ----------------------------------------------------------------
  /**
   * Returns the lookahead token and advances to the one after it.
   * If reading ahead fails, "Error Reading" is printed to
   * System.err and the failure is treated as end of input.
   *
   * @return the next JavaToken, or null if there are no more
   */
  public Object next()
  {
    JavaToken returnVal = thisToken;
    try
    {
      constructNextToken();
    }
    catch ( IOException ioe )
    {
      System.err.println( "Error Reading" );
      // Only findNextLexeme() can throw, and it is always called
      // after thisToken has been replaced, so the lookahead is
      // still valid.  Mark the input finished so the stale lexeme
      // is not tokenized again on every later call.
      thisLexemeType = LT_EOF;
      compoundLexeme = new StringTokenizer( "" );
    }
    return returnVal;
  }

  // Method remove()
  // ----------------------------------------------------------------
  /**
   * Not supported: tokens cannot be removed from the stream.
   *
   * @throws UnsupportedOperationException always
   */
  public void remove()
  {
    throw new UnsupportedOperationException( "remove" );
  }
}