Sample lexer definition file: java.lex - a subset of Java
Text outside the five sections (Import, ClassOptions, LexerClass, Java, LexerDef) is ignored.
The generated JavaLexer class can be run like this (after compilation):
java JavaLexer JavaLexer.java
- this will print out a list of tokens in JavaLexer.java.
The Import section is optional and contains imports
%Import
import java_cup.runtime.*;
import java.util.*;
Import%
The ClassOptions section is optional and supports only the className option.
%ClassOptions
className=JavaLexer
ClassOptions%
The LexerClass section is NOT OPTIONAL and is copied into the generated lexer class definition
There should be a private void init() method defined here
%LexerClass
/**
 * Test driver: lexes the file named by the first command-line argument and
 * prints one line per token in the form {@code sym,'value',left,right},
 * followed by the elapsed lexing time in milliseconds.
 *
 * Lexing stops when next_token() returns null. Any exception is reported
 * with a stack trace; the input reader is closed in all cases.
 *
 * @param args args[0] is the path of the file to tokenize
 */
public static void main( String[] args )
{
    // Without this check a missing argument only shows up as an
    // ArrayIndexOutOfBoundsException stack trace.
    if( args.length < 1 )
    {
        System.err.println("Usage: java JavaLexer <source-file>");
        return;
    }
    BufferedReader reader = null;
    try
    {
        reader = new BufferedReader(new FileReader(args[0]));
        JavaLexer lexer = new JavaLexer( reader );
        Symbol s;
        long start = System.currentTimeMillis();
        while( ( s = lexer.next_token() ) != null )
        {
            System.out.println(s.sym + ",'" + s.value + "'," + s.left + "," + s.right);
        }
        long end = System.currentTimeMillis();
        System.out.println("Lexing '" + args[0] + "': " + (end-start) + " ms" );
    }
    catch( Exception e )
    {
        e.printStackTrace();
    }
    finally
    {
        // Release the file handle even when lexing fails part-way through.
        if( reader != null )
        {
            try
            {
                reader.close();
            }
            catch( Exception ignored )
            {
                // Nothing useful can be done if closing the input fails.
            }
        }
    }
}
/**
 * Prints a lexical-error report to standard error and terminates the JVM
 * with exit status 1.
 *
 * The report contains the current token text, the line number, the start
 * and end index of the token, and the caller-supplied detail message.
 *
 * @param s detail message appended to the report
 */
private void error( String s )
{
    // Built in the same left-to-right order as the reported fields.
    String context = "Lexical error around '" + getText() + "'";
    String location = "(line" + getLineNumber() + "," + getStartIndex() + "," + getEndIndex() + ") ";
    System.err.println(context + location + s);
    System.exit(1);
}
// Maps a keyword's text to its token code (stored as an Integer).
private Hashtable keyWordDic = new Hashtable();

// Token codes, passed as the first argument of every Symbol created by the
// lexer. All of them are final so they cannot be reassigned at run time and
// can be used as case labels in a switch.

// special constants (not referenced by any rule visible in this file)
public static final int EOF = 1;
public static final int WHITESPACE = 2;
public static final int OTHER = 42;

// complex constants: tokens whose text varies
public static final int STRING = 3;
public static final int INTEGER = 4;
public static final int C_STYLE_COMMENT = 5;
public static final int CPP_STYLE_COMMENT = 6;
public static final int CHAR = 7;
public static final int IDENTIFIER = 8;

// simple constants: operators and punctuation
public static final int LEFT_PAR = 101;
public static final int RIGHT_PAR = 102;
public static final int LEFT_BRACE = 103;
public static final int RIGHT_BRACE = 104;
public static final int C_STYLE_COMMENT_END = 105;
public static final int ASSIGN = 106;
public static final int EQUALS = 107;
public static final int INCREMENT = 108;
public static final int DECREMENT = 109;
public static final int COLON = 1010;
public static final int SEMI_COLON = 1011;
public static final int DOT = 1012;
public static final int PLUS = 1013;
public static final int MINUS = 1014;
public static final int STAR = 1015;
public static final int DIVIDE = 1016;
public static final int COMMA = 1017;
public static final int LOGICAL_OR = 1018;
public static final int LOGICAL_AND = 1019;
public static final int LOGICAL_NOT = 1020;
public static final int LESS_THAN = 1021;
public static final int BIGGER_THAN = 1022;
public static final int LESS_THAN_EQUALS = 1023;
public static final int BIGGER_THAN_EQUALS = 1024;
public static final int QUESTION = 1025;
public static final int UNDERSCORE = 1026;
// K_LEFT_PAR / K_RIGHT_PAR are the codes returned for '[' and ']'.
public static final int K_LEFT_PAR = 1027;
public static final int K_RIGHT_PAR = 1028;

// keyword constants: looked up by text through keyWordDic
public static final int PUBLIC = 1001;
public static final int PRIVATE = 1002;
public static final int CLASS = 1003;
public static final int VOID = 1004;
public static final int EXTENDS = 1005;
public static final int IMPLEMENTS = 1006;
public static final int INT = 1007;
public static final int IF = 1008;
public static final int THEN = 1009;
public static final int ELSE = 10010;
/**
 * Fills keyWordDic with every reserved word and its token code, so that
 * getKeyword can tell keywords from plain identifiers.
 */
private void init()
{
    // words[i] is registered under codes[i]; keep the two arrays aligned.
    final String[] words = {
        "public", "private", "class", "void", "extends",
        "implements", "int", "if", "else", "then"
    };
    final int[] codes = {
        PUBLIC, PRIVATE, CLASS, VOID, EXTENDS,
        IMPLEMENTS, INT, IF, ELSE, THEN
    };
    for( int i = 0; i < words.length; i++ )
    {
        keyWordDic.put( words[i], new Integer( codes[i] ) );
    }
}
/**
 * Builds the token for a word matched by the identifier rule.
 *
 * @param str   the matched text
 * @param start start index stored in the Symbol
 * @param end   end index stored in the Symbol
 * @return a Symbol carrying the keyword's code when str is registered in
 *         keyWordDic, otherwise a Symbol with code IDENTIFIER; the matched
 *         text is the Symbol's value in both cases
 */
private Symbol getKeyword( String str, int start, int end )
{
    Object registered = keyWordDic.get( str );
    // No entry means the word is not reserved.
    int code = ( registered != null ) ? ((Integer)registered).intValue() : IDENTIFIER;
    return new Symbol( code, start, end, str );
}
LexerClass%
The Java section is optional and copied in after the Yylex class
%Java
// no other classes defined
Java%
The LexerDef part IS NOT OPTIONAL and contains the definitions as REGEXP,%{CODE%} pairs
Patterns that should be ignored MUST NOT define any code or text! (see whitespace)
%LexerDef
' '|'\t'|'\n'|'\r'
%{
%}
'\"'((^'\"')*)'\"'
%{
// String literal: returns a STRING Symbol whose value is the matched text.
// NOTE(review): the pattern appears to stop at the first double quote, so an
// escaped quote inside a literal would end the token early - confirm.
String str = getText();
// The pattern itself does not appear to exclude line breaks, so a literal
// containing a newline is rejected here through error().
if( str.indexOf("\n") != -1 )
{
error("Newline not permitted in strings");
}
return new Symbol( STRING, getStartIndex(), getEndIndex(), str );
%}
'/''*' (((^'*')|('*'(^'/')))*) '*''/'
%{
// C-style comment: returned as a C_STYLE_COMMENT token carrying the full
// matched text rather than being skipped.
// NOTE(review): the body alternates "non-star" and "star followed by
// non-slash", so a comment whose closing slash is preceded by an even number
// of stars may fail to match - verify against the generator.
return new Symbol(C_STYLE_COMMENT,getStartIndex(),getEndIndex(),getText());
%}
'/''/'((^'\n')*)'\n'
%{
// C++-style comment: returned as a CPP_STYLE_COMMENT token. The matched text
// is trimmed before it is stored as the Symbol's value; the start and end
// indices passed to the Symbol are those of the untrimmed match.
// NOTE(review): the pattern requires a terminating '\n', so a comment on the
// last line of a file without a final newline presumably does not match -
// confirm.
return new Symbol( CPP_STYLE_COMMENT, getStartIndex(), getEndIndex(), getText().trim() );
%}
'\''((['\0'-'\38']|['\40'-'\91']|['\93'-'\127'])|('\\'['\0'-'\127']))'\''
%{
// Character literal: either one plain character or a backslash followed by
// one character, between single quotes. The numeric escapes in the pattern
// appear to be decimal character codes: the ranges skip 39 (single quote)
// and 92 (backslash) - confirm against the generator's escape syntax.
// The Symbol's value is the matched text as returned by getText().
return new Symbol( CHAR, getStartIndex(), getEndIndex(), getText() );
%}
['0'-'9']+
%{
// Integer constant: one or more decimal digits. The Symbol's value is the
// matched text; it is not converted to a number here.
return new Symbol( INTEGER, getStartIndex(), getEndIndex(), getText() );
%}
'>''=' %{ /* Operator and punctuation rules: each returns a Symbol with its token code, the start and end index, and the matched text. */ return new Symbol(BIGGER_THAN_EQUALS, getStartIndex(), getEndIndex(), getText() ); %}
'&''&' %{ return new Symbol(LOGICAL_AND, getStartIndex(), getEndIndex(), getText() ); %}
'|''|' %{ return new Symbol(LOGICAL_OR, getStartIndex(), getEndIndex(), getText() ); %}
'+''+' %{ return new Symbol(INCREMENT, getStartIndex(), getEndIndex(), getText() ); %}
'-''-' %{ return new Symbol(DECREMENT, getStartIndex(), getEndIndex(), getText() ); %}
'=''=' %{ return new Symbol(EQUALS, getStartIndex(), getEndIndex(), getText() ); %}
',' %{ return new Symbol(COMMA, getStartIndex(), getEndIndex(), getText() ); %}
';' %{ return new Symbol(SEMI_COLON, getStartIndex(), getEndIndex(), getText() ); %}
':' %{ return new Symbol(COLON, getStartIndex(), getEndIndex(), getText() ); %}
'.' %{ return new Symbol(DOT, getStartIndex(), getEndIndex(), getText() ); %}
'*' %{ return new Symbol(STAR, getStartIndex(), getEndIndex(), getText() ); %}
'/' %{ return new Symbol(DIVIDE, getStartIndex(), getEndIndex(), getText() ); %}
'+' %{ return new Symbol(PLUS, getStartIndex(), getEndIndex(), getText() ); %}
'-' %{ return new Symbol(MINUS, getStartIndex(), getEndIndex(), getText() ); %}
'(' %{ return new Symbol(LEFT_PAR, getStartIndex(), getEndIndex(), getText() ); %}
')' %{ return new Symbol(RIGHT_PAR, getStartIndex(), getEndIndex(), getText() ); %}
'{' %{ return new Symbol(LEFT_BRACE, getStartIndex(), getEndIndex(), getText() ); %}
'}' %{ return new Symbol(RIGHT_BRACE, getStartIndex(), getEndIndex(), getText() ); %}
'=' %{ return new Symbol(ASSIGN, getStartIndex(), getEndIndex(), getText() ); %}
'!' %{ return new Symbol(LOGICAL_NOT, getStartIndex(), getEndIndex(), getText() ); %}
'<' %{ return new Symbol(LESS_THAN, getStartIndex(), getEndIndex(), getText() ); %}
'>' %{ return new Symbol(BIGGER_THAN, getStartIndex(), getEndIndex(), getText() ); %}
'?' %{ return new Symbol(QUESTION, getStartIndex(), getEndIndex(), getText() ); %}
'_' %{ return new Symbol(UNDERSCORE, getStartIndex(), getEndIndex(), getText() ); %}
'\[' %{ return new Symbol(K_LEFT_PAR, getStartIndex(), getEndIndex(), getText() ); %}
'\]' %{ return new Symbol(K_RIGHT_PAR, getStartIndex(), getEndIndex(), getText() ); %}
'<''=' %{ /* NOTE(review): declared after the single '<' rule, whereas '>''=' precedes '>'; presumably this relies on the generator preferring the longest match - confirm. */ return new Symbol(LESS_THAN_EQUALS, getStartIndex(), getEndIndex(), getText() ); %}
(['a'-'z']|['A'-'Z'])((['a'-'z']|['A'-'Z']|['0'-'9'])*)
%{
// A letter followed by any number of letters or digits. The pattern does not
// include '_', which has its own UNDERSCORE rule.
// getKeyword (LexerClass section) returns the keyword's token when the text
// is a registered keyword, otherwise an IDENTIFIER token.
return getKeyword(getText(),getStartIndex(),getEndIndex());
%}
eof
%{
// eof returns null by default
// The test driver in main() stops its token loop on that null.
%}
.
%{
// Catch-all rule: presumably reached only for input that no earlier rule
// matches. error() prints the offending text and position to standard error
// and exits; the empty string means no extra detail is added to the report.
error("");
%}
LexerDef%
               (
geocities.com/sunsetstrip/palladium/1303/java)                   (
geocities.com/sunsetstrip/palladium/1303)                   (
geocities.com/sunsetstrip/palladium)                   (
geocities.com/sunsetstrip)