Sample lexer definition file: java.lex - a subset of Java
Text outside the sections is ignored.

The generated JavaLexer class can be run like this (after compilation):

java JavaLexer JavaLexer.java

- this will print out a list of tokens in JavaLexer.java.


The Import section is optional and contains imports
%Import
	import java_cup.runtime.*;
	import java.util.*;
Import%

The ClassOptions section is optional and supports only the className option.

%ClassOptions
className=JavaLexer
ClassOptions%

The LexerClass section is NOT OPTIONAL and is copied into the generated lexer class definition.
There should be a private void init() method defined here
%LexerClass
	
	/**
	 * Command-line test driver.
	 * Lexes the file named by args[0], prints one line per token
	 * (sym, value, left, right) and finally the elapsed time in milliseconds.
	 * Any exception raised while opening or lexing the file is printed as a
	 * stack trace.
	 *
	 * @param args args[0] is the path of the file to tokenize
	 */
	public static void main( String[] args )
	{
		// Without a file name there is nothing to lex; say so instead of
		// failing with an ArrayIndexOutOfBoundsException stack trace.
		if( args.length < 1 )
		{
			System.err.println("Usage: java JavaLexer <source-file>");
			return;
		}
		try
		{
			BufferedReader reader = new BufferedReader(new FileReader(args[0]));
			try
			{
				JavaLexer lexer = new JavaLexer( reader );
				Symbol s;
				long start = System.currentTimeMillis();
				// next_token() yields null once the eof rule is reached
				while( ( s = lexer.next_token() ) != null )
				{
					System.out.println(s.sym + ",'" + s.value + "'," + s.left + "," + s.right);
				}
				long end = System.currentTimeMillis();
				System.out.println("Lexing '" + args[0] + "': " + (end-start) + " ms" );
			}
			finally
			{
				// release the file handle even when lexing throws
				reader.close();
			}
		}
		catch( Exception e )
		{
			e.printStackTrace();
		}
	}

	
	/**
	 * Reports a lexical error on standard error and terminates the JVM with
	 * exit status 1. The report contains the current token text, the line
	 * number, the start and end index of the token, and the supplied detail.
	 *
	 * @param s detail message appended to the report (may be empty)
	 */
	private void error( String s )
	{
		final String lexeme = getText();
		final String where = "(line" + getLineNumber() + "," + getStartIndex() + "," + getEndIndex() + ") ";
		System.err.println("Lexical error around '" + lexeme + "'" + where + s);
		System.exit(1);
	}

	// Maps a keyword's text to its token code (as an Integer); filled by init()
	// and consulted by getKeyword().
	private Hashtable keyWordDic = new Hashtable();
	
	// Token codes. All of them are compile-time constants; the numeric values
	// are kept exactly as they were so that existing consumers keep working.

	//special constants
	public static final int EOF = 1;
	public static final int WHITESPACE = 2;
	public static final int OTHER = 42;

	//complex constants
	public static final int STRING = 3;
	public static final int INTEGER = 4;
	public static final int C_STYLE_COMMENT = 5;
	public static final int CPP_STYLE_COMMENT = 6;
	public static final int CHAR = 7;
	public static final int IDENTIFIER = 8;
	
	// simple constants
	public static final int LEFT_PAR = 101;
	public static final int RIGHT_PAR = 102;
	public static final int LEFT_BRACE = 103;
	public static final int RIGHT_BRACE = 104;
	public static final int C_STYLE_COMMENT_END = 105;
	public static final int ASSIGN = 106;
	public static final int EQUALS = 107;
	public static final int INCREMENT = 108;
	public static final int DECREMENT = 109;
	public static final int COLON = 1010;
	public static final int SEMI_COLON = 1011;
	public static final int DOT = 1012;
	public static final int PLUS = 1013;
	public static final int MINUS = 1014;
	public static final int STAR = 1015;
	public static final int DIVIDE = 1016;
	public static final int COMMA = 1017;
	public static final int LOGICAL_OR = 1018;
	public static final int LOGICAL_AND = 1019;
	public static final int LOGICAL_NOT = 1020;
	public static final int LESS_THAN = 1021;
	public static final int BIGGER_THAN = 1022;
	public static final int LESS_THAN_EQUALS = 1023;
	public static final int BIGGER_THAN_EQUALS = 1024;
	public static final int QUESTION = 1025;
	public static final int UNDERSCORE = 1026;
	// K_LEFT_PAR / K_RIGHT_PAR are the codes returned for '[' and ']'
	public static final int K_LEFT_PAR = 1027;
	public static final int K_RIGHT_PAR = 1028;
	
	
	// keyword constants
	public static final int PUBLIC = 1001;
	public static final int PRIVATE = 1002;
	public static final int CLASS = 1003;
	public static final int VOID = 1004;
	public static final int EXTENDS = 1005;
	public static final int IMPLEMENTS = 1006;
	public static final int INT = 1007;
	public static final int IF = 1008;
	public static final int THEN = 1009;
	public static final int ELSE = 10010;

	
	/**
	 * Fills the keyword dictionary: each reserved word is mapped to its
	 * token code, boxed as an Integer.
	 */
	private void init()
	{
		// words[i] is registered under codes[i]
		final String[] words = {
			"public", "private", "class", "void", "extends",
			"implements", "int", "if", "else", "then"
		};
		final int[] codes = {
			PUBLIC, PRIVATE, CLASS, VOID, EXTENDS,
			IMPLEMENTS, INT, IF, ELSE, THEN
		};
		for( int i = 0; i < words.length; i++ )
		{
			keyWordDic.put( words[i], new Integer( codes[i] ) );
		}
	}

	/**
	 * Builds the Symbol for a word matched by the identifier rule.
	 * A word found in the keyword dictionary gets that keyword's token code;
	 * any other word is reported as IDENTIFIER.
	 *
	 * @param str   the matched text, also stored as the symbol's value
	 * @param start start index passed through to the Symbol
	 * @param end   end index passed through to the Symbol
	 * @return the keyword or identifier symbol
	 */
	private Symbol getKeyword( String str, int start, int end )
	{
		final Integer code = (Integer)keyWordDic.get( str );
		final int tokenId = ( code != null ) ? code.intValue() : IDENTIFIER;
		return new Symbol( tokenId, start, end, str );
	}



LexerClass%


The Java section is optional and copied in after the Yylex class 
%Java
// no other classes defined
Java%


The LexerDef part IS NOT OPTIONAL and contains the definitions as REGEXP,%{CODE%} pairs
Patterns that should be ignored MUST NOT define any code or text! (see whitespace)
%LexerDef

	' '|'\t'|'\n'|'\r'
		%{	 
		%}
	'\"'((^'\"')*)'\"'
		%{
			// String literal: an opening quote, any run of non-quote characters, a closing quote.
			// The pattern does not exclude line breaks, so a literal containing one is rejected here.
			// NOTE(review): an escaped quote inside the literal looks like it would end the match early - confirm.
			String str = getText();
			if( str.indexOf("\n") != -1 )
			{
				error("Newline not permitted in strings");
			}
			// The symbol value is the matched text, including both quotes.
			return new Symbol( STRING, getStartIndex(), getEndIndex(), str );
		%}
	'/''*' (((^'*')|('*'(^'/')))*) '*''/'	
		%{
			// C-style comment: the body is a run of non-star characters or of a star followed by a non-slash.
			// The whole comment text, delimiters included, becomes the symbol value.
			// NOTE(review): a comment closed by a run of stars (for example one ending in "**/") may not match this pattern - confirm against the generator's regexp semantics.
			return new Symbol(C_STYLE_COMMENT,getStartIndex(),getEndIndex(),getText());
		%}
	'/''/'((^'\n')*)'\n'
		%{
			// C++-style comment: two slashes, then everything up to and including the line feed.
			// trim() strips the matched line feed (and any other leading/trailing whitespace) from the value.
			// NOTE(review): the pattern requires a terminating line feed, so a comment on an unterminated last line presumably does not match - confirm.
			return new Symbol( CPP_STYLE_COMMENT, getStartIndex(), getEndIndex(), getText().trim() );
		%}
	'\''((['\0'-'\38']|['\40'-'\91']|['\93'-'\127'])|('\\'['\0'-'\127']))'\''
		%{
			// Character literal: a single character between single quotes, or a backslash followed by one character.
			// NOTE(review): the numeric escapes appear to be decimal character codes, so the gaps at 39 and 92
			// would exclude an unescaped single quote and backslash - confirm against the generator.
			// The symbol value is the matched text, including both quotes.
			return new Symbol( CHAR, getStartIndex(), getEndIndex(), getText() );
		%}
	['0'-'9']+ 
		%{	 
			// Integer constant: one or more decimal digits.
			// The digits are passed on as text; no numeric conversion happens here.
			return new Symbol( INTEGER, getStartIndex(), getEndIndex(), getText() );
		%}
	'>''='
		%{
			// Operators and punctuation: each rule returns a Symbol whose value is the matched text.
			// Every two-character operator is listed ahead of the one-character operator it starts with,
			// so the result does not depend on how the generator resolves rule order.
			return new Symbol(BIGGER_THAN_EQUALS, getStartIndex(), getEndIndex(), getText() );
		%}
	'<''=' %{ return new Symbol(LESS_THAN_EQUALS, getStartIndex(), getEndIndex(), getText() ); %}
	'&''&' %{ return new Symbol(LOGICAL_AND, getStartIndex(), getEndIndex(), getText() ); %}
	'|''|' %{ return new Symbol(LOGICAL_OR, getStartIndex(), getEndIndex(), getText() ); %}
	'+''+' %{ return new Symbol(INCREMENT, getStartIndex(), getEndIndex(), getText() ); %}
	'-''-' %{ return new Symbol(DECREMENT, getStartIndex(), getEndIndex(), getText() ); %}
	'=''=' %{ return new Symbol(EQUALS, getStartIndex(), getEndIndex(), getText() ); %}
	',' %{ return new Symbol(COMMA, getStartIndex(), getEndIndex(), getText() ); %}
	';' %{ return new Symbol(SEMI_COLON, getStartIndex(), getEndIndex(), getText() ); %}
	':' %{ return new Symbol(COLON, getStartIndex(), getEndIndex(), getText() ); %}
	'.' %{ return new Symbol(DOT, getStartIndex(), getEndIndex(), getText() ); %}
	'*' %{ return new Symbol(STAR, getStartIndex(), getEndIndex(), getText() ); %}
	'/' %{ return new Symbol(DIVIDE, getStartIndex(), getEndIndex(), getText() ); %}
	'+' %{ return new Symbol(PLUS, getStartIndex(), getEndIndex(), getText() ); %}
	'-' %{ return new Symbol(MINUS, getStartIndex(), getEndIndex(), getText() ); %}
	'(' %{ return new Symbol(LEFT_PAR, getStartIndex(), getEndIndex(), getText() ); %}
	')' %{ return new Symbol(RIGHT_PAR, getStartIndex(), getEndIndex(), getText() ); %}
	'{' %{ return new Symbol(LEFT_BRACE, getStartIndex(), getEndIndex(), getText() ); %}
	'}' %{ return new Symbol(RIGHT_BRACE, getStartIndex(), getEndIndex(), getText() ); %}
	'=' %{ return new Symbol(ASSIGN, getStartIndex(), getEndIndex(), getText() ); %}
	'!' %{ return new Symbol(LOGICAL_NOT, getStartIndex(), getEndIndex(), getText() ); %}
	'<' %{ return new Symbol(LESS_THAN, getStartIndex(), getEndIndex(), getText() ); %}
	'>' %{ return new Symbol(BIGGER_THAN, getStartIndex(), getEndIndex(), getText() ); %}
	'?' %{ return new Symbol(QUESTION, getStartIndex(), getEndIndex(), getText() ); %}
	'_' %{ return new Symbol(UNDERSCORE, getStartIndex(), getEndIndex(), getText() ); %}
	'\[' %{ return new Symbol(K_LEFT_PAR, getStartIndex(), getEndIndex(), getText() ); %}
	'\]' %{ return new Symbol(K_RIGHT_PAR, getStartIndex(), getEndIndex(), getText() ); %}
	(['a'-'z']|['A'-'Z'])((['a'-'z']|['A'-'Z']|['0'-'9'])*)
		%{
			// Word: a letter followed by any number of letters or digits (no underscore).
			// getKeyword (LexerClass section) decides whether it is a keyword or an identifier.
			return getKeyword(getText(),getStartIndex(),getEndIndex());
		%}
	eof 
		%{
			// eof returns null by default; no action code is needed here.
			// main() stops its token loop when next_token() returns null.
		%}		
	.	
		%{
			// Catch-all for a single character: report a lexical error with no extra detail.
			// error() prints the current text and position, then exits the JVM.
			error("");
		%}
LexerDef%
Source: geocities.com/sunsetstrip/palladium/1303/java/ulex

( geocities.com/sunsetstrip/palladium/1303/java) ( geocities.com/sunsetstrip/palladium/1303) ( geocities.com/sunsetstrip/palladium) ( geocities.com/sunsetstrip)