Go to the first, previous, next, last section, table of contents.


Example Grammar

The regular expression grammar will be used as an example.

The Grammar


# GRAMMAR DECLARATION
this is regexp version 0.1.0;

# PROPERTY DEFINITIONS
property compile-sourcepath = "./src";
property compile-namespace = "com.inxar.syntacs.translator.regexp";
property compile-interpreter-classname = 
    "com.inxar.syntacs.translator.regexp.RegexpInterpreter";

# TERMINAL DECLARATIONS
terminal WHITESPACE;
terminal CHAR;
terminal CHAR_CLASS_CHAR;
terminal PIPE;
terminal STAR;
terminal QUESTION;
terminal PLUS;
terminal OPEN_PAREN;
terminal CLOSE_PAREN;
terminal OPEN_BRACKET;
terminal OPEN_BRACKET_CARET;
terminal OPEN_BRACKET_DASH;
terminal OPEN_BRACKET_CARET_DASH;
terminal CLOSE_BRACKET;
terminal CHAR_CLASS_DASH;
terminal ESC_PIPE;
terminal ESC_STAR;
terminal ESC_QUESTION;
terminal ESC_PLUS;
terminal ESC_OPEN_PAREN;
terminal ESC_CLOSE_PAREN;
terminal ESC_OPEN_BRACKET;
terminal ESC_CLOSE_BRACKET;
terminal ESC_BACKSLASH;
terminal ESC_SPACE;
terminal ESC_TAB;
terminal ESC_VERTICAL_TAB;
terminal ESC_CR;
terminal ESC_LF;
terminal ESC_OCTAL;
terminal ESC_UNICODE;

# TERMINAL DEFINITIONS
WHITESPACE matches "(\t|\n|\v|\r|\s)+";
CHAR matches "[^\\|()[\]*+?]";
CHAR_CLASS_CHAR matches "[^-\]\\]";
PIPE matches "\|";
STAR matches "\*";
QUESTION matches "\?";
PLUS matches "\+";
OPEN_PAREN matches "\(";
CLOSE_PAREN matches "\)";
OPEN_BRACKET matches "(\[(\t|\n|\v|\r|\s)*)";
OPEN_BRACKET_CARET matches "(\[(\t|\n|\v|\r|\s)*^)";
OPEN_BRACKET_DASH matches "(\[(\t|\n|\v|\r|\s)*-)";
OPEN_BRACKET_CARET_DASH matches "(\[(\t|\n|\v|\r|\s)*^(\t|\n|\v|\r|\s)*-)";
CLOSE_BRACKET matches "\]";
CHAR_CLASS_DASH matches "(-)";
ESC_PIPE matches "(\\\|)";
ESC_STAR matches "(\\\*)";
ESC_QUESTION matches "(\\\?)";
ESC_PLUS matches "(\\\+)";
ESC_OPEN_PAREN matches "(\\\()";
ESC_CLOSE_PAREN matches "(\\\))";
ESC_OPEN_BRACKET matches "(\\\[)";
ESC_CLOSE_BRACKET matches "(\\\])";
ESC_BACKSLASH matches "(\\\\)";
ESC_SPACE matches "(\\s)";
ESC_TAB matches "(\\t)";
ESC_VERTICAL_TAB matches "(\\v)";
ESC_CR matches "(\\r)";
ESC_LF matches "(\\n)";
ESC_OCTAL matches "(\\0[0-3][0-7][0-7])";
ESC_UNICODE matches "(\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])";

# NONTERMINAL DECLARATIONS
nonterminal Goal;
nonterminal Union;
nonterminal Concat;
nonterminal Term;
nonterminal Quantifier;
nonterminal Atom;
nonterminal CharClass;
nonterminal CharClassBegin;
nonterminal CharClassTermList;
nonterminal CharClassTerm;
nonterminal CharClassAtom;

# NONTERMINAL DEFINITIONS
reduce Goal when Union;
reduce Union when Concat;
reduce Union when Union PIPE Concat;
reduce Concat when Term;
reduce Concat when Concat Term;
reduce Term when Atom;
reduce Term when Atom Quantifier;
reduce Quantifier when STAR;
reduce Quantifier when PLUS;
reduce Quantifier when QUESTION;
reduce Atom when CHAR;
reduce Atom when ESC_BACKSLASH;
reduce Atom when ESC_PIPE;
reduce Atom when ESC_PLUS;
reduce Atom when ESC_STAR;
reduce Atom when ESC_QUESTION;
reduce Atom when ESC_OPEN_BRACKET;
reduce Atom when ESC_CLOSE_BRACKET;
reduce Atom when ESC_OPEN_PAREN;
reduce Atom when ESC_CLOSE_PAREN;
reduce Atom when ESC_SPACE;
reduce Atom when ESC_TAB;
reduce Atom when ESC_VERTICAL_TAB;
reduce Atom when ESC_CR;
reduce Atom when ESC_LF;
reduce Atom when ESC_OCTAL;
reduce Atom when ESC_UNICODE;
reduce Atom when CharClass;
reduce Atom when OPEN_PAREN Union CLOSE_PAREN;
reduce CharClass when CharClassBegin CharClassTermList CLOSE_BRACKET;
reduce CharClass when OPEN_BRACKET_CARET_DASH CLOSE_BRACKET;
reduce CharClassBegin when OPEN_BRACKET;
reduce CharClassBegin when OPEN_BRACKET_CARET;
reduce CharClassBegin when OPEN_BRACKET_DASH;
reduce CharClassBegin when OPEN_BRACKET_CARET_DASH;
reduce CharClassTermList when CharClassTerm;
reduce CharClassTermList when CharClassTermList CharClassTerm;
reduce CharClassTerm when CharClassAtom;
reduce CharClassTerm when CharClassAtom CHAR_CLASS_DASH CharClassAtom;
reduce CharClassAtom when CHAR_CLASS_CHAR;
reduce CharClassAtom when ESC_BACKSLASH;
reduce CharClassAtom when ESC_CLOSE_BRACKET;
reduce CharClassAtom when ESC_SPACE;
reduce CharClassAtom when ESC_TAB;
reduce CharClassAtom when ESC_VERTICAL_TAB;
reduce CharClassAtom when ESC_CR;
reduce CharClassAtom when ESC_LF;
reduce CharClassAtom when ESC_OCTAL;
reduce CharClassAtom when ESC_UNICODE;

accept when Goal;

# CONTEXT DECLARATIONS
context default;
context charclass;

# CONTEXT DEFINITIONS
default includes WHITESPACE, CHAR, PIPE, STAR, QUESTION, PLUS, OPEN_PAREN, CLOSE_PAREN, 
                 ESC_PIPE, ESC_QUESTION, ESC_STAR, ESC_PLUS, ESC_OPEN_PAREN, ESC_CLOSE_PAREN, 
                 ESC_OPEN_BRACKET, ESC_BACKSLASH, ESC_SPACE, ESC_TAB, ESC_VERTICAL_TAB, 
                 ESC_CR, ESC_LF, ESC_OCTAL, ESC_UNICODE, ESC_CLOSE_BRACKET,
                 OPEN_BRACKET shifts charclass, 
                 OPEN_BRACKET_CARET shifts charclass, 
                 OPEN_BRACKET_DASH shifts charclass, 
                 OPEN_BRACKET_CARET_DASH shifts charclass;

charclass includes WHITESPACE, CHAR_CLASS_CHAR, CHAR_CLASS_DASH, 
                   ESC_BACKSLASH, ESC_SPACE, ESC_TAB, ESC_VERTICAL_TAB, ESC_CR, ESC_LF, 
                   ESC_OCTAL, ESC_UNICODE, ESC_CLOSE_BRACKET,
                   CLOSE_BRACKET unshifts;

start in context default;

Several things to note in the grammar:

The TranslatorGrammar

Notice that the grammar is code-free; it does not contain any Java code. This means that user-defined code has to go somewhere else. The paradigm used by the STT is to sandwich an object between the lexer and the parser that is reponsible for interpreting lexer match events. A similar interface exists between the parser and the parser-interpreter with reduction events.

Terminal match events are signaled to the LexerInterpreter through the match(int terminal_type, int offset, int length method. The first argument identifies what kind of terminal (i.e. token) was matched, represented as a number. Therefore, each terminal is given a unique number that identifies it at the time of declaration.

Note: since terminal declarations are separate from terminal definitions (the regular expression), this means that some terminals may never be signaled by the lexer. The rationale behind this is to allow the user more flexibility in how they want to structure their grammar: some authors might prefer to match all keywords using a single regular expression, do a symbol lookup, then pass along the appropriate terminal constant to the parser. This has the potential to have much smaller lexer transition tables.

Therefore, the set of terminals known to the lexer and the set of terminals known to the parser may be non-equal.

Unique numbers are assigned to each terminal, nonterminal, and production in the grammar. When the grammar is read by the syntacs processor, the first thing it generates is the TranslatorGrammar. This class contains all the context, terminal, nonterminal, and production constants as well as functions like getTerminal(int ID) that returns the name of a terminal by number.

These constants are used in the switch statements of the LexerInterpreter and ParserInterpreter implementations.

The TranslatorGrammar object is also a factory for Translator instances of that grammar. Therefore, to get an instance of a Translator that parses regular expressions:

TranslatorGrammar tg = new com.inxar.syntacs.translator.regexp.RegexpGrammar();
Translator t = tg.newTranslator();

To generate the translator grammar, run the syntacs compiler on the grammar:

[user@host]$ sttc regexp.stt

The LexerInterpreter

Once you have the TranslatorGrammar and the grammar constants, you can write the code for the LexerInterpreter and ParserInterpreter.

For the LexerInterpreter, the key method to implement is match(int terminal_type, int offset, int length. Within this method the LexerInterpreter presumably does something significant with the output of the lexer.

For our purposes, the LexerInterpreter will generate new Symbol instances and pass them to the parser. The Symbol interface defines a method getSymbolType() that identifies it as a member of the context-free grammar which the parser was built to recognize.

Here is ther salient code:

    public void match(int type, int off, int len)
    {
	Symbol symbol = null;
	
        switch (type) {

	case RegexpGrammar.T_WHITESPACE:
	    return;

	case RegexpGrammar.T_PIPE:
	case RegexpGrammar.T_OPEN_PAREN:
	case RegexpGrammar.T_CLOSE_PAREN:
	case RegexpGrammar.T_CHAR_CLASS_DASH:
	case RegexpGrammar.T_CLOSE_BRACKET:
	case Token.STOP: 
	    symbol = new ConstantSymbol(type); 
	    break;

	case RegexpGrammar.T_CHAR:
	case RegexpGrammar.T_CHAR_CLASS_CHAR:
	    if (len != 1)
		throw new InternalError("Expected token to be only one character.");
	    symbol = newAtom( type, in.retch(off) );
	    break;

	case RegexpGrammar.T_ESC_OCTAL:
	    {
		char c = (char)0;
		try {
		    c = (char)Integer.parseInt( in.stretch(off + 1, len - 1), 8 );
		} catch (NumberFormatException nfex) {
		    trace("NumberFormatException caught while trying to parse "+
			  in.stretch(off + 1, len - 1));
		    nfex.printStackTrace();
		}
		symbol = newAtom(type, c);
		break;
	    }

	case RegexpGrammar.T_ESC_UNICODE:
	    {
		char c = (char)0;
		try {
		    c = (char)Integer.parseInt( in.stretch(off + 2, len - 2), 16 );
		} catch (NumberFormatException nfex) {
		    nfex.printStackTrace();
		}
		symbol = newAtom(type, c);
		break;
	    }

	case RegexpGrammar.T_ESC_SPACE:         symbol = newAtom(type, ' '); break;
	case RegexpGrammar.T_ESC_TAB:           symbol = newAtom(type, '\t'); break;
	case RegexpGrammar.T_ESC_VERTICAL_TAB:  symbol = newAtom(type, '\013'); break;
	case RegexpGrammar.T_ESC_CR:            symbol = newAtom(type, '\r'); break;
	case RegexpGrammar.T_ESC_LF:            symbol = newAtom(type, '\n'); break;
	case RegexpGrammar.T_ESC_BACKSLASH:     symbol = newAtom(type, '\\'); break;
	case RegexpGrammar.T_ESC_CLOSE_BRACKET: symbol = newAtom(type, ']'); break;
	case RegexpGrammar.T_ESC_PIPE:          symbol = newAtom(type, '|'); break;
	case RegexpGrammar.T_ESC_QUESTION:      symbol = newAtom(type, '?'); break;
	case RegexpGrammar.T_ESC_STAR:          symbol = newAtom(type, '*'); break;
	case RegexpGrammar.T_ESC_PLUS:          symbol = newAtom(type, '+'); break;
	case RegexpGrammar.T_ESC_OPEN_BRACKET:  symbol = newAtom(type, '['); break;
	case RegexpGrammar.T_ESC_OPEN_PAREN:    symbol = newAtom(type, '('); break;
	case RegexpGrammar.T_ESC_CLOSE_PAREN:   symbol = newAtom(type, ')'); break;

	case RegexpGrammar.T_OPEN_BRACKET:            
	    symbol = newClass(type, false, false); break;
	case RegexpGrammar.T_OPEN_BRACKET_CARET:      
	    symbol = newClass(type, true, false); break;
	case RegexpGrammar.T_OPEN_BRACKET_DASH:       
	    symbol = newClass(type, false, true); break;
	case RegexpGrammar.T_OPEN_BRACKET_CARET_DASH: 
	    symbol = newClass(type, true, true); break;

	case RegexpGrammar.T_STAR:     
	    symbol = new QuantifierSymbol(type, Regexp.CLOSURE); break;
	case RegexpGrammar.T_QUESTION: 
	    symbol = new QuantifierSymbol(type, Regexp.OPTIONAL); break;
	case RegexpGrammar.T_PLUS:     
	    symbol = new QuantifierSymbol(type, Regexp.PCLOSURE); break;

	default: 
	    throw new InternalError
		("Expected terminal type: " + grammar.getTerminal(type));
        }

	parser.notify(symbol);
    }

    private Symbol newAtom(int type, char value)
    {
	RegexpAtom atom = new RegexpAtom();
	atom.setValue( value );
	atom.setSymbolType(type);
	return atom;
    }

    private Symbol newClass(int type, boolean isNegated, boolean hasDash)
    {
	CharClassBeginSymbol ccbs = new CharClassBeginSymbol(type);
	ccbs.isNegated = isNegated;
	ccbs.hasDash = hasDash;
	return ccbs;
    }

Some points from the code:

The ParserInterpreter

The ParserInterpreter is implemented in a similar manner: the reduce method is generally a big switch block that handles each production. The ParserInterpreter can do whatever it wants with the symbols that are currently on the top of the parse stack -- if they are to be retained in the parse tree, the ParserInterpreter must fetch it from the stack and include it into the nonterminal symbol that will be returned to the parser.

The Sentence object abstracts the part of the parse stack that is being reduced such that indices into the stack refer to the expected component of the production. For example, when the production `P_CharClass__CharClassBegin_CharClassTermList_CLOSE_BRACKET' is being reduced, sentence.at(0) accesses the `CharClassBegin' symbol, sentence.at(1) accesses `CharClassTermList', and sentence.at(2) accesses `CLOSE_BRACKET'.

    public Symbol reduce(int type, Sentence s)
    {
	Symbol symbol = null;

	switch (type) {

	case RegexpGrammar.P_Quantifier__STAR:
	case RegexpGrammar.P_Quantifier__PLUS:
	case RegexpGrammar.P_Quantifier__QUESTION:
	case RegexpGrammar.P_Term__Atom:
	case RegexpGrammar.P_Atom__CHAR:
	case RegexpGrammar.P_Atom__ESC_BACKSLASH:
	case RegexpGrammar.P_Atom__ESC_PIPE:
	case RegexpGrammar.P_Atom__ESC_PLUS:
	case RegexpGrammar.P_Atom__ESC_STAR:
	case RegexpGrammar.P_Atom__ESC_QUESTION:
	case RegexpGrammar.P_Atom__ESC_OPEN_BRACKET:
	case RegexpGrammar.P_Atom__ESC_CLOSE_BRACKET:
	case RegexpGrammar.P_Atom__ESC_OPEN_PAREN:
	case RegexpGrammar.P_Atom__ESC_CLOSE_PAREN:
	case RegexpGrammar.P_Atom__ESC_SPACE:
	case RegexpGrammar.P_Atom__ESC_TAB:
	case RegexpGrammar.P_Atom__ESC_VERTICAL_TAB:
	case RegexpGrammar.P_Atom__ESC_CR:
	case RegexpGrammar.P_Atom__ESC_LF:
	case RegexpGrammar.P_Atom__ESC_OCTAL:
	case RegexpGrammar.P_Atom__ESC_UNICODE:
	case RegexpGrammar.P_Atom__CharClass:
	case RegexpGrammar.P_CharClassTerm__CharClassAtom:
	case RegexpGrammar.P_CharClassAtom__CHAR_CLASS_CHAR:
	case RegexpGrammar.P_CharClassAtom__ESC_BACKSLASH:
	case RegexpGrammar.P_CharClassAtom__ESC_CLOSE_BRACKET:
	case RegexpGrammar.P_CharClassAtom__ESC_SPACE:
	case RegexpGrammar.P_CharClassAtom__ESC_TAB:
	case RegexpGrammar.P_CharClassAtom__ESC_VERTICAL_TAB:
	case RegexpGrammar.P_CharClassAtom__ESC_CR:
	case RegexpGrammar.P_CharClassAtom__ESC_LF:
	case RegexpGrammar.P_CharClassAtom__ESC_OCTAL:
	case RegexpGrammar.P_CharClassAtom__ESC_UNICODE:
	case RegexpGrammar.P_CharClassBegin__OPEN_BRACKET:
	case RegexpGrammar.P_CharClassBegin__OPEN_BRACKET_CARET:
	case RegexpGrammar.P_CharClassBegin__OPEN_BRACKET_DASH:
	case RegexpGrammar.P_CharClassBegin__OPEN_BRACKET_CARET_DASH:
	    {
		symbol =  s.at(0);
		break;
	    }

	case RegexpGrammar.P_Union__Concat:
	    {
		RegexpList union = new RegexpList(Regexp.UNION);
		union.addRegexp( (Regexp)s.at(0) );
		symbol = union;
		break;
	    }

	case RegexpGrammar.P_Concat__Term:
	    {
		RegexpList concat = new RegexpList(Regexp.CONCAT);
		concat.addRegexp( (Regexp)s.at(0) );
		symbol = concat;
		break;
	    }

	case RegexpGrammar.P_Concat__Concat_Term:
	    {
		RegexpList list = (RegexpList)s.at(0);
		list.addRegexp( (Regexp)s.at(1) );
		symbol = list;
		break;
	    }

	case RegexpGrammar.P_Union__Union_PIPE_Concat:
	    {
		RegexpList list = (RegexpList)s.at(0);
		list.addRegexp( (Regexp)s.at(2) );
		symbol = list;
		break;
	    }

	case RegexpGrammar.P_CharClassTermList__CharClassTerm:
	    {
		symbol = new ListSymbol(type, s.at(0));
		break;
	    }

	case RegexpGrammar.P_CharClassTermList__CharClassTermList_CharClassTerm:
	    {
		ListSymbol sym = (ListSymbol)s.at(0);
		sym.list.add(s.at(1));
		symbol = sym;
		break;
	    }

	case RegexpGrammar.P_Term__Atom_Quantifier:
	    {
		QuantifierSymbol q = (QuantifierSymbol)s.at(1); 
		RegexpTerm term = new RegexpTerm(q.regexpType);
		term.setInternal( (Regexp)s.at(0) );
		symbol = term;
		break;
	    }

	case RegexpGrammar.P_Atom__OPEN_PAREN_Union_CLOSE_PAREN:
	    {
		RegexpTerm term = new RegexpTerm(Regexp.GROUP);
		term.setInternal( (Regexp)s.at(1) );
		symbol = term;
		break;
	    }

	case RegexpGrammar.P_CharClassTerm__CharClassAtom_CHAR_CLASS_DASH_CharClassAtom:
	    {
		RegexpAtom lo = (RegexpAtom)s.at(0);
		RegexpAtom hi = (RegexpAtom)s.at(2);
		symbol = new RegexpRange(lo, hi);
		break;
	    }

	case RegexpGrammar.P_CharClass__CharClassBegin_CharClassTermList_CLOSE_BRACKET:
	    {
		RegexpCharClass cc = new RegexpCharClass();
		CharClassBeginSymbol ccbs = (CharClassBeginSymbol)s.at(0);
		cc.isNegated(ccbs.isNegated);
		cc.hasDash(ccbs.hasDash);
		List list = ((ListSymbol)s.at(1)).list;
		cc.setList(list);
		symbol = cc;
		break;
	    }

	case RegexpGrammar.P_CharClass__OPEN_BRACKET_CARET_DASH_CLOSE_BRACKET:
	    {
		RegexpCharClass cc = new RegexpCharClass();
		CharClassBeginSymbol ccbs = (CharClassBeginSymbol)s.at(0);
		cc.isNegated(ccbs.isNegated);
		cc.hasDash(false);

		RegexpAtom dash_atom = new RegexpAtom();
		dash_atom.setValue('-');
		List list = new ArrayList();
		list.add(dash_atom);

		cc.setList(list);
		symbol = cc;
		break;
	    }

	case RegexpGrammar.P_Goal__Union:
	    {
		Regexp regexp = (Regexp)s.at(0);
		this.regexp = regexp;
		symbol = regexp;
		break;
	    }

	default:
      	    throw new InternalError("Unknown Production: "+grammar.getProduction(type));
	}

	return symbol;
    }

Once again, the LexerInterpreter and ParserInterpreter is typically the same object that implements LRTranslatorInterpreter. Once this object has been defined, you can set the property:

property compile-interpreter-classname = 
 "com.inxar.syntacs.translator.regexp.RegexpInterpreter";

And regenerate the translator. At this point the translator can be used:

TranslatorGrammar tg = new com.inxar.syntacs.translator.regexp.RegexpGrammar();
Translator t = tg.newTranslator();

StringReader in = new StringReader("a|b|c");

try {
    Regexp regexp = (Regexp)translator.translate(in);
} catch (TranslationException ex) {
    ex.printStackTrace();
} 


Go to the first, previous, next, last section, table of contents.