import java.io.*; import java.util.*; /** * Scan an input stream looking for tokens defined for a simple language. * * @author Hyung-Joon Kim */ public class Scanner { //----------------------------------------------------------------- /** * Symbol table 0 stores the identifiers that are to be * recognized as keywords, along with their specific Token.xyz * values. */ private SymbolTable keywords; /** The CompilerIO object that is managing all of our IO. */ private CompilerIO io; /** the current line of the source file, null if we have * read to end of file. */ private String srcLine; /** * Zero-based offset to the next unaccepted character in * the srcLine. Points to the character that will be returned * by getCurrentCharacter. idx is incremented every time * acceptCurrentCharacter is called. */ private int idx; //----------------------------------------------------------------- /** * Construct a new Scanner object. The keywords symbol table is * initialized here. * @param io the CompilerIO object to use in reading and writing files. * @param reserved the SymbolTable manager. Table 0 is assumed to be * the reserved word table and is initialized in this constructor. */ public Scanner(CompilerIO io,SymbolTable reserved) { this.io = io; keywords = reserved; keywords.putSymbol(0,"if",new Symbol("if",Token.KW_IF)); keywords.putSymbol(0,"else",new Symbol("else",Token.KW_ELSE)); keywords.putSymbol(0,"prolog",new Symbol("prolog",Token.KW_PROLOG)); keywords.putSymbol(0,"movie",new Symbol("movie",Token.KW_MOVIE)); keywords.putSymbol(0,"show",new Symbol("show",Token.KW_SHOW)); } /** * Starting with the current character, identify the next Token and * return the appropriate Token object to the caller. This method * calls the private helper methods getCurrentCharacter and * acceptCurrentCharacter repeatedly to inspect and advance past * characters until it decides that it has found something * worthy of being reported as a Token. * @return the appropriate Token object */ public Token nextToken() { Token t = null; char a; char b; while (t == null) { skipWhitespace(); if (srcLine == null) { return new Token(Token.EOF); } a = getCurrentCharacter(); switch (a) { case '!': t = new Token(Token.OP_NOT); acceptCurrentCharacter(); break; case '<': t = new Token(Token.OP_LT); acceptCurrentCharacter(); break; case '>': t = new Token(Token.OP_GT); acceptCurrentCharacter(); break; case '+': t = new Token(Token.OP_ADD); acceptCurrentCharacter(); break; case '-': t = new Token(Token.OP_SUB); acceptCurrentCharacter(); break; case '*': t = new Token(Token.OP_MUL); acceptCurrentCharacter(); break; case '(': t = new Token(Token.LPAREN); acceptCurrentCharacter(); break; case ')': t = new Token(Token.RPAREN); acceptCurrentCharacter(); break; case '{': t = new Token(Token.LCURLY); acceptCurrentCharacter(); break; case '}': t = new Token(Token.RCURLY); acceptCurrentCharacter(); break; case '.': t = new Token(Token.DOT); acceptCurrentCharacter(); break; case ',': t = new Token(Token.COMMA); acceptCurrentCharacter(); break; case ';': t = new Token(Token.SEMICOLON); acceptCurrentCharacter(); break; case ':': t = new Token(Token.COLON); acceptCurrentCharacter(); break; case '=': { // assignment operator or equality test acceptCurrentCharacter(); b = getCurrentCharacter(); if (b == '=') { acceptCurrentCharacter(); t = new Token(Token.OP_EQ); } else { t = new Token(Token.OP_ASSIGN); } break; } case '/': { // division operator or comment acceptCurrentCharacter(); b = getCurrentCharacter(); if (b == '/') { while (srcLine != null && b != '\n') { acceptCurrentCharacter(); b = getCurrentCharacter(); } acceptCurrentCharacter(); } else { t = new Token(Token.OP_DIV); } break; } default: if (Character.isDigit(a)) { // numeric literal acceptCurrentCharacter(); StringBuffer number = new StringBuffer(); boolean gotDot = false; number.append(a); b = getCurrentCharacter(); while (srcLine != null && (Character.isDigit(b) || (b == '.' && !gotDot))) { number.append(b); acceptCurrentCharacter(); gotDot |= (b == '.'); b = getCurrentCharacter(); } try { if (gotDot) { double dValue = Double.parseDouble(number.toString()); t = new Token(Token.REAL,dValue); } else { int iValue = Integer.parseInt(number.toString()); t = new Token(Token.INTEGER,iValue); } } catch (NumberFormatException e) { t = new Token(Token.ID,number.toString()); } break; } else if (Character.isLetter(a)) { // identifier or keyword acceptCurrentCharacter(); StringBuffer ident = new StringBuffer(); ident.append(a); b = getCurrentCharacter(); while (srcLine != null && (Character.isLetter(b) || b == '_' || Character.isDigit(b))) { ident.append(b); acceptCurrentCharacter(); b = getCurrentCharacter(); } String name = ident.toString(); Symbol tt = (Symbol)keywords.getSymbol(0,name); if (tt != null) { t = new Token(tt.getType()); } else { t = new Token(Token.ID,name); } break; } else { // scan error invalidCharacter(a); acceptCurrentCharacter(); break; } } } return t; } /** * Skip over whitespace characters. */ private void skipWhitespace() { char c = getCurrentCharacter(); while (srcLine != null && Character.isWhitespace(c)) { acceptCurrentCharacter(); c = getCurrentCharacter(); } } /** * Report invalid character. * @param c the invalid character */ private void invalidCharacter(char c) { io.emit("Scanner: invalid character : \\u"+Integer.toHexString(c)); } /** * Get the current character from the source file. If we are * at EOF, then srcLine is null and this method returns 0. * The caller should check for srcLine == null before using * the returned char. If we are not at EOF, but we are at * the end of this line, then we return a newline character ('\n'). * If none of the above are true, then we are pointing to a * character in the current srcLine and so we return that char. * increment idx. * idx always points to the current character. If it is equal * to the length of the srcLine, then it is trying to point * to the (virtual) newline. * @return the currently available character. */ private char getCurrentCharacter() { if (srcLine == null) { try { srcLine = io.readSrcLine(); idx = 0; } catch (IOException e) { srcLine = null; idx = 0; } } if (srcLine == null) return '\u0000'; if (idx == srcLine.length()) { return '\n'; } return srcLine.charAt(idx); } /** * Accept the previously returned character, and advance the * index to the next available character. This method must * be called every time the scanner decides that it will be * able to use the current character in whatever the current * lexeme is. This is the only way to advance the index through * the srcLine. */ private void acceptCurrentCharacter() { if (srcLine == null) return; idx++; if (idx > srcLine.length()) { try { srcLine = io.readSrcLine(); idx = 0; } catch (IOException e) { srcLine = null; idx = 0; } } } }