// lex.cpp // Glenn G. Chappell // VERSION 3a // 9 Feb 2009 // // For CS 331 Spring 2009 // Source for class Lex // State-machine-based lexical analyzer class #include "lex.h" // for Lex class definition #include // for assert #include using std::string; #include using std::isspace; using std::isalpha; using std::isdigit; using std::isalnum; // ************************************************************************ // class Lex - Definitions of member functions // ************************************************************************ // next // Returns next lexeme & advances where_ // Pre: None. // Post: // Return value is lexeme beginning at index old value of where_, // or "" if where_ was input_.size(). // where_ is index of start of following lexeme, // or else input_.size(), if none. Lex::Lexeme Lex::next() { if (done()) return Lexeme(NONE, ""); Token type; // Type of current lexeme string s; // For value of current lexeme // The following rather lengthy loop does state-machine-based // lexical analysis to find the lexeme in input_ that begins // at index where_. States (type State) are named according // to the shortest character sequence that allows one to reach // that state. START is the start state, and DONE indicates a // lexeme is ready to be output, and the loop can end. enum State { START, LETTER, DIGIT, PLUS, MINUS, DOT, DIG_DOT, PLUS_DOT, MINUS_DOT, DONE }; State currState = START; while (true) { if (currState == DONE) break; char c = currChar(); switch (currState) { case START: assert(c != -1); appendChar(s); if (isalpha(c) || (c == '_')) currState = LETTER; else if (isdigit(c)) currState = DIGIT; else if (c == '+') currState = PLUS; else if (c == '-') currState = MINUS; else if (c == '.') currState = DOT; else if (c >= ' ' && c <= '~') { type = OPERATOR; currState = DONE; } else { type = ILLEGAL; currState = DONE; } break; case LETTER: if (isalnum(c) || c == '_') appendChar(s); else { type = IDENTIFIER; currState = DONE; } break; case DIGIT: if (isdigit(c)) appendChar(s); else if (c == '.') { appendChar(s); currState = DIG_DOT; } else { type = NUMBER; currState = DONE; } break; case PLUS: if (c == '+') { appendChar(s); type = OPERATOR; currState = DONE; } else if (isdigit(c)) { appendChar(s); currState = DIGIT; } else if (c == '.') { appendChar(s); currState = PLUS_DOT; } else { type = OPERATOR; currState = DONE; } break; case MINUS: if (c == '-') { appendChar(s); type = OPERATOR; currState = DONE; } else if (isdigit(c)) { appendChar(s); currState = DIGIT; } else if (c == '.') { appendChar(s); currState = MINUS_DOT; } else { type = OPERATOR; currState = DONE; } break; case DOT: if (isdigit(c)) { appendChar(s); currState = DIG_DOT; } else { type = OPERATOR; currState = DONE; } break; case DIG_DOT: if (isdigit(c)) appendChar(s); else { type = NUMBER; currState = DONE; } break; case PLUS_DOT: if (isdigit(c)) { appendChar(s); currState = DIG_DOT; } else { // Back up a char & spit out "+" OP --where_; s.resize(s.size() - 1); // Remove last char type = OPERATOR; currState = DONE; } break; case MINUS_DOT: if (isdigit(c)) { appendChar(s); currState = DIG_DOT; } else { // Back up a char & spit out "-" OP --where_; s.resize(s.size() - 1); // Remove last char type = OPERATOR; currState = DONE; } break; } } skip(); return Lexeme(type, s); } // skip // Skip whitespace & comments // Advances where to beginning of next lexeme // or input_.size() if none. // Pre: None. // Post: // where_ is index of start of next lexeme_, // or else input_.size(). void Lex::skip() { while (true) { // Skip whitespace while (where_ < input_.size() && isspace(input_[where_])) ++where_; // If not at slash, done if (where_ == input_.size() || input_[where_] != '/') return; // If not at double slash, done if (where_+1 == input_.size() || input_[where_+1] != '/') return; // Double slash! Found a comment; skip it where_ += 2; while (where_ != input_.size() && input_[where_] != '\n') ++where_; } }