src/lexer.rs: implement a naive and dumb lexer

Signed-off-by: Amneesh Singh <natto@weirdnatto.in>
Date: 2023-04-08 02:10:57 +05:30
parent 6979a02408
commit 0d7a4bdd4e
3 changed files with 393 additions and 0 deletions

src/lexer.rs

@@ -0,0 +1,386 @@
use std::{iter, str};

#[derive(Debug)]
pub enum Literal {
    Int,
    Float,
    Char,
}

#[derive(Debug)]
pub enum Symbol {
    // operators
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    Caret,
    Tilde,
    // bitwise
    Shl,
    Shr,
    And,
    Or,
    // logical
    Not,
    AndAnd,
    OrOr,
    // relational
    Gt,
    Ge,
    Lt,
    Le,
    GtEq,
    LtEq,
    EqEq,
    Ne,
    // assignment
    Eq,
    PlusEq,
    MinusEq,
    StarEq,
    SlashEq,
    PercentEq,
    CaretEq,
    ShlEq,
    ShrEq,
    AndEq,
    OrEq,
    // misc
    Colon,
    Dot,
    Hash,
}

#[derive(Debug)]
pub enum Keyword {
    Let,
    Fn,
    // conditionals
    If,
    Else,
    Elseif,
    // loops
    While,
    Do,
    For,
    // primitives
    Int,
    Float,
    Char,
}

#[derive(Debug)]
pub enum Delimiter {
    BraceOpen,
    BraceClose,
    ParenOpen,
    ParenClose,
}

#[derive(Debug)]
pub enum TokenKind {
    Literal(Literal),
    Symbol(Symbol),
    Keyword(Keyword),
    Delimiter(Delimiter),
    Newline,
    Identifier,
}

#[derive(Debug)]
pub struct Token<'a> {
    pub kind: TokenKind,
    pub value: Option<&'a str>,
    pub line: usize,
}
pub struct Lexer<'a> {
    file: &'a str,
    text: &'a str,
    chars: iter::Peekable<str::Chars<'a>>,
    line: usize,
    // start/end are character offsets into `text` for the token currently being
    // scanned; they are also used as slice indices, so the lexer effectively
    // assumes single-byte (ASCII) input.
    start: usize,
    end: usize,
}
impl<'a> Lexer<'a> {
    pub fn new(file: &'a str, contents: &'a str) -> Lexer<'a> {
        Lexer {
            file,
            text: contents,
            chars: contents.chars().peekable(),
            line: 1,
            start: 0,
            end: 0,
        }
    }

    #[inline]
    fn error(&self) {
        eprintln!("error lexing \"{}:{}:{}\"", self.file, self.line, self.end);
    }

    fn next(&mut self) -> Option<char> {
        self.end += 1;
        self.chars.next()
    }

    fn skip(&mut self, c: char) {
        if self.next() != Some(c) {
            self.error();
            panic!("expected {}", c);
        }
    }
    fn get_numeric(&mut self) -> Token<'a> {
        let mut is_float: bool = false;
        while let Some(c) = self.chars.peek() {
            match c {
                '0'..='9' => {}
                '.' => {
                    if is_float {
                        self.error();
                        panic!("multiple decimals encountered")
                    }
                    is_float = true;
                }
                _ => break,
            }
            self.next();
        }
        Token {
            kind: if is_float {
                TokenKind::Literal(Literal::Float)
            } else {
                TokenKind::Literal(Literal::Int)
            },
            value: Some(&self.text[self.start..self.end]),
            line: self.line,
        }
    }
    fn get_char(&mut self) -> Token<'a> {
        self.skip('\'');
        if matches!(self.next(), Some('\'') | None) {
            self.error();
            panic!("A character literal cannot be empty");
        }
        self.skip('\'');
        Token {
            kind: TokenKind::Literal(Literal::Char),
            value: Some(&self.text[self.start + 1..self.end - 1]),
            line: self.line,
        }
    }
    fn get_delimiter(&mut self) -> Token<'a> {
        macro_rules! token_delimiter {
            ($a:expr) => {
                Token {
                    kind: TokenKind::Delimiter($a),
                    value: None,
                    line: self.line,
                }
            };
        }
        use Delimiter::*;
        match self.next() {
            Some(c) => match c {
                '{' => token_delimiter!(BraceOpen),
                '}' => token_delimiter!(BraceClose),
                '(' => token_delimiter!(ParenOpen),
                ')' => token_delimiter!(ParenClose),
                _ => {
                    self.error();
                    panic!("expected delimiter");
                }
            },
            None => {
                self.error();
                panic!("expected delimiter");
            }
        }
    }
    fn get_symbol(&mut self) -> Token<'a> {
        // handle ~, ., :
        macro_rules! token_symbol {
            ($a:expr) => {
                Token {
                    kind: TokenKind::Symbol($a),
                    value: None,
                    line: self.line,
                }
            };
        }
        // handle +, +=, -, -=, *, *=, /, /=, %, %=, ^, ^=, !, !=
        macro_rules! token_symbol_eq {
            ($a:expr, $b:expr) => {
                if self.chars.peek() == Some(&'=') {
                    self.next();
                    token_symbol!($b)
                } else {
                    token_symbol!($a)
                }
            };
        }
        // handle &, |, ||, &&, &=, |=
        macro_rules! token_symbol_logical {
            ($a:expr, $b:expr, $c:expr, $d:expr) => {
                match self.chars.peek() {
                    Some('=') => {
                        self.next();
                        token_symbol!($c)
                    }
                    Some($d) => {
                        self.next();
                        token_symbol!($b)
                    }
                    _ => token_symbol!($a),
                }
            };
        }
        // handle <, <=, >, >=, <<, >>, <<=, >>=
        macro_rules! token_symbol_compare {
            ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
                match self.chars.peek() {
                    Some('=') => {
                        self.next();
                        token_symbol!($d)
                    }
                    Some($e) => {
                        self.next();
                        token_symbol_eq!($b, $c)
                    }
                    _ => token_symbol!($a),
                }
            };
        }
        use Symbol::*;
        match self.next() {
            Some(c) => match c {
                '+' => token_symbol_eq!(Plus, PlusEq),
                '-' => token_symbol_eq!(Minus, MinusEq),
                '*' => token_symbol_eq!(Star, StarEq),
                '/' => token_symbol_eq!(Slash, SlashEq),
                '%' => token_symbol_eq!(Percent, PercentEq),
                '^' => token_symbol_eq!(Caret, CaretEq),
                '!' => token_symbol_eq!(Not, Ne),
                '=' => token_symbol_eq!(Eq, EqEq),
                '&' => token_symbol_logical!(And, AndAnd, AndEq, '&'),
                '|' => token_symbol_logical!(Or, OrOr, OrEq, '|'),
                '<' => token_symbol_compare!(Lt, Shl, ShlEq, LtEq, '<'),
                '>' => token_symbol_compare!(Gt, Shr, ShrEq, GtEq, '>'),
                '~' => token_symbol!(Tilde),
                ':' => token_symbol!(Colon),
                '.' => token_symbol!(Dot),
                '#' => token_symbol!(Hash),
                _ => {
                    self.error();
                    panic!("expected symbol");
                }
            },
            None => {
                self.error();
                panic!("expected symbol");
            }
        }
    }
    fn get_alphanumeric(&mut self) -> Token<'a> {
        while let Some(c) = self.chars.peek() {
            match c {
                'a'..='z' | 'A'..='Z' | '0'..='9' => {}
                _ => break,
            }
            self.next();
        }
        macro_rules! token_keyword {
            ($a:expr) => {
                Token {
                    kind: TokenKind::Keyword($a),
                    value: None,
                    line: self.line,
                }
            };
        }
        use Keyword::*;
        match &self.text[self.start..self.end] {
            "let" => token_keyword!(Let),
            "fn" => token_keyword!(Fn),
            "if" => token_keyword!(If),
            "else" => token_keyword!(Else),
            "elseif" => token_keyword!(Elseif),
            "while" => token_keyword!(While),
            "do" => token_keyword!(Do),
            "for" => token_keyword!(For),
            "int" => token_keyword!(Int),
            "float" => token_keyword!(Float),
            "char" => token_keyword!(Char),
            _ => Token {
                kind: TokenKind::Identifier,
                value: Some(&self.text[self.start..self.end]),
                line: self.line,
            },
        }
    }
    pub fn lex(&mut self) -> Vec<Token<'a>> {
        let mut tokens: Vec<Token> = Vec::new();
        while let Some(c) = self.chars.peek() {
            match c {
                ' ' | '\r' | '\t' => {
                    self.next();
                }
                '\n' => {
                    tokens.push(Token {
                        kind: TokenKind::Newline,
                        value: None,
                        line: self.line,
                    });
                    self.next();
                    self.line += 1;
                }
                '0'..='9' => tokens.push(self.get_numeric()),
                '\'' => tokens.push(self.get_char()),
                '{' | '}' | '(' | ')' => tokens.push(self.get_delimiter()),
                '+' | '-' | '*' | '/' | '%' | '^' | '~' | '&' | '|' | '!' | '<' | '>' | '='
                | ':' | '.' | '#' => tokens.push(self.get_symbol()),
                'a'..='z' | 'A'..='Z' => tokens.push(self.get_alphanumeric()),
                _ => {
                    self.error();
                    panic!("unknown character encountered");
                }
            }
            self.start = self.end;
        }
        tokens
    }
}
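
For a sense of what this lexer produces, here is a minimal sketch of driving it by hand. The source string and the "example.tri" file name are made up for illustration; the call pattern mirrors what src/main.rs does below.

use tricc::lexer::Lexer;

fn main() {
    // illustrative only: any small source string works
    let source = "let x = 4 + 2.5\n";
    let mut lexer = Lexer::new("example.tri", source);
    for token in lexer.lex() {
        // prints, roughly: Keyword(Let), Identifier "x", Symbol(Eq),
        // Literal(Int) "4", Symbol(Plus), Literal(Float) "2.5", Newline
        println!("{:?}", token);
    }
}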

src/lib.rs

@@ -1 +1,2 @@
pub mod args;
pub mod lexer;

src/main.rs

@@ -2,6 +2,7 @@ use std::fs;
use std::panic;
use tricc::args::Args;
use tricc::lexer::Lexer;
fn main() {
    panic::set_hook(Box::new(|panic_info| {
@@ -25,4 +26,9 @@ fn main() {
    let file = args.get_file();
    let contents = fs::read_to_string(&file).expect("Couldn't read the file");
    let mut lexer = Lexer::new(&file, contents.as_str());
    let tokens = lexer.lex();
    println!("{:?}", tokens);
}
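
As a hedged example of the final println!: if the file passed to the program contained just "let a" (no trailing newline), the derived Debug output would look along these lines:

[Token { kind: Keyword(Let), value: None, line: 1 }, Token { kind: Identifier, value: Some("a"), line: 1 }]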