use internment::ArcIntern; use logos::{Lexer, Logos}; use std::fmt; use std::num::ParseIntError; use thiserror::Error; /// A single token of the input stream; used to help the parsing go down /// more easily. /// /// The key way to generate this structure is via the [`Logos`] trait. /// See the [`logos`] documentation for more information; we use the /// [`Token::lexer`] function internally. /// /// The first step in the compilation process is turning the raw string /// data (in UTF-8, which is its own joy) in to a sequence of more sensible /// tokens. Here, for example, we turn "x=5" into three tokens: a /// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and /// then a [`Token::Number`] for the "5". Later on, we'll worry about /// making sense of those three tokens. /// /// For now, our list of tokens is relatively straightforward. We'll /// need/want to extend these later. /// /// The [`std::fmt::Display`] implementation for [`Token`] should /// round-trip; if you lex a string generated with the [`std::fmt::Display`] /// trait, you should get back the exact same token. #[derive(Logos, Clone, Debug, PartialEq, Eq)] pub enum Token { // Our first set of tokens are simple characters that we're // going to use to structure NGR programs. #[token("=")] Equals, #[token(";")] Semi, #[token(",")] Comma, #[token("(")] LeftParen, #[token(")")] RightParen, #[token("<")] LessThan, #[token(">")] GreaterThan, #[token("{")] OpenBrace, #[token("}")] CloseBrace, #[token("lambda")] #[token("function")] Function, // Next we take of any reserved words; I always like to put // these before we start recognizing more complicated regular // expressions. I don't think it matters, but it works for me. #[token("print")] Print, // Next are the operators for NGR. We only have 4, now, but // we might extend these later, or even make them user-definable! #[regex(r"[+\-*/]", |v| v.slice().chars().next())] Operator(char), /// Numbers capture both the value we read from the input, /// converted to an `i64`, as well as the base the user used /// to write the number and/or the type the user specified, /// if they did either. #[regex(r"0b[01]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(2), v))] #[regex(r"0o[0-7]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(8), v))] #[regex(r"0d[0-9]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(10), v))] #[regex(r"0x[0-9a-fA-F]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(16), v))] #[regex(r"[0-9]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(None, v))] Number((Option, Option, u64)), // Variables; this is a very standard, simple set of characters // for variables, but feel free to experiment with more complicated // things. I chose to force variables to start with a lower case // letter, too. #[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))] Variable(ArcIntern), // the next token will be an error token #[error] // we're actually just going to skip whitespace, though #[regex(r"[ \t\r\n\f]+", logos::skip)] // this is an extremely simple version of comments, just line // comments. More complicated /* */ comments can be harder to // implement, and didn't seem worth it at the time. #[regex(r"//.*", logos::skip)] /// This token represents that some core error happened in lexing; /// possibly that something didn't match anything at all. Error, } impl fmt::Display for Token { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Token::Equals => write!(f, "'='"), Token::Semi => write!(f, "';'"), Token::Comma => write!(f, "','"), Token::LeftParen => write!(f, "'('"), Token::RightParen => write!(f, "')'"), Token::LessThan => write!(f, "<"), Token::GreaterThan => write!(f, ">"), Token::OpenBrace => write!(f, "{{"), Token::CloseBrace => write!(f, "}}"), Token::Function => write!(f, "function"), Token::Print => write!(f, "'print'"), Token::Operator(c) => write!(f, "'{}'", c), Token::Number((None, otype, v)) => write!(f, "'{}{}'", v, display_optional_type(otype)), Token::Number((Some(2), otype, v)) => { write!(f, "'0b{:b}{}'", v, display_optional_type(otype)) } Token::Number((Some(8), otype, v)) => { write!(f, "'0o{:o}{}'", v, display_optional_type(otype)) } Token::Number((Some(10), otype, v)) => { write!(f, "'{}{}'", v, display_optional_type(otype)) } Token::Number((Some(16), otype, v)) => { write!(f, "'0x{:x}{}'", v, display_optional_type(otype)) } Token::Number((Some(b), opt_type, v)) => { write!( f, "Invalidly-based-number", b, v, opt_type ) } Token::Variable(s) => write!(f, "'{}'", s), Token::Error => write!(f, ""), } } } /// A sudden and unexpected error in the lexer. #[derive(Debug, Error, PartialEq, Eq)] pub enum LexerError { /// The `usize` here is the offset that we ran into the problem, given /// from the start of the file. #[error("Failed lexing at {0}")] LexFailure(usize), } #[cfg(test)] impl Token { /// Create a variable token with the given name. Very handy for /// testing. pub(crate) fn var(s: &str) -> Token { Token::Variable(ArcIntern::new(s.to_string())) } } #[repr(i64)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum ConstantType { U8 = 10, U16 = 11, U32 = 12, U64 = 13, I8 = 20, I16 = 21, I32 = 22, I64 = 23, } impl From for cranelift_codegen::ir::Type { fn from(value: ConstantType) -> Self { match value { ConstantType::I8 | ConstantType::U8 => cranelift_codegen::ir::types::I8, ConstantType::I16 | ConstantType::U16 => cranelift_codegen::ir::types::I16, ConstantType::I32 | ConstantType::U32 => cranelift_codegen::ir::types::I32, ConstantType::I64 | ConstantType::U64 => cranelift_codegen::ir::types::I64, } } } impl ConstantType { /// Returns true if the given type is (a) numeric and (b) signed; pub fn is_signed(&self) -> bool { matches!( self, ConstantType::I8 | ConstantType::I16 | ConstantType::I32 | ConstantType::I64 ) } /// Return the set of types that can be safely casted into this type. pub fn safe_casts_to(self) -> Vec { match self { ConstantType::I8 => vec![ConstantType::I8], ConstantType::I16 => vec![ConstantType::I16, ConstantType::I8, ConstantType::U8], ConstantType::I32 => vec![ ConstantType::I32, ConstantType::I16, ConstantType::I8, ConstantType::U16, ConstantType::U8, ], ConstantType::I64 => vec![ ConstantType::I64, ConstantType::I32, ConstantType::I16, ConstantType::I8, ConstantType::U32, ConstantType::U16, ConstantType::U8, ], ConstantType::U8 => vec![ConstantType::U8], ConstantType::U16 => vec![ConstantType::U16, ConstantType::U8], ConstantType::U32 => vec![ConstantType::U32, ConstantType::U16, ConstantType::U8], ConstantType::U64 => vec![ ConstantType::U64, ConstantType::U32, ConstantType::U16, ConstantType::U8, ], } } /// Return the set of all currently-available constant types pub fn all_types() -> Vec { vec![ ConstantType::U8, ConstantType::U16, ConstantType::U32, ConstantType::U64, ConstantType::I8, ConstantType::I16, ConstantType::I32, ConstantType::I64, ] } /// Return the name of the given type, as a string pub fn name(&self) -> String { match self { ConstantType::I8 => "i8".to_string(), ConstantType::I16 => "i16".to_string(), ConstantType::I32 => "i32".to_string(), ConstantType::I64 => "i64".to_string(), ConstantType::U8 => "u8".to_string(), ConstantType::U16 => "u16".to_string(), ConstantType::U32 => "u32".to_string(), ConstantType::U64 => "u64".to_string(), } } } #[derive(Debug, Error, PartialEq)] pub enum InvalidConstantType { #[error("Unrecognized constant {0} for constant type")] Value(i64), } impl TryFrom for ConstantType { type Error = InvalidConstantType; fn try_from(value: i64) -> Result { match value { 10 => Ok(ConstantType::U8), 11 => Ok(ConstantType::U16), 12 => Ok(ConstantType::U32), 13 => Ok(ConstantType::U64), 20 => Ok(ConstantType::I8), 21 => Ok(ConstantType::I16), 22 => Ok(ConstantType::I32), 23 => Ok(ConstantType::I64), _ => Err(InvalidConstantType::Value(value)), } } } /// Parse a number in the given base, return a pair of the base and the /// parsed number. This is just a helper used for all of the number /// regular expression cases, which kicks off to the obvious Rust /// standard library function. fn parse_number( base: Option, value: &Lexer, ) -> Result<(Option, Option, u64), ParseIntError> { let (radix, strval) = match base { None => (10, value.slice()), Some(radix) => (radix, &value.slice()[2..]), }; let (declared_type, strval) = if let Some(strval) = strval.strip_suffix("u8") { (Some(ConstantType::U8), strval) } else if let Some(strval) = strval.strip_suffix("u16") { (Some(ConstantType::U16), strval) } else if let Some(strval) = strval.strip_suffix("u32") { (Some(ConstantType::U32), strval) } else if let Some(strval) = strval.strip_suffix("u64") { (Some(ConstantType::U64), strval) } else if let Some(strval) = strval.strip_suffix("i8") { (Some(ConstantType::I8), strval) } else if let Some(strval) = strval.strip_suffix("i16") { (Some(ConstantType::I16), strval) } else if let Some(strval) = strval.strip_suffix("i32") { (Some(ConstantType::I32), strval) } else if let Some(strval) = strval.strip_suffix("i64") { (Some(ConstantType::I64), strval) } else { (None, strval) }; let intval = u64::from_str_radix(strval, radix as u32)?; Ok((base, declared_type, intval)) } fn display_optional_type(otype: &Option) -> &'static str { match otype { None => "", Some(ConstantType::I8) => "i8", Some(ConstantType::I16) => "i16", Some(ConstantType::I32) => "i32", Some(ConstantType::I64) => "i64", Some(ConstantType::U8) => "u8", Some(ConstantType::U16) => "u16", Some(ConstantType::U32) => "u32", Some(ConstantType::U64) => "u64", } } #[test] fn lex_numbers() { let mut lex0 = Token::lexer("12 0b1100 0o14 0d12 0xc 12u8 0xci64// 9"); assert_eq!(lex0.next(), Some(Token::Number((None, None, 12)))); assert_eq!(lex0.next(), Some(Token::Number((Some(2), None, 12)))); assert_eq!(lex0.next(), Some(Token::Number((Some(8), None, 12)))); assert_eq!(lex0.next(), Some(Token::Number((Some(10), None, 12)))); assert_eq!(lex0.next(), Some(Token::Number((Some(16), None, 12)))); assert_eq!( lex0.next(), Some(Token::Number((None, Some(ConstantType::U8), 12))) ); assert_eq!( lex0.next(), Some(Token::Number((Some(16), Some(ConstantType::I64), 12))) ); assert_eq!(lex0.next(), None); } #[test] fn lex_symbols() { let mut lex0 = Token::lexer("x + \t y * \n z // rest"); assert_eq!(lex0.next(), Some(Token::var("x"))); assert_eq!(lex0.next(), Some(Token::Operator('+'))); assert_eq!(lex0.next(), Some(Token::var("y"))); assert_eq!(lex0.next(), Some(Token::Operator('*'))); assert_eq!(lex0.next(), Some(Token::var("z"))); assert_eq!(lex0.next(), None); } #[test] fn lexer_spans() { let mut lex0 = Token::lexer("y = x + 1//foo").spanned(); assert_eq!(lex0.next(), Some((Token::var("y"), 0..1))); assert_eq!(lex0.next(), Some((Token::Equals, 2..3))); assert_eq!(lex0.next(), Some((Token::var("x"), 4..5))); assert_eq!(lex0.next(), Some((Token::Operator('+'), 6..7))); assert_eq!(lex0.next(), Some((Token::Number((None, None, 1)), 8..9))); assert_eq!(lex0.next(), None); } #[test] fn further_spans() { let mut lex0 = Token::lexer("x = 2i64 + 2i64;\ny = -x;\nprint y;").spanned(); assert_eq!(lex0.next(), Some((Token::var("x"), 0..1))); assert_eq!(lex0.next(), Some((Token::Equals, 2..3))); assert_eq!( lex0.next(), Some((Token::Number((None, Some(ConstantType::I64), 2)), 4..8)) ); assert_eq!(lex0.next(), Some((Token::Operator('+'), 9..10))); assert_eq!( lex0.next(), Some((Token::Number((None, Some(ConstantType::I64), 2)), 11..15)) ); assert_eq!(lex0.next(), Some((Token::Semi, 15..16))); assert_eq!(lex0.next(), Some((Token::var("y"), 17..18))); assert_eq!(lex0.next(), Some((Token::Equals, 19..20))); assert_eq!(lex0.next(), Some((Token::Operator('-'), 21..22))); assert_eq!(lex0.next(), Some((Token::var("x"), 22..23))); assert_eq!(lex0.next(), Some((Token::Semi, 23..24))); assert_eq!(lex0.next(), Some((Token::Print, 25..30))); assert_eq!(lex0.next(), Some((Token::var("y"), 31..32))); assert_eq!(lex0.next(), Some((Token::Semi, 32..33))); }