use crate::syntax::IntegerWithBase;
use crate::syntax::error::LexerError;
use proptest_derive::Arbitrary;
use std::fmt;
use std::ops::Range;
use std::str::CharIndices;

/// A token together with the byte range of the source text it was read from.
#[derive(Clone)]
pub struct LocatedToken {
    pub token: Token,
    pub span: Range<usize>,
}

/// A single token of the input stream, so that the parser can work over
/// something more concrete than raw bytes.
///
/// The [`std::fmt::Display`] implementation is designed to round-trip through
/// the lexer; anyone needing a more regular or descriptive rendering should
/// consider the [`std::fmt::Debug`] implementation instead.
#[derive(Clone, Debug, PartialEq, Eq, Arbitrary)]
pub enum Token {
    OpenParen,
    CloseParen,
    OpenSquare,
    CloseSquare,
    OpenBrace,
    CloseBrace,
    Semi,
    Colon,
    DoubleColon,
    Comma,
    BackTick,
    Arrow,
    Lambda(bool),
    TypeName(#[proptest(regex = r"[A-Z][a-zA-Z0-9_]*")] String),
    ValueName(#[proptest(regex = r"[a-z_][a-zA-Z0-9_]*")] String),
    OperatorName(
        #[proptest(
            regex = r"[\~\!\@\#\$\%\^\&\*\+\-\=\.<>\?\|][\~\!\@\#\$\%\^\&\*\+\-\=\.<>\?\|_]*",
            filter = "|x| x != \"->\""
        )]
        String,
    ),
    PrimitiveTypeName(#[proptest(regex = r"[A-Z][a-zA-Z0-9_]*")] String),
    PrimitiveValueName(#[proptest(regex = r"[a-z_][a-zA-Z0-9_]*")] String),
    Integer(IntegerWithBase),
    Character(char),
    String(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::OpenParen => write!(f, "("),
            Token::CloseParen => write!(f, ")"),
            Token::OpenSquare => write!(f, "["),
            Token::CloseSquare => write!(f, "]"),
            Token::OpenBrace => write!(f, "{{"),
            Token::CloseBrace => write!(f, "}}"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::DoubleColon => write!(f, "::"),
            Token::Comma => write!(f, ","),
            Token::BackTick => write!(f, "`"),
            Token::Arrow => write!(f, "->"),
            Token::Lambda(false) => write!(f, "\\"),
            Token::Lambda(true) => write!(f, "λ"),
            Token::TypeName(str) => write!(f, "{str}"),
            Token::ValueName(str) => write!(f, "{str}"),
            Token::OperatorName(str) => write!(f, "{str}"),
            Token::PrimitiveTypeName(str) => write!(f, "prim%{str}"),
            Token::PrimitiveValueName(str) => write!(f, "prim%{str}"),
            Token::Integer(IntegerWithBase { base, value }) => match base {
                None => write!(f, "{value}"),
                Some(2) => write!(f, "0b{value:b}"),
                Some(8) => write!(f, "0o{value:o}"),
                Some(10) => write!(f, "0d{value}"),
                Some(16) => write!(f, "0x{value:x}"),
                Some(base) => write!(f, ""),
            },
            Token::Character(c) => write!(f, "{c:?}"),
            Token::String(s) => write!(f, "{s:?}"),
        }
    }
}

/// A fallible iterator over [`LocatedToken`]s: once it hits an error it keeps
/// returning that error, and once the input is exhausted it returns `None`.
#[allow(private_interfaces)]
pub enum Lexer<'a> {
    Working(LexerState<'a>),
    Errored(LexerError),
    Done,
}

struct LexerState<'a> {
    stream: CharIndices<'a>,
    /// One character of push-back, used when a token is only known to have
    /// ended because the following character did not belong to it.
    buffer: Option<(usize, char)>,
}

impl<'a> From<&'a str> for Lexer<'a> {
    fn from(value: &'a str) -> Self {
        Lexer::new(value)
    }
}

impl<'a> Lexer<'a> {
    pub fn new(stream: &'a str) -> Self {
        Lexer::Working(LexerState {
            stream: stream.char_indices(),
            buffer: None,
        })
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Result<LocatedToken, LexerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            Lexer::Done => None,
            Lexer::Errored(e) => Some(Err(e.clone())),
            Lexer::Working(state) => match state.next_token() {
                Err(e) => {
                    println!("ERROR: {e}");
                    *self = Lexer::Errored(e.clone());
                    Some(Err(e))
                }
                Ok(None) => {
                    *self = Lexer::Done;
                    None
                }
                Ok(Some(ltoken)) => Some(Ok(ltoken)),
            },
        }
    }
}

impl<'a> LexerState<'a> {
    /// Take the stashed character if there is one, otherwise pull the next
    /// character from the stream.
    fn next_char(&mut self) -> Option<(usize, char)> {
        self.buffer.take().or_else(|| self.stream.next())
    }

    /// Push a character back so the next call to [`Self::next_char`] returns it.
    fn stash_char(&mut self, idx: usize, c: char) {
        assert!(self.buffer.is_none());
        self.buffer = Some((idx, c));
    }

    /// Skip whitespace and produce the next token, or `Ok(None)` at end of input.
    fn next_token(&mut self) -> Result<Option<LocatedToken>, LexerError> {
        while let Some((token_start_offset, char)) = self.next_char() {
            if char.is_whitespace() {
                continue;
            }
            let simple_response = |token| {
                Ok(Some(LocatedToken {
                    token,
                    span: token_start_offset..self.stream.offset(),
                }))
            };
            match char {
                '(' => return simple_response(Token::OpenParen),
                ')' => return simple_response(Token::CloseParen),
                '[' => return simple_response(Token::OpenSquare),
                ']' => return simple_response(Token::CloseSquare),
                '{' => return simple_response(Token::OpenBrace),
                '}' => return simple_response(Token::CloseBrace),
                ';' => return simple_response(Token::Semi),
                ',' => return simple_response(Token::Comma),
                '`' => return simple_response(Token::BackTick),
                '\\' => return simple_response(Token::Lambda(false)),
                'λ' => return simple_response(Token::Lambda(true)),
                '0' => return self.starts_with_zero(token_start_offset),
                '\'' => return self.starts_with_single(token_start_offset),
                '\"' => return self.starts_with_double(token_start_offset),
                '-' => return self.starts_with_dash(token_start_offset),
                ':' => return self.starts_with_colon(token_start_offset),
                _ => {}
            }
            if let Some(value) = char.to_digit(10) {
                return self.parse_integer(token_start_offset, 10, None, value as u64);
            }
            if char.is_uppercase() {
                return self.parse_identifier(
                    token_start_offset,
                    char.into(),
                    |c| c.is_alphanumeric() || c == '_',
                    Token::TypeName,
                );
            }
            if char.is_alphabetic() || char == '_' {
                return self.parse_identifier(
                    token_start_offset,
                    char.into(),
                    |c| c.is_alphanumeric() || c == '_',
                    Token::ValueName,
                );
            }
            if !char.is_alphanumeric() && !char.is_whitespace() && !char.is_control() {
                return self.parse_identifier(
                    token_start_offset,
                    char.into(),
                    |c| !c.is_alphanumeric() && !c.is_whitespace() && !c.is_control(),
                    Token::OperatorName,
                );
            }
        }
        Ok(None)
    }

    /// Handle a token that starts with `0`, which may carry an explicit base
    /// prefix (`0b`, `0o`, `0d`, `0x`) or simply be a decimal literal.
    fn starts_with_zero(
        &mut self,
        token_start_offset: usize,
    ) -> Result<Option<LocatedToken>, LexerError> {
        match self.next_char() {
            None => {
                let token = Token::Integer(IntegerWithBase {
                    base: None,
                    value: 0,
                });
                Ok(Some(LocatedToken {
                    token,
                    span: token_start_offset..self.stream.offset(),
                }))
            }
            Some((_, 'b')) => self.parse_integer(token_start_offset, 2, Some(2), 0),
            Some((_, 'o')) => self.parse_integer(token_start_offset, 8, Some(8), 0),
            Some((_, 'd')) => self.parse_integer(token_start_offset, 10, Some(10), 0),
            Some((_, 'x')) => self.parse_integer(token_start_offset, 16, Some(16), 0),
            Some((offset, c)) => {
                if let Some(value) = c.to_digit(10) {
                    self.parse_integer(token_start_offset, 10, None, value as u64)
                } else {
                    self.stash_char(offset, c);
                    let token = Token::Integer(IntegerWithBase {
                        base: None,
                        value: 0,
                    });
                    Ok(Some(LocatedToken {
                        token,
                        span: token_start_offset..offset,
                    }))
                }
            }
        }
    }

    /// Consume digits in `base`, starting from the already-consumed `value`.
    /// `provided_base` records whether the literal carried an explicit base
    /// prefix, so the token can round-trip through `Display`.
    fn parse_integer(
        &mut self,
        token_start_offset: usize,
        base: u32,
        provided_base: Option<u32>,
        mut value: u64,
    ) -> Result<Option<LocatedToken>, LexerError> {
        let mut end_offset = self.stream.offset();
        while let Some((offset, c)) = self.next_char() {
            end_offset = offset;
            if let Some(digit) = c.to_digit(base) {
                value = (value * (base as u64)) + (digit as u64);
            } else {
                self.stash_char(offset, c);
                break;
            }
        }
        let token = Token::Integer(IntegerWithBase {
            base: provided_base,
            value,
        });
        Ok(Some(LocatedToken {
            token,
            span: token_start_offset..end_offset,
        }))
    }

    /// Consume an identifier whose leading characters are already in
    /// `identifier`, switching to a primitive name if the identifier turns out
    /// to be the `prim%` prefix.
    fn parse_identifier(
        &mut self,
        token_start_offset: usize,
        mut identifier: String,
        mut allowed_character: fn(char) -> bool,
        mut builder: fn(String) -> Token,
    ) -> Result<Option<LocatedToken>, LexerError> {
        let mut end_offset = self.stream.offset();
        while let Some((offset, c)) = self.next_char() {
            end_offset = offset;
            if allowed_character(c) {
                identifier.push(c);
            } else if identifier == "prim" && c == '%' {
                // `prim%` introduces a primitive name: restart the identifier
                // and decide between a type- and value-level primitive based on
                // the first character after the `%`.
                identifier = String::new();
                allowed_character = |c| c.is_alphanumeric() || c == '_';
                match self.next_char() {
                    None => {
                        return Err(LexerError::IllegalPrimitive {
                            span: token_start_offset..end_offset,
                        });
                    }
                    Some((_, char)) => {
                        if char.is_uppercase() {
                            identifier.push(char);
                            builder = Token::PrimitiveTypeName;
                        } else if char.is_lowercase() || char == '_' {
                            identifier.push(char);
                            builder = Token::PrimitiveValueName;
                        } else {
                            return Err(LexerError::IllegalPrimitiveCharacter {
                                span: token_start_offset..end_offset,
                                char,
                            });
                        }
                    }
                }
            } else {
                self.stash_char(offset, c);
                break;
            }
        }
        Ok(Some(LocatedToken {
            token: builder(identifier),
            span: token_start_offset..end_offset,
        }))
    }

    /// Handle a token that starts with `'`: a character literal, possibly
    /// containing an escape sequence.
    fn starts_with_single(
        &mut self,
        token_start_offset: usize,
    ) -> Result<Option<LocatedToken>, LexerError> {
        let Some((_, mut char)) = self.next_char() else {
            return Err(LexerError::UnfinishedCharacter {
                span: token_start_offset..self.stream.offset(),
            });
        };
        if char == '\\' {
            char = self.get_escaped_character(token_start_offset)?;
        }
        let Some((idx, finish_char)) = self.next_char() else {
            return Err(LexerError::UnfinishedCharacter {
                span: token_start_offset..self.stream.offset(),
            });
        };
        if finish_char != '\'' {
            return Err(LexerError::OverlongCharacter {
                char,
                span: token_start_offset..self.stream.offset(),
            });
        }
        Ok(Some(LocatedToken {
            token: Token::Character(char),
            span: token_start_offset..idx,
        }))
    }

    /// Resolve the character following a `\` inside a character or string
    /// literal.
    fn get_escaped_character(&mut self, token_start_offset: usize) -> Result<char, LexerError> {
        let Some((idx, escaped_char)) = self.next_char() else {
            return Err(LexerError::UnfinishedCharacter {
                span: token_start_offset..self.stream.offset(),
            });
        };
        match escaped_char {
            '0' => Ok('\0'),
            'a' => Ok('\u{0007}'),
            'b' => Ok('\u{0008}'),
            'f' => Ok('\u{000C}'),
            'n' => Ok('\n'),
            'r' => Ok('\r'),
            't' => Ok('\t'),
            'u' => self.get_unicode_sequence(idx),
            'v' => Ok('\u{000B}'),
            '\'' => Ok('\''),
            '"' => Ok('"'),
            '\\' => Ok('\\'),
            _ => Err(LexerError::UnknownEscapeCharacter {
                escaped_char,
                span: idx..self.stream.offset(),
            }),
        }
    }

    /// Parse a `\u{…}` escape into the character it names.
    fn get_unicode_sequence(&mut self, token_start_offset: usize) -> Result<char, LexerError> {
        let Some((_, char)) = self.next_char() else {
            return Err(LexerError::InvalidUnicode {
                span: token_start_offset..self.stream.offset(),
            });
        };
        if char != '{' {
            return Err(LexerError::InvalidUnicode {
                span: token_start_offset..self.stream.offset(),
            });
        }
        let mut value: u32 = 0;
        while let Some((idx, char)) = self.next_char() {
            if let Some(digit) = char.to_digit(16) {
                // `checked_shl(4)` never fails for an in-range shift, so use
                // checked arithmetic that actually detects the code point
                // overflowing a `u32`.
                if let Some(next) = value.checked_mul(16).and_then(|v| v.checked_add(digit)) {
                    value = next;
                    continue;
                } else {
                    return Err(LexerError::InvalidUnicode {
                        span: token_start_offset..idx,
                    });
                }
            }
            if char == '}' {
                if let Some(char) = char::from_u32(value) {
                    return Ok(char);
                } else {
                    return Err(LexerError::InvalidUnicode {
                        span: token_start_offset..idx,
                    });
                }
            }
            return Err(LexerError::InvalidUnicode {
                span: token_start_offset..self.stream.offset(),
            });
        }
        Err(LexerError::InvalidUnicode {
            span: token_start_offset..self.stream.offset(),
        })
    }

    /// Handle a token that starts with `"`: a string literal, possibly
    /// containing escape sequences.
    fn starts_with_double(
        &mut self,
        token_start_offset: usize,
    ) -> Result<Option<LocatedToken>, LexerError> {
        let mut result = String::new();
        while let Some((idx, char)) = self.next_char() {
            match char {
                '"' => {
                    return Ok(Some(LocatedToken {
                        token: Token::String(result),
                        span: token_start_offset..idx,
                    }));
                }
                '\\' => result.push(self.get_escaped_character(idx)?),
                _ => result.push(char),
            }
        }
        Err(LexerError::UnfinishedString {
            span: token_start_offset..self.stream.offset(),
        })
    }

    /// Handle a token that starts with `-`: either the `->` arrow or an
    /// operator name.
    fn starts_with_dash(
        &mut self,
        token_start_offset: usize,
    ) -> Result<Option<LocatedToken>, LexerError> {
        match self.next_char() {
            None => Ok(Some(LocatedToken {
                token: Token::OperatorName("-".into()),
                span: token_start_offset..token_start_offset + 1,
            })),
            Some((end, '>')) => {
                let Some((pbloc, peekaboo)) = self.next_char() else {
                    return Ok(Some(LocatedToken {
                        token: Token::Arrow,
                        span: token_start_offset..end,
                    }));
                };
                // `->` is only the arrow token when it is not immediately
                // followed by another operator character.
                let is_operator = !peekaboo.is_alphanumeric()
                    && !peekaboo.is_whitespace()
                    && !peekaboo.is_control();
                if is_operator {
                    self.parse_identifier(
                        token_start_offset,
                        format!("->{peekaboo}"),
                        |c| !c.is_alphanumeric() && !c.is_whitespace() && !c.is_control(),
                        Token::OperatorName,
                    )
                } else {
                    self.stash_char(pbloc, peekaboo);
                    Ok(Some(LocatedToken {
                        token: Token::Arrow,
                        span: token_start_offset..end,
                    }))
                }
            }
            Some((_, c)) if !c.is_alphanumeric() && !c.is_whitespace() && !c.is_control() => self
                .parse_identifier(
                    token_start_offset,
                    format!("-{c}"),
                    |c| !c.is_alphanumeric() && !c.is_whitespace() && !c.is_control(),
                    Token::OperatorName,
                ),
            Some((idx, c)) => {
                self.stash_char(idx, c);
                Ok(Some(LocatedToken {
                    token: Token::OperatorName("-".into()),
                    span: token_start_offset..idx,
                }))
            }
        }
    }

    /// Handle a token that starts with `:`: either `:` or `::`.
    fn starts_with_colon(
        &mut self,
        token_start_offset: usize,
    ) -> Result<Option<LocatedToken>, LexerError> {
        match self.next_char() {
            None => Ok(Some(LocatedToken {
                token: Token::Colon,
                span: token_start_offset..token_start_offset + 1,
            })),
            Some((pos, ':')) => Ok(Some(LocatedToken {
                token: Token::DoubleColon,
                span: token_start_offset..pos,
            })),
            Some((pos, char)) => {
                self.stash_char(pos, char);
                Ok(Some(LocatedToken {
                    token: Token::Colon,
                    span: token_start_offset..token_start_offset + 1,
                }))
            }
        }
    }
}

proptest::proptest! {
    #[test]
    fn token_string_token(token: Token) {
        println!("Starting from {token:?}");
        let string = format!("{token}");
        let mut tokens = Lexer::from(string.as_str());
        let initial_token = tokens.next()
            .expect("Can get a token without an error.")
            .expect("Can get a valid token.")
            .token;
        proptest::prop_assert_eq!(token, initial_token);
        proptest::prop_assert!(tokens.next().is_none());
    }
}

#[cfg(test)]
fn parsed_single_token(s: &str) -> Token {
    let mut tokens = Lexer::from(s);
    let result = tokens
        .next()
        .unwrap_or_else(|| panic!("Can get at least one token from {s:?}"))
        .expect("Can get a valid token.")
        .token;
    assert!(
        tokens.next().is_none(),
        "Should only get one token from {s:?}"
    );
    result
}

#[test]
fn numbers_work_as_expected() {
    assert_eq!(
        Token::Integer(IntegerWithBase { base: None, value: 1 }),
        parsed_single_token("1")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: Some(2), value: 1 }),
        parsed_single_token("0b1")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: Some(8), value: 1 }),
        parsed_single_token("0o1")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: Some(10), value: 1 }),
        parsed_single_token("0d1")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: Some(16), value: 1 }),
        parsed_single_token("0x1")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: None, value: 10 }),
        parsed_single_token("10")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: Some(2), value: 2 }),
        parsed_single_token("0b10")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: Some(8), value: 8 }),
        parsed_single_token("0o10")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: None, value: 10 }),
        parsed_single_token("0010")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: Some(10), value: 10 }),
        parsed_single_token("0d10")
    );
    assert_eq!(
        Token::Integer(IntegerWithBase { base: Some(16), value: 16 }),
        parsed_single_token("0x10")
    );
}

#[test]
fn lambda_works() {
    assert_eq!(Token::Lambda(false), parsed_single_token("\\"));
    assert_eq!(Token::Lambda(true), parsed_single_token("λ"));
    assert_eq!(Token::TypeName("Λ".into()), parsed_single_token("Λ"));
}

#[test]
fn types_work_as_expected() {
    assert_eq!(Token::TypeName("Int".into()), parsed_single_token("Int"));
    assert_eq!(Token::TypeName("Int8".into()), parsed_single_token("Int8"));
    assert_eq!(Token::TypeName("Γ".into()), parsed_single_token("Γ"));
}

#[test]
fn values_work_as_expected() {
    assert_eq!(
        Token::ValueName("alpha".into()),
        parsed_single_token("alpha")
    );
    assert_eq!(Token::ValueName("ɑ".into()), parsed_single_token("ɑ"));
}

#[test]
fn primitives() {
    assert_eq!(
        Token::PrimitiveValueName("add_u8".into()),
        parsed_single_token("prim%add_u8"),
    );
    assert_eq!(
        Token::PrimitiveTypeName("U8".into()),
        parsed_single_token("prim%U8"),
    );
    assert!(Lexer::from("prim%").next().unwrap().is_err());
    assert!(Lexer::from("prim%%").next().unwrap().is_err());
}

#[test]
fn operators_work_as_expected() {
    assert_eq!(Token::OperatorName("-".into()), parsed_single_token("-"));
    assert_eq!(Token::OperatorName("+".into()), parsed_single_token("+"));
    assert_eq!(Token::OperatorName("*".into()), parsed_single_token("*"));
    assert_eq!(Token::OperatorName("/".into()), parsed_single_token("/"));
    assert_eq!(Token::OperatorName("↣".into()), parsed_single_token("↣"));
}

#[test]
fn can_separate_pieces() {
    let mut lexer = Lexer::from("a-b");
    let mut next_token = move || lexer.next().map(|x| x.expect("Can read valid token").token);
    assert_eq!(Some(Token::ValueName("a".into())), next_token());
    assert_eq!(Some(Token::OperatorName("-".into())), next_token());
    assert_eq!(Some(Token::ValueName("b".into())), next_token());
    assert_eq!(None, next_token());

    let mut lexer = Lexer::from("a--b");
    let mut next_token = move || lexer.next().map(|x| x.expect("Can read valid token").token);
    assert_eq!(Some(Token::ValueName("a".into())), next_token());
    assert_eq!(Some(Token::OperatorName("--".into())), next_token());
    assert_eq!(Some(Token::ValueName("b".into())), next_token());
    assert_eq!(None, next_token());

    let mut lexer = Lexer::from("a - -b");
    let mut next_token = move || lexer.next().map(|x| x.expect("Can read valid token").token);
    assert_eq!(Some(Token::ValueName("a".into())), next_token());
    assert_eq!(Some(Token::OperatorName("-".into())), next_token());
    assert_eq!(Some(Token::OperatorName("-".into())), next_token());
    assert_eq!(Some(Token::ValueName("b".into())), next_token());
    assert_eq!(None, next_token());
}

#[test]
fn arrow_requires_nonop() {
    let mut lexer = Lexer::from("->");
    let mut next_token = move || lexer.next().map(|x| x.expect("Can read valid token").token);
    assert_eq!(Some(Token::Arrow), next_token());

    let mut lexer = Lexer::from("->*");
    let mut next_token = move || lexer.next().map(|x| x.expect("Can read valid token").token);
    assert_eq!(Some(Token::OperatorName("->*".into())), next_token());

    let mut lexer = Lexer::from("->*x");
    let mut next_token = move || lexer.next().map(|x| x.expect("Can read valid token").token);
    assert_eq!(Some(Token::OperatorName("->*".into())), next_token());

    let mut lexer = Lexer::from("->x");
    let mut next_token = move || lexer.next().map(|x| x.expect("Can read valid token").token);
    assert_eq!(Some(Token::Arrow), next_token());
}

#[test]
fn unicode() {
    let mut lexer = Lexer::from("'\\u{00BE}'");
    let mut next_token = move || lexer.next().map(|x| x.expect("Can read valid token").token);
    assert_eq!(Some(Token::Character('¾')), next_token());

    let mut lexer = Lexer::from("'\\u{11111111111111111111111111111}'");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("'\\u{00BE'");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("'\\u00BE}'");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("'\\u");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("'\\u{00Z}'");
    assert!(lexer.next().unwrap().is_err());
}

#[test]
fn character_string_errors() {
    let mut lexer = Lexer::from("'");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("'-\\");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("''");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("'ab'");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("'\\x'");
    assert!(lexer.next().unwrap().is_err());

    let mut lexer = Lexer::from("'a'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character('a'), .. }))
    ));

    let mut lexer = Lexer::from("'\\0'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character('\0'), .. }))
    ));

    let mut lexer = Lexer::from("'\\a'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character(_), .. }))
    ));

    let mut lexer = Lexer::from("'\\b'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character(_), .. }))
    ));

    let mut lexer = Lexer::from("'\\f'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character(_), .. }))
    ));

    let mut lexer = Lexer::from("'\\n'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character(_), .. }))
    ));

    let mut lexer = Lexer::from("'\\r'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character(_), .. }))
    ));

    let mut lexer = Lexer::from("'\\t'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character(_), .. }))
    ));

    let mut lexer = Lexer::from("'\\v'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character(_), .. }))
    ));

    let mut lexer = Lexer::from("'\\''");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character('\''), .. }))
    ));

    let mut lexer = Lexer::from("'\\\\'");
    assert!(matches!(
        lexer.next(),
        Some(Ok(LocatedToken { token: Token::Character('\\'), .. }))
    ));

    let mut lexer = Lexer::from("\"foo");
    assert!(lexer.next().unwrap().is_err());
}
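
// A minimal end-to-end sketch of driving the lexer as an iterator, not part of
// the original test suite: the `\x -> x + 1` input and the expected token
// sequence are illustrative assumptions, while the API used (`Lexer::from`,
// collecting the `Result` items, and the `LocatedToken`/`Token` fields) is the
// one defined above.
#[test]
fn lexer_usage_sketch() {
    let tokens: Result<Vec<LocatedToken>, LexerError> = Lexer::from("\\x -> x + 1").collect();
    let kinds: Vec<Token> = tokens
        .expect("the sample input lexes cleanly")
        .into_iter()
        .map(|located| located.token)
        .collect();
    assert_eq!(
        kinds,
        vec![
            Token::Lambda(false),
            Token::ValueName("x".into()),
            Token::Arrow,
            Token::ValueName("x".into()),
            Token::OperatorName("+".into()),
            Token::Integer(IntegerWithBase { base: None, value: 1 }),
        ]
    );
}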