✍️ Switch to a handwritten lexer and parser. #1
5
.gitignore
vendored
5
.gitignore
vendored
@@ -6,3 +6,8 @@
|
||||
hsrc/Syntax/Lexer.hs
|
||||
hsrc/Syntax/Parser.hs
|
||||
bang
|
||||
|
||||
|
||||
# Added by cargo
|
||||
|
||||
/target
|
||||
|
||||
1044
Cargo.lock
generated
Normal file
1044
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
16
Cargo.toml
Normal file
16
Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[package]
|
||||
name = "bang"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
codespan = "0.12.0"
|
||||
codespan-reporting = "0.12.0"
|
||||
lalrpop-util = "0.20.2"
|
||||
logos = "0.15.1"
|
||||
proptest = "1.7.0"
|
||||
proptest-derive = "0.6.0"
|
||||
thiserror = "2.0.12"
|
||||
|
||||
[build-dependencies]
|
||||
lalrpop = "0.20.2"
|
||||
5
build.rs
Normal file
5
build.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
extern crate lalrpop;
|
||||
|
||||
fn main() {
|
||||
lalrpop::process_root().unwrap();
|
||||
}
|
||||
1
src/bin/bangc.rs
Normal file
1
src/bin/bangc.rs
Normal file
@@ -0,0 +1 @@
|
||||
/// Entry point of the `bangc` binary; currently an empty placeholder.
fn main() {
}
|
||||
1
src/lib.rs
Normal file
1
src/lib.rs
Normal file
@@ -0,0 +1 @@
|
||||
pub mod syntax;
|
||||
251
src/syntax.rs
Normal file
251
src/syntax.rs
Normal file
@@ -0,0 +1,251 @@
|
||||
use lalrpop_util::lalrpop_mod;
|
||||
|
||||
mod error;
|
||||
lalrpop_mod!(
|
||||
#[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)]
|
||||
parser,
|
||||
"/syntax/parser.rs"
|
||||
);
|
||||
pub mod tokens;
|
||||
|
||||
#[cfg(test)]
|
||||
use crate::syntax::error::ParserError;
|
||||
#[cfg(test)]
|
||||
use crate::syntax::parser::*;
|
||||
#[cfg(test)]
|
||||
use crate::syntax::tokens::Lexer;
|
||||
use codespan_reporting::diagnostic::Label;
|
||||
use proptest_derive::Arbitrary;
|
||||
use std::cmp::{max, min};
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
|
||||
/// A region of source text: a file identifier plus a range of offsets in it.
#[derive(Debug)]
pub struct Location {
    /// Identifier of the file the span belongs to.
    file_id: usize,
    /// Offsets covered within that file.
    span: Range<usize>,
}
|
||||
|
||||
impl Location {
|
||||
pub fn new(file_id: usize, span: Range<usize>) -> Self {
|
||||
Location { file_id, span }
|
||||
}
|
||||
|
||||
pub fn extend_to(&self, other: &Location) -> Location {
|
||||
assert_eq!(self.file_id, other.file_id);
|
||||
Location {
|
||||
file_id: self.file_id,
|
||||
span: min(self.span.start, other.span.start)..max(self.span.end, other.span.end),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn primary_label(&self) -> Label<usize> {
|
||||
Label::primary(self.file_id, self.span.clone())
|
||||
}
|
||||
|
||||
pub fn secondary_label(&self) -> Label<usize> {
|
||||
Label::secondary(self.file_id, self.span.clone())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Module {
|
||||
definitions: Vec<Definition>,
|
||||
}
|
||||
|
||||
pub struct Definition {
|
||||
location: Location,
|
||||
export: ExportClass,
|
||||
type_restrictions: TypeRestrictions,
|
||||
definition: Def,
|
||||
}
|
||||
|
||||
pub enum Def {
|
||||
Enumeration(EnumerationDef),
|
||||
Structure(StructureDef),
|
||||
Function(FunctionDef),
|
||||
Value(ValueDef),
|
||||
}
|
||||
|
||||
impl Def {
|
||||
fn location(&self) -> &Location {
|
||||
match self {
|
||||
Def::Enumeration(def) => &def.location,
|
||||
Def::Structure(def) => &def.location,
|
||||
Def::Function(def) => &def.location,
|
||||
Def::Value(def) => &def.location,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EnumerationDef {
|
||||
location: Location,
|
||||
options: Vec<EnumerationVariant>,
|
||||
}
|
||||
|
||||
pub struct EnumerationVariant {
|
||||
location: Location,
|
||||
name: String,
|
||||
arguments: Vec<Type>,
|
||||
}
|
||||
|
||||
pub struct StructureDef {
|
||||
name: String,
|
||||
location: Location,
|
||||
fields: Vec<StructureField>,
|
||||
}
|
||||
|
||||
pub struct StructureField {
|
||||
name: String,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
pub struct FunctionDef {
|
||||
name: String,
|
||||
location: Location,
|
||||
arguments: Vec<FunctionArg>,
|
||||
return_type: Option<Type>,
|
||||
body: Vec<Statement>,
|
||||
}
|
||||
|
||||
pub struct FunctionArg {
|
||||
name: String,
|
||||
arg_type: Option<Type>,
|
||||
}
|
||||
|
||||
pub struct ValueDef {
|
||||
name: String,
|
||||
location: Location,
|
||||
value: Value,
|
||||
}
|
||||
|
||||
pub enum ExportClass {
|
||||
Public,
|
||||
Private,
|
||||
}
|
||||
|
||||
pub enum Statement {
|
||||
Binding(BindingStmt),
|
||||
}
|
||||
|
||||
pub struct BindingStmt {
|
||||
location: Location,
|
||||
mutable: bool,
|
||||
variable: String,
|
||||
value: Expression,
|
||||
}
|
||||
|
||||
pub enum Expression {
|
||||
Value(Value),
|
||||
}
|
||||
|
||||
pub struct TypeRestrictions {
|
||||
restrictions: Vec<TypeRestriction>,
|
||||
}
|
||||
|
||||
impl TypeRestrictions {
|
||||
fn empty() -> Self {
|
||||
TypeRestrictions {
|
||||
restrictions: vec![],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TypeRestriction {
|
||||
location: Location,
|
||||
class: String,
|
||||
variables: Vec<String>,
|
||||
}
|
||||
|
||||
pub enum Type {
|
||||
Constructor(Location, String),
|
||||
Variable(Location, String),
|
||||
Primitive(Location, String),
|
||||
Application(Box<Type>, Vec<Type>),
|
||||
Function(Vec<Type>, Box<Type>),
|
||||
}
|
||||
|
||||
pub enum Value {
|
||||
Constant(ConstantValue),
|
||||
}
|
||||
|
||||
pub enum ConstantValue {
|
||||
Integer(Location, IntegerWithBase),
|
||||
Character(Location, char),
|
||||
String(Location, String),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Arbitrary)]
|
||||
pub struct IntegerWithBase {
|
||||
#[proptest(strategy = "proptest::prop_oneof![ \
|
||||
proptest::strategy::Just(None), \
|
||||
proptest::strategy::Just(Some(2)), \
|
||||
proptest::strategy::Just(Some(8)), \
|
||||
proptest::strategy::Just(Some(10)), \
|
||||
proptest::strategy::Just(Some(16)), \
|
||||
]")]
|
||||
base: Option<u8>,
|
||||
value: u64,
|
||||
}
|
||||
|
||||
/// Checks that integer, string and character constants parse to the
/// expected `ConstantValue` variants.
#[test]
fn can_parse_constants() {
    // Lex `source`, wrapping lexer failures in `ParserError`, and run the
    // constant-value parser over the resulting token stream.
    let parse_constant = |source: &str| {
        let tokens = Lexer::from(source).map(|item| {
            item.map_err(|error| ParserError::LexerError { file_id: 0, error })
        });
        ConstantValueParser::new().parse(0, tokens)
    };

    // Every spelling below denotes the number sixteen.
    let integer_cases = [
        ("16", None, 16),
        ("0x10", Some(16), 16),
        ("0o20", Some(8), 16),
        ("0b10000", Some(2), 16),
    ];
    for (source, expected_base, expected_value) in integer_cases {
        assert!(matches!(
            parse_constant(source),
            Ok(ConstantValue::Integer(_, IntegerWithBase { base, value }))
                if base == expected_base && value == expected_value
        ));
    }

    assert!(matches!(
        parse_constant("\"foo\""),
        Ok(ConstantValue::String(_, text)) if text == "foo"
    ));
    assert!(matches!(
        parse_constant("'f'"),
        Ok(ConstantValue::Character(_, 'f'))
    ));
}
|
||||
116
src/syntax/error.rs
Normal file
116
src/syntax/error.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
//use codespan_reporting::diagnostic::{Diagnostic, Label};
|
||||
use crate::syntax::tokens::Token;
|
||||
use std::ops::Range;
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ParserError {
|
||||
#[error("Lexer error at {file_id}: {error}")]
|
||||
LexerError { file_id: usize, error: LexerError },
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Error, PartialEq)]
|
||||
pub enum LexerError {
|
||||
#[error("Illegal control character in input stream at offset {offset}")]
|
||||
IllegalControlCharacter { offset: usize },
|
||||
|
||||
#[error("Illegal primitive value/type; it cut off before we could determine which at {span:?}")]
|
||||
IllegalPrimitive { span: Range<usize> },
|
||||
|
||||
#[error("Illegal character in primitive ({char:?}) at {span:?}")]
|
||||
IllegalPrimitiveCharacter { span: Range<usize>, char: char },
|
||||
|
||||
#[error("Unfinished character constant found at {span:?}")]
|
||||
UnfinishedCharacter { span: Range<usize> },
|
||||
|
||||
#[error("Unfinished string constant found at {span:?}")]
|
||||
UnfinishedString { span: Range<usize> },
|
||||
|
||||
#[error("Character {char:?} has some extra bits at the end at {span:?}")]
|
||||
OverlongCharacter { char: char, span: Range<usize> },
|
||||
|
||||
#[error("Unknown escaped character {escaped_char:?} at {span:?}")]
|
||||
UnknownEscapeCharacter {
|
||||
escaped_char: char,
|
||||
span: Range<usize>,
|
||||
},
|
||||
|
||||
#[error("Invalid unicode escape sequence at {span:?}")]
|
||||
InvalidUnicode { span: Range<usize> },
|
||||
}
|
||||
|
||||
impl LexerError {
|
||||
pub fn to_triple(&self) -> (usize, Result<Token, LexerError>, usize) {
|
||||
match self {
|
||||
LexerError::IllegalControlCharacter { offset } => (*offset, Err(self.clone()), *offset),
|
||||
LexerError::IllegalPrimitive { span } => (span.start, Err(self.clone()), span.end),
|
||||
LexerError::IllegalPrimitiveCharacter { span, .. } => {
|
||||
(span.start, Err(self.clone()), span.end)
|
||||
}
|
||||
LexerError::UnfinishedCharacter { span, .. } => {
|
||||
(span.start, Err(self.clone()), span.end)
|
||||
}
|
||||
LexerError::UnfinishedString { span, .. } => (span.start, Err(self.clone()), span.end),
|
||||
LexerError::OverlongCharacter { span, .. } => (span.start, Err(self.clone()), span.end),
|
||||
LexerError::UnknownEscapeCharacter { span, .. } => {
|
||||
(span.start, Err(self.clone()), span.end)
|
||||
}
|
||||
LexerError::InvalidUnicode { span, .. } => (span.start, Err(self.clone()), span.end),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//impl<F> From<LexerError> for Diagnostic<F> {
|
||||
// fn from(value: LexerError) -> Self {
|
||||
// match value {
|
||||
// LexerError::IllegalControlCharacter { file, offset } => Diagnostic::error()
|
||||
// .with_code("E1001")
|
||||
// .with_message("Illegal control character in input stream")
|
||||
// .with_label(Label::primary(file, offset..offset).with_message("illegal character")),
|
||||
//
|
||||
// LexerError::IllegalPrimitive { file, span } => Diagnostic::error()
|
||||
// .with_code("E1002")
|
||||
// .with_message("Illegal primitive; it cut off before it could finish")
|
||||
// .with_label(
|
||||
// Label::primary(file, span)
|
||||
// .with_message("should be at least one character after the %"),
|
||||
// ),
|
||||
//
|
||||
// LexerError::IllegalPrimitiveCharacter { file, span, char } => Diagnostic::error()
|
||||
// .with_code("E1003")
|
||||
// .with_message(format!("Illegal character {char:?} in primitive"))
|
||||
// .with_label(Label::primary(file, span).with_message("illegal character")),
|
||||
//
|
||||
// LexerError::UnfinishedCharacter { file, span } => Diagnostic::error()
|
||||
// .with_code("E1004")
|
||||
// .with_message("Unfinished character in input stream.")
|
||||
// .with_label(Label::primary(file, span).with_message("unfinished character")),
|
||||
//
|
||||
// LexerError::UnfinishedString { file, span } => Diagnostic::error()
|
||||
// .with_code("E1005")
|
||||
// .with_message("Unfinished string in input stream.")
|
||||
// .with_label(Label::primary(file, span).with_message("unfinished string")),
|
||||
//
|
||||
// LexerError::OverlongCharacter { file, char, span } => Diagnostic::error()
|
||||
// .with_code("E1006")
|
||||
// .with_message(format!(
|
||||
// "Character {char:?} has some extra bits at the end of it."
|
||||
// ))
|
||||
// .with_label(Label::primary(file, span).with_message("overlong character")),
|
||||
//
|
||||
// LexerError::UnknownEscapeCharacter {
|
||||
// file,
|
||||
// escaped_char,
|
||||
// span,
|
||||
// } => Diagnostic::error()
|
||||
// .with_code("E1007")
|
||||
// .with_message(format!("Unknown escape character {escaped_char:?}."))
|
||||
// .with_label(Label::primary(file, span).with_message("unknown character")),
|
||||
//
|
||||
// LexerError::InvalidUnicode { file, span } => Diagnostic::error()
|
||||
// .with_code("E1008")
|
||||
// .with_message("Unknown or invalid unicode escape sequence.")
|
||||
// .with_label(Label::primary(file, span).with_message("escape sequence")),
|
||||
// }
|
||||
// }
|
||||
//}
|
||||
39
src/syntax/parser.lalrpop
Normal file
39
src/syntax/parser.lalrpop
Normal file
@@ -0,0 +1,39 @@
|
||||
use crate::syntax::*;
|
||||
use crate::syntax::error::ParserError;
|
||||
use crate::syntax::tokens::*;
|
||||
|
||||
// Every parser generated from this grammar takes the identifier of the file
// being parsed, so that AST locations can record it.
grammar(file_id: usize);

// Tokens are supplied by an external lexer as `(start, token, end)` triples;
// the quoted names below are how productions refer to each token.
extern {
    type Location = usize;
    type Error = ParserError;

    enum Token {
        "(" => Token::OpenParen,
        ")" => Token::CloseParen,
        "[" => Token::OpenSquare,
        "]" => Token::CloseSquare,
        "{" => Token::OpenBrace,
        "}" => Token::CloseBrace,
        ";" => Token::Semi,
        ":" => Token::Colon,
        "," => Token::Comma,
        "`" => Token::BackTick,
        "\\" => Token::Lambda(_),

        "<constructor>" => Token::TypeName(<String>),
        "<value>" => Token::ValueName(<String>),
        "<op>" => Token::OperatorName(<String>),
        "<prim_constructor>" => Token::PrimitiveTypeName(<String>),
        "<prim_value>" => Token::PrimitiveValueName(<String>),
        "<integer>" => Token::Integer(<IntegerWithBase>),
        "<char>" => Token::Character(<char>),
        "<string>" => Token::String(<String>),
    }
}

// A literal constant. `@L` captures where the token starts and `@R` where it
// ends; using `@L` for the end would instead yield the start of the
// lookahead token, stretching the span over any intervening whitespace.
pub ConstantValue: ConstantValue = {
    <s:@L> <x:"<integer>"> <e:@R> => ConstantValue::Integer(Location::new(file_id, s..e), x),
    <s:@L> <x:"<char>"> <e:@R> => ConstantValue::Character(Location::new(file_id, s..e), x),
    <s:@L> <x:"<string>"> <e:@R> => ConstantValue::String(Location::new(file_id, s..e), x),
}
|
||||
609
src/syntax/tokens.rs
Normal file
609
src/syntax/tokens.rs
Normal file
@@ -0,0 +1,609 @@
|
||||
use crate::syntax::IntegerWithBase;
|
||||
use crate::syntax::error::LexerError;
|
||||
use proptest_derive::Arbitrary;
|
||||
use std::fmt;
|
||||
use std::str::CharIndices;
|
||||
|
||||
/// A single token of the input stream; used to help the parsing function over
|
||||
/// more concrete things than bytes.
|
||||
///
|
||||
/// The [`std::fmt::Display`] implementation is designed to round-trip, so those
|
||||
/// needing a more regular or descriptive option should consider using the
|
||||
/// [`std::fmt::Debug`] implementation instead.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Arbitrary)]
|
||||
pub enum Token {
|
||||
OpenParen,
|
||||
CloseParen,
|
||||
OpenSquare,
|
||||
CloseSquare,
|
||||
OpenBrace,
|
||||
CloseBrace,
|
||||
Semi,
|
||||
Colon,
|
||||
Comma,
|
||||
BackTick,
|
||||
Lambda(bool),
|
||||
|
||||
TypeName(#[proptest(regex = r"[A-Z][a-zA-Z0-9_]*")] String),
|
||||
ValueName(#[proptest(regex = r"[a-z_][a-zA-Z0-9_]*")] String),
|
||||
OperatorName(
|
||||
#[proptest(
|
||||
regex = r"[\~\!\@\#\$\%\^\&\*\+\-\=\.<>\?\|][\~\!\@\#\$\%\^\&\*\+\-\=\.<>\?\|_]*"
|
||||
)]
|
||||
String,
|
||||
),
|
||||
|
||||
PrimitiveTypeName(#[proptest(regex = r"[A-Z][a-zA-Z0-9_]*")] String),
|
||||
PrimitiveValueName(#[proptest(regex = r"[a-z_][a-zA-Z0-9_]*")] String),
|
||||
|
||||
Integer(IntegerWithBase),
|
||||
Character(char),
|
||||
String(String),
|
||||
}
|
||||
|
||||
impl fmt::Display for Token {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Token::OpenParen => write!(f, "("),
|
||||
Token::CloseParen => write!(f, ")"),
|
||||
Token::OpenSquare => write!(f, "["),
|
||||
Token::CloseSquare => write!(f, "]"),
|
||||
Token::OpenBrace => write!(f, "{{"),
|
||||
Token::CloseBrace => write!(f, "}}"),
|
||||
Token::Semi => write!(f, ";"),
|
||||
Token::Colon => write!(f, ":"),
|
||||
Token::Comma => write!(f, ","),
|
||||
Token::BackTick => write!(f, "`"),
|
||||
Token::Lambda(false) => write!(f, "\\"),
|
||||
Token::Lambda(true) => write!(f, "λ"),
|
||||
Token::TypeName(str) => write!(f, "{str}"),
|
||||
Token::ValueName(str) => write!(f, "{str}"),
|
||||
Token::OperatorName(str) => write!(f, "{str}"),
|
||||
Token::PrimitiveTypeName(str) => write!(f, "prim%{str}"),
|
||||
Token::PrimitiveValueName(str) => write!(f, "prim%{str}"),
|
||||
Token::Integer(IntegerWithBase { base, value }) => match base {
|
||||
None => write!(f, "{value}"),
|
||||
Some(2) => write!(f, "0b{value:b}"),
|
||||
Some(8) => write!(f, "0o{value:o}"),
|
||||
Some(10) => write!(f, "0d{value}"),
|
||||
Some(16) => write!(f, "0x{value:x}"),
|
||||
Some(base) => write!(f, "<illegal number token base={base} value={value}>"),
|
||||
},
|
||||
Token::Character(c) => write!(f, "{c:?}"),
|
||||
Token::String(s) => write!(f, "{s:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(private_interfaces)]
|
||||
pub enum Lexer<'a> {
|
||||
Working(LexerState<'a>),
|
||||
Errored(LexerError),
|
||||
Done(usize),
|
||||
}
|
||||
|
||||
/// Cursor over the input with one character of push-back.
struct LexerState<'a> {
    /// Remaining input, as `(byte offset, character)` pairs.
    stream: CharIndices<'a>,
    /// A character that was read and then stashed; returned before `stream`
    /// is consulted again.
    buffer: Option<(usize, char)>,
}
|
||||
|
||||
impl<'a> From<&'a str> for Lexer<'a> {
|
||||
fn from(value: &'a str) -> Self {
|
||||
println!("LEXING '{value}'");
|
||||
Lexer::Working(LexerState {
|
||||
stream: value.char_indices(),
|
||||
buffer: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Lexer<'a> {
|
||||
pub fn new(stream: &'a str) -> Self {
|
||||
Lexer::Working(LexerState {
|
||||
stream: stream.char_indices(),
|
||||
buffer: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Lexer<'a> {
|
||||
type Item = Result<(usize, Token, usize), LexerError>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self {
|
||||
Lexer::Done(_) => None,
|
||||
Lexer::Errored(e) => Some(Err(e.clone())),
|
||||
Lexer::Working(state) => match state.next_token() {
|
||||
Err(e) => {
|
||||
println!("ERROR: {e}");
|
||||
*self = Lexer::Errored(e.clone());
|
||||
Some(Err(e))
|
||||
}
|
||||
|
||||
Ok(None) => {
|
||||
println!("LEXER DONE");
|
||||
*self = Lexer::Done(state.stream.offset());
|
||||
None
|
||||
}
|
||||
|
||||
Ok(Some((start, token, end))) => {
|
||||
println!("TOKEN: {:?}", token);
|
||||
Some(Ok((start, token, end)))
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> LexerState<'a> {
|
||||
fn next_char(&mut self) -> Option<(usize, char)> {
|
||||
let result = self.buffer.take().or_else(|| self.stream.next());
|
||||
println!("next_char() -> {result:?}");
|
||||
result
|
||||
}
|
||||
|
||||
fn stash_char(&mut self, idx: usize, c: char) {
|
||||
println!("stash_char({idx}, {c})");
|
||||
assert!(self.buffer.is_none());
|
||||
self.buffer = Some((idx, c));
|
||||
}
|
||||
|
||||
fn next_token(&mut self) -> Result<Option<(usize, Token, usize)>, LexerError> {
|
||||
while let Some((token_start_offset, char)) = self.next_char() {
|
||||
if char.is_whitespace() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let simple_response =
|
||||
|token| Ok(Some((token_start_offset, token, self.stream.offset())));
|
||||
|
||||
match char {
|
||||
'(' => return simple_response(Token::OpenParen),
|
||||
')' => return simple_response(Token::CloseParen),
|
||||
'[' => return simple_response(Token::OpenSquare),
|
||||
']' => return simple_response(Token::CloseSquare),
|
||||
'{' => return simple_response(Token::OpenBrace),
|
||||
'}' => return simple_response(Token::CloseBrace),
|
||||
';' => return simple_response(Token::Semi),
|
||||
':' => return simple_response(Token::Colon),
|
||||
',' => return simple_response(Token::Comma),
|
||||
'`' => return simple_response(Token::BackTick),
|
||||
'\\' => return simple_response(Token::Lambda(false)),
|
||||
'λ' => return simple_response(Token::Lambda(true)),
|
||||
|
||||
'0' => return self.starts_with_zero(token_start_offset),
|
||||
'\'' => return self.starts_with_single(token_start_offset),
|
||||
'\"' => return self.starts_with_double(token_start_offset),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
if let Some(value) = char.to_digit(10) {
|
||||
return self.parse_integer(token_start_offset, 10, None, value as u64);
|
||||
}
|
||||
|
||||
if char.is_uppercase() {
|
||||
return self.parse_identifier(
|
||||
token_start_offset,
|
||||
char.into(),
|
||||
|c| c.is_alphanumeric() || c == '_',
|
||||
Token::TypeName,
|
||||
);
|
||||
}
|
||||
|
||||
if char.is_alphabetic() || char == '_' {
|
||||
return self.parse_identifier(
|
||||
token_start_offset,
|
||||
char.into(),
|
||||
|c| c.is_alphanumeric() || c == '_',
|
||||
Token::ValueName,
|
||||
);
|
||||
}
|
||||
|
||||
if !char.is_alphanumeric() && !char.is_whitespace() && !char.is_control() {
|
||||
return self.parse_identifier(
|
||||
token_start_offset,
|
||||
char.into(),
|
||||
|c| !c.is_alphanumeric() && !c.is_whitespace() && !c.is_control(),
|
||||
Token::OperatorName,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn starts_with_zero(
|
||||
&mut self,
|
||||
token_start_offset: usize,
|
||||
) -> Result<Option<(usize, Token, usize)>, LexerError> {
|
||||
match self.next_char() {
|
||||
None => {
|
||||
let token = Token::Integer(IntegerWithBase {
|
||||
base: None,
|
||||
value: 0,
|
||||
});
|
||||
Ok(Some((token_start_offset, token, self.stream.offset())))
|
||||
}
|
||||
|
||||
Some((_, 'b')) => self.parse_integer(token_start_offset, 2, Some(2), 0),
|
||||
Some((_, 'o')) => self.parse_integer(token_start_offset, 8, Some(8), 0),
|
||||
Some((_, 'd')) => self.parse_integer(token_start_offset, 10, Some(10), 0),
|
||||
Some((_, 'x')) => self.parse_integer(token_start_offset, 16, Some(16), 0),
|
||||
|
||||
Some((offset, c)) => {
|
||||
if let Some(value) = c.to_digit(10) {
|
||||
self.parse_integer(token_start_offset, 10, None, value as u64)
|
||||
} else {
|
||||
self.stash_char(offset, c);
|
||||
let token = Token::Integer(IntegerWithBase {
|
||||
base: None,
|
||||
value: 0,
|
||||
});
|
||||
Ok(Some((token_start_offset, token, offset)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_integer(
|
||||
&mut self,
|
||||
token_start_offset: usize,
|
||||
base: u32,
|
||||
provided_base: Option<u8>,
|
||||
mut value: u64,
|
||||
) -> Result<Option<(usize, Token, usize)>, LexerError> {
|
||||
let mut end_offset = self.stream.offset();
|
||||
|
||||
while let Some((offset, c)) = self.next_char() {
|
||||
end_offset = offset;
|
||||
if let Some(digit) = c.to_digit(base) {
|
||||
value = (value * (base as u64)) + (digit as u64);
|
||||
} else {
|
||||
self.stash_char(offset, c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let token = Token::Integer(IntegerWithBase {
|
||||
base: provided_base,
|
||||
value,
|
||||
});
|
||||
|
||||
Ok(Some((token_start_offset, token, end_offset)))
|
||||
}
|
||||
|
||||
fn parse_identifier(
|
||||
&mut self,
|
||||
token_start_offset: usize,
|
||||
mut identifier: String,
|
||||
mut allowed_character: fn(char) -> bool,
|
||||
mut builder: fn(String) -> Token,
|
||||
) -> Result<Option<(usize, Token, usize)>, LexerError> {
|
||||
let mut end_offset = self.stream.offset();
|
||||
|
||||
while let Some((offset, c)) = self.next_char() {
|
||||
end_offset = offset;
|
||||
|
||||
if allowed_character(c) {
|
||||
identifier.push(c);
|
||||
} else if identifier == "prim" && c == '%' {
|
||||
identifier = String::new();
|
||||
allowed_character = |c| c.is_alphanumeric() || c == '_';
|
||||
match self.next_char() {
|
||||
None => {
|
||||
return Err(LexerError::IllegalPrimitive {
|
||||
span: token_start_offset..end_offset,
|
||||
});
|
||||
}
|
||||
|
||||
Some((_, char)) => {
|
||||
if char.is_uppercase() {
|
||||
identifier.push(char);
|
||||
builder = Token::PrimitiveTypeName;
|
||||
} else if char.is_lowercase() || char == '_' {
|
||||
identifier.push(char);
|
||||
builder = Token::PrimitiveValueName;
|
||||
} else {
|
||||
return Err(LexerError::IllegalPrimitiveCharacter {
|
||||
span: token_start_offset..end_offset,
|
||||
char,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
self.stash_char(offset, c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Some((token_start_offset, builder(identifier), end_offset)))
|
||||
}
|
||||
|
||||
fn starts_with_single(
|
||||
&mut self,
|
||||
token_start_offset: usize,
|
||||
) -> Result<Option<(usize, Token, usize)>, LexerError> {
|
||||
let Some((_, mut char)) = self.next_char() else {
|
||||
return Err(LexerError::UnfinishedCharacter {
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
});
|
||||
};
|
||||
|
||||
if char == '\\' {
|
||||
char = self.get_escaped_character(token_start_offset)?;
|
||||
}
|
||||
|
||||
let Some((idx, finish_char)) = self.next_char() else {
|
||||
return Err(LexerError::UnfinishedCharacter {
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
});
|
||||
};
|
||||
|
||||
if finish_char != '\'' {
|
||||
return Err(LexerError::OverlongCharacter {
|
||||
char,
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(Some((token_start_offset, Token::Character(char), idx)))
|
||||
}
|
||||
|
||||
fn get_escaped_character(&mut self, token_start_offset: usize) -> Result<char, LexerError> {
|
||||
let Some((idx, escaped_char)) = self.next_char() else {
|
||||
return Err(LexerError::UnfinishedCharacter {
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
});
|
||||
};
|
||||
|
||||
match escaped_char {
|
||||
'0' => Ok('\0'),
|
||||
'a' => Ok('\u{0007}'),
|
||||
'b' => Ok('\u{0008}'),
|
||||
'f' => Ok('\u{000C}'),
|
||||
'n' => Ok('\n'),
|
||||
'r' => Ok('\r'),
|
||||
't' => Ok('\t'),
|
||||
'u' => self.get_unicode_sequence(idx),
|
||||
'v' => Ok('\u{000B}'),
|
||||
'\'' => Ok('\''),
|
||||
'"' => Ok('"'),
|
||||
'\\' => Ok('\\'),
|
||||
_ => Err(LexerError::UnknownEscapeCharacter {
|
||||
escaped_char,
|
||||
span: idx..self.stream.offset(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_unicode_sequence(&mut self, token_start_offset: usize) -> Result<char, LexerError> {
|
||||
let Some((_, char)) = self.next_char() else {
|
||||
return Err(LexerError::InvalidUnicode {
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
});
|
||||
};
|
||||
|
||||
if char != '{' {
|
||||
return Err(LexerError::InvalidUnicode {
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut value = 0;
|
||||
|
||||
while let Some((idx, char)) = self.next_char() {
|
||||
if let Some(digit) = char.to_digit(16) {
|
||||
value = (value * 16) + digit;
|
||||
continue;
|
||||
}
|
||||
|
||||
if char == '}' {
|
||||
if let Some(char) = char::from_u32(value) {
|
||||
return Ok(char);
|
||||
} else {
|
||||
return Err(LexerError::InvalidUnicode {
|
||||
span: token_start_offset..idx,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return Err(LexerError::InvalidUnicode {
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
});
|
||||
}
|
||||
|
||||
Err(LexerError::InvalidUnicode {
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
})
|
||||
}
|
||||
|
||||
fn starts_with_double(
|
||||
&mut self,
|
||||
token_start_offset: usize,
|
||||
) -> Result<Option<(usize, Token, usize)>, LexerError> {
|
||||
let mut result = String::new();
|
||||
|
||||
while let Some((idx, char)) = self.next_char() {
|
||||
match char {
|
||||
'"' => return Ok(Some((token_start_offset, Token::String(result), idx))),
|
||||
|
||||
'\\' => result.push(self.get_escaped_character(idx)?),
|
||||
|
||||
_ => result.push(char),
|
||||
}
|
||||
}
|
||||
|
||||
Err(LexerError::UnfinishedString {
|
||||
span: token_start_offset..self.stream.offset(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
proptest::proptest! {
    /// Round-trip property: displaying a token and lexing the resulting text
    /// yields exactly that one token and nothing more.
    #[test]
    fn token_string_token(token: Token) {
        println!("Starting from {token:?}");
        let rendered = token.to_string();
        let mut lexer = Lexer::from(rendered.as_str());

        let (_, relexed, _) = lexer
            .next()
            .expect("Can get a token without an error.")
            .expect("Can get a valid token.");

        proptest::prop_assert_eq!(token, relexed);
        proptest::prop_assert!(lexer.next().is_none());
    }
}
|
||||
|
||||
/// Test helper: lexes `s` and returns its only token.
///
/// # Panics
///
/// Panics if `s` yields no token, an invalid token, or more than one token.
#[cfg(test)]
fn parsed_single_token(s: &str) -> Token {
    let mut tokens = Lexer::from(s);

    // Build the panic message lazily; `expect(format!(..))` would format it
    // even when a token is present.
    let (_, token, _) = tokens
        .next()
        .unwrap_or_else(|| panic!("Can get at least one token from {s:?}"))
        .expect("Can get a valid token.");

    assert!(
        tokens.next().is_none(),
        "Should only get one token from {s:?}"
    );

    token
}
|
||||
|
||||
/// Integer literals lex to the expected base and value, with and without a
/// base prefix.
#[test]
fn numbers_work_as_expected() {
    // (source text, expected base, expected value)
    let cases: [(&str, Option<u8>, u64); 10] = [
        ("1", None, 1),
        ("0b1", Some(2), 1),
        ("0o1", Some(8), 1),
        ("0d1", Some(10), 1),
        ("0x1", Some(16), 1),
        ("10", None, 10),
        ("0b10", Some(2), 2),
        ("0o10", Some(8), 8),
        ("0d10", Some(10), 10),
        ("0x10", Some(16), 16),
    ];

    for (source, base, value) in cases {
        let expected = Token::Integer(IntegerWithBase { base, value });
        assert_eq!(expected, parsed_single_token(source));
    }
}
|
||||
|
||||
/// Both lambda spellings lex as `Lambda`, while a capital lambda is an
/// ordinary type name.
#[test]
fn lambda_works() {
    let cases = [
        ("\\", Token::Lambda(false)),
        ("λ", Token::Lambda(true)),
        ("Λ", Token::TypeName("Λ".into())),
    ];

    for (source, expected) in cases {
        assert_eq!(expected, parsed_single_token(source));
    }
}
|
||||
|
||||
/// Names starting with an uppercase letter lex as type names.
#[test]
fn types_work_as_expected() {
    for name in ["Int", "Int8", "Γ"] {
        assert_eq!(Token::TypeName(name.into()), parsed_single_token(name));
    }
}
|
||||
|
||||
/// Names starting with a lowercase letter lex as value names.
#[test]
fn values_work_as_expected() {
    for name in ["alpha", "ɑ"] {
        assert_eq!(Token::ValueName(name.into()), parsed_single_token(name));
    }
}
|
||||
|
||||
/// Single symbolic characters lex as operator names.
#[test]
fn operators_work_as_expected() {
    for symbol in ["-", "+", "*", "/", "↣"] {
        assert_eq!(
            Token::OperatorName(symbol.into()),
            parsed_single_token(symbol)
        );
    }
}
|
||||
|
||||
/// Adjacent names and operators are split into separate tokens, and
/// whitespace separates operators that would otherwise merge.
#[test]
fn can_separate_pieces() {
    // Lex the whole input, panicking on the first invalid token.
    let lex_all = |source: &str| -> Vec<Token> {
        Lexer::from(source)
            .map(|item| item.expect("Can read valid token").1)
            .collect()
    };

    let name = |text: &str| Token::ValueName(text.into());
    let op = |text: &str| Token::OperatorName(text.into());

    assert_eq!(vec![name("a"), op("-"), name("b")], lex_all("a-b"));
    assert_eq!(vec![name("a"), op("--"), name("b")], lex_all("a--b"));
    assert_eq!(
        vec![name("a"), op("-"), op("-"), name("b")],
        lex_all("a - -b")
    );
}
|
||||
Reference in New Issue
Block a user