Files
ngr/src/syntax/tokens.rs

484 lines
17 KiB
Rust

use internment::ArcIntern;
use logos::{Lexer, Logos};
use std::{fmt, str::FromStr};
use thiserror::Error;
/// A single token of the input stream; used to help the parsing go down
/// more easily.
///
/// The key way to generate this structure is via the [`Logos`] trait.
/// See the [`logos`] documentation for more information; we use the
/// [`Token::lexer`] function internally.
///
/// The first step in the compilation process is turning the raw string
/// data (in UTF-8, which is its own joy) into a sequence of more sensible
/// tokens. Here, for example, we turn "x=5" into three tokens: a
/// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and
/// then a [`Token::Number`] for the "5". Later on, we'll worry about
/// making sense of those three tokens.
///
/// For now, our list of tokens is relatively straightforward. We'll
/// need/want to extend these later.
///
/// The [`std::fmt::Display`] implementation for [`Token`] is aimed at
/// diagnostics: it generally wraps the token text in single quotes, so
/// its output is not expected to lex back to the same token.
/// NOTE(review): if a lexing round-trip is wanted, the quoting in
/// `Display` has to go -- confirm the intended contract.
#[derive(Logos, Clone, Debug, PartialEq, Eq)]
pub enum Token {
// Whitespace carries no meaning, so the lexer simply skips it.
// NOTE: this rule and the comment rule below are written above `Equals`,
// but because their callback is `logos::skip` the matched text is
// discarded rather than turned into an `Equals` token.
#[regex(r"[ \t\r\n\f]+", logos::skip)]
// This is an extremely simple version of comments: just line comments,
// running from `//` to the end of the line. More complicated /* */
// comments can be harder to implement, and didn't seem worth it at the
// time.
#[regex(r"//.*", logos::skip)]
// Our first set of tokens are simple characters that we're
// going to use to structure NGR programs.
#[token("=")]
Equals,
#[token(":")]
Colon,
#[token(";")]
Semi,
#[token(",")]
Comma,
#[token(".")]
Dot,
#[token("(")]
LeftParen,
#[token(")")]
RightParen,
#[token("<")]
LessThan,
#[token(">")]
GreaterThan,
#[token("_")]
Underscore,
#[token("{")]
OpenBrace,
#[token("}")]
CloseBrace,
#[token("->")]
SingleArrow,
// Three spellings are accepted for the function keyword; all of them
// produce the same `Function` token.
#[token("λ")]
#[token("lambda")]
#[token("function")]
Function,
#[token("struct")]
Struct,
// Next we take care of any reserved words; I always like to put
// these before we start recognizing more complicated regular
// expressions. I don't think it matters, but it works for me.
#[token("print")]
Print,
// Next are the operators for NGR. The regex currently accepts `+`, `-`,
// `*`, `/` and `÷`; we might extend these later, or even make them
// user-definable! The payload is the operator character exactly as it
// was written.
#[regex(r"[+\-*/÷]", |v| v.slice().chars().next())]
Operator(char),
/// Numbers capture the value we read from the input, converted to a
/// `u64`, as well as the base the user used to write the number
/// and/or the type the user specified, if they did either.
///
/// The tuple is `(base, declared type, value)`: the base is `None`
/// when the literal had no `0b`/`0o`/`0d`/`0x` prefix, and the type is
/// `None` when it had no `u8`..`i64` suffix.
#[regex(r"0b[01]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(2), v))]
#[regex(r"0o[0-7]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(8), v))]
#[regex(r"0d[0-9]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(10), v))]
#[regex(r"0x[0-9a-fA-F]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(16), v))]
#[regex(r"[0-9]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(None, v))]
Number((Option<u8>, Option<ConstantType>, u64)),
// Variables; this is a very standard, simple set of characters
// for variables, but feel free to experiment with more complicated
// things. I chose to force variables to start with a lower case
// letter, too. The name is interned via `ArcIntern`.
#[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))]
Variable(ArcIntern<String>),
// Type names; these are like variables, but must start with a capital
// letter.
#[regex(r"[A-Z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))]
TypeName(ArcIntern<String>),
}
impl fmt::Display for Token {
    /// Renders the token for use in diagnostics.
    ///
    /// Every well-formed token is printed as its source text wrapped in
    /// single quotes. Numbers keep the base prefix they were written with
    /// (`0b`, `0o`, `0d`, `0x`, or none) followed by their declared type
    /// suffix, if any. All three spellings of the function keyword print
    /// as `'function'`, since the token does not record which one was used.
    ///
    /// A `Number` whose base is not one of 2, 8, 10 or 16 cannot come out
    /// of the lexer; it is printed in an explicit
    /// `Invalidly-based-number<...>` form so the problem is visible.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Equals => write!(f, "'='"),
            Token::Colon => write!(f, "':'"),
            Token::Semi => write!(f, "';'"),
            Token::Comma => write!(f, "','"),
            Token::Dot => write!(f, "'.'"),
            Token::LeftParen => write!(f, "'('"),
            Token::RightParen => write!(f, "')'"),
            // The following tokens used to be printed without quotes,
            // unlike everything else; they are now quoted consistently.
            Token::LessThan => write!(f, "'<'"),
            Token::GreaterThan => write!(f, "'>'"),
            Token::Underscore => write!(f, "'_'"),
            Token::OpenBrace => write!(f, "'{{'"),
            Token::CloseBrace => write!(f, "'}}'"),
            Token::SingleArrow => write!(f, "'->'"),
            Token::Function => write!(f, "'function'"),
            Token::Struct => write!(f, "'struct'"),
            Token::Print => write!(f, "'print'"),
            Token::Operator(c) => write!(f, "'{}'", c),
            Token::Number((base, otype, v)) => {
                let suffix = display_optional_type(otype);
                match base {
                    None => write!(f, "'{}{}'", v, suffix),
                    Some(2) => write!(f, "'0b{:b}{}'", v, suffix),
                    Some(8) => write!(f, "'0o{:o}{}'", v, suffix),
                    // Keep the explicit `0d` prefix so that an explicitly
                    // decimal literal is distinguishable from a bare one.
                    Some(10) => write!(f, "'0d{}{}'", v, suffix),
                    Some(16) => write!(f, "'0x{:x}{}'", v, suffix),
                    Some(b) => write!(
                        f,
                        "Invalidly-based-number<base={},val={},opt_type={:?}>",
                        b, v, otype
                    ),
                }
            }
            Token::Variable(s) => write!(f, "'{}'", s),
            Token::TypeName(s) => write!(f, "'{}'", s),
        }
    }
}
/// A sudden and unexpected error in the lexer: the input contained
/// something that could not be turned into a [`Token`].
#[derive(Debug, Error, PartialEq, Eq)]
pub enum LexerError {
/// Lexing failed. The `usize` here is the offset at which we ran into
/// the problem, counted from the start of the file.
#[error("Failed lexing at {0}")]
LexFailure(usize),
}
#[cfg(test)]
impl Token {
    /// Test-only shorthand: builds a [`Token::Variable`] whose interned
    /// name is `s`.
    pub(crate) fn var(s: &str) -> Token {
        let name = String::from(s);
        Token::Variable(ArcIntern::new(name))
    }
}
/// The primitive types a numeric constant can be declared with, plus
/// [`ConstantType::Void`].
///
/// The enum is `#[repr(i64)]` with explicit discriminants; these are the
/// same integers that the `TryFrom<i64>` implementation for this type
/// accepts when converting back.
#[repr(i64)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ConstantType {
// Unsigned integers use discriminants 10 through 13.
U8 = 10,
U16 = 11,
U32 = 12,
U64 = 13,
// Signed integers use discriminants 20 through 23.
I8 = 20,
I16 = 21,
I32 = 22,
I64 = 23,
// No numeric literal suffix produces `Void`; it is spelled "void" by
// the `FromStr` / `Display` implementations.
Void = 255,
}
impl From<ConstantType> for cranelift_codegen::ir::Type {
    /// Picks the Cranelift integer type of matching width. Signedness
    /// plays no part in the choice, and `Void` is mapped to `I64`.
    fn from(value: ConstantType) -> Self {
        use cranelift_codegen::ir::types;

        match value {
            ConstantType::U8 | ConstantType::I8 => types::I8,
            ConstantType::U16 | ConstantType::I16 => types::I16,
            ConstantType::U32 | ConstantType::I32 => types::I32,
            ConstantType::U64 | ConstantType::I64 | ConstantType::Void => types::I64,
        }
    }
}
/// Error produced when a string does not name any `ConstantType`; it is
/// the `Err` type of `ConstantType`'s `FromStr` implementation.
///
/// The type carries no data. It derives `Debug` so that results holding
/// it can be used with `unwrap`/`expect` and in `assert_eq!`, and the
/// comparison traits so tests can match on it directly.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct StringNotConstantType();
impl FromStr for ConstantType {
    type Err = StringNotConstantType;

    /// Parses the lowercase spelling of a type (`"u8"` .. `"i64"`, or
    /// `"void"`). Anything else, including different capitalisation,
    /// yields [`StringNotConstantType`].
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let parsed = match s {
            "u8" => ConstantType::U8,
            "u16" => ConstantType::U16,
            "u32" => ConstantType::U32,
            "u64" => ConstantType::U64,
            "i8" => ConstantType::I8,
            "i16" => ConstantType::I16,
            "i32" => ConstantType::I32,
            "i64" => ConstantType::I64,
            "void" => ConstantType::Void,
            _ => return Err(StringNotConstantType()),
        };
        Ok(parsed)
    }
}
impl fmt::Display for ConstantType {
    /// Writes the lowercase spelling of the type, e.g. `u8` or `void`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let spelling = match self {
            ConstantType::U8 => "u8",
            ConstantType::U16 => "u16",
            ConstantType::U32 => "u32",
            ConstantType::U64 => "u64",
            ConstantType::I8 => "i8",
            ConstantType::I16 => "i16",
            ConstantType::I32 => "i32",
            ConstantType::I64 => "i64",
            ConstantType::Void => "void",
        };
        f.write_str(spelling)
    }
}
impl ConstantType {
/// Return the set of types that can be safely casted into this type.
pub fn safe_casts_to(self) -> Vec<ConstantType> {
match self {
ConstantType::Void => vec![ConstantType::Void],
ConstantType::I8 => vec![ConstantType::I8],
ConstantType::I16 => vec![ConstantType::I16, ConstantType::I8, ConstantType::U8],
ConstantType::I32 => vec![
ConstantType::I32,
ConstantType::I16,
ConstantType::I8,
ConstantType::U16,
ConstantType::U8,
],
ConstantType::I64 => vec![
ConstantType::I64,
ConstantType::I32,
ConstantType::I16,
ConstantType::I8,
ConstantType::U32,
ConstantType::U16,
ConstantType::U8,
],
ConstantType::U8 => vec![ConstantType::U8],
ConstantType::U16 => vec![ConstantType::U16, ConstantType::U8],
ConstantType::U32 => vec![ConstantType::U32, ConstantType::U16, ConstantType::U8],
ConstantType::U64 => vec![
ConstantType::U64,
ConstantType::U32,
ConstantType::U16,
ConstantType::U8,
],
}
}
/// Return the set of all currently-available constant types
pub fn all_types() -> Vec<Self> {
vec![
ConstantType::U8,
ConstantType::U16,
ConstantType::U32,
ConstantType::U64,
ConstantType::I8,
ConstantType::I16,
ConstantType::I32,
ConstantType::I64,
]
}
/// Return the name of the given type, as a string
pub fn name(&self) -> String {
match self {
ConstantType::Void => "void".to_string(),
ConstantType::I8 => "i8".to_string(),
ConstantType::I16 => "i16".to_string(),
ConstantType::I32 => "i32".to_string(),
ConstantType::I64 => "i64".to_string(),
ConstantType::U8 => "u8".to_string(),
ConstantType::U16 => "u16".to_string(),
ConstantType::U32 => "u32".to_string(),
ConstantType::U64 => "u64".to_string(),
}
}
/// Return the set of all primitives that can return this
/// type, along with the argument types for those primitives.
///
/// A "None" value as an argument type means that the argument
/// type is unconstrained by the return type.
pub fn primitives_for(&self) -> Vec<(crate::ir::Primitive, Vec<Option<ConstantType>>)> {
use crate::ir::Primitive::*;
match self {
ConstantType::Void => vec![(Print, vec![None])],
ConstantType::I8 | ConstantType::I16 | ConstantType::I32 | ConstantType::I64 => vec![
(Plus, vec![Some(*self), Some(*self)]),
(Minus, vec![Some(*self), Some(*self)]),
(Times, vec![Some(*self), Some(*self)]),
(Divide, vec![Some(*self), Some(*self)]),
(Negate, vec![Some(*self)]),
],
ConstantType::U8 | ConstantType::U16 | ConstantType::U32 | ConstantType::U64 => vec![
(Plus, vec![Some(*self), Some(*self)]),
(Minus, vec![Some(*self), Some(*self)]),
(Times, vec![Some(*self), Some(*self)]),
(Divide, vec![Some(*self), Some(*self)]),
],
}
}
}
/// Error returned when an `i64` does not correspond to any
/// [`ConstantType`] discriminant.
#[derive(Debug, Error, PartialEq)]
pub enum InvalidConstantType {
/// Carries the integer that was handed to the conversion.
#[error("Unrecognized constant {0} for constant type")]
Value(i64),
}
impl TryFrom<i64> for ConstantType {
    type Error = InvalidConstantType;

    /// Recovers a [`ConstantType`] from its integer discriminant; any
    /// other integer is reported as [`InvalidConstantType::Value`].
    fn try_from(value: i64) -> Result<Self, Self::Error> {
        // Every variant of the enum; the lookup compares against each
        // variant's own discriminant.
        const KNOWN: [ConstantType; 9] = [
            ConstantType::U8,
            ConstantType::U16,
            ConstantType::U32,
            ConstantType::U64,
            ConstantType::I8,
            ConstantType::I16,
            ConstantType::I32,
            ConstantType::I64,
            ConstantType::Void,
        ];

        KNOWN
            .iter()
            .copied()
            .find(|candidate| *candidate as i64 == value)
            .ok_or(InvalidConstantType::Value(value))
    }
}
/// Lexer callback shared by all of the number regexes.
///
/// `base` is the radix implied by the literal's prefix (`None` when there
/// was no prefix, in which case the digits are read as decimal). The
/// function strips the two-character prefix when there is one, peels off
/// an optional type suffix, and parses what is left with the standard
/// library.
///
/// Returns the triple `(base, declared type, value)`, or `Err(())` when
/// the digits do not fit in a `u64`.
fn parse_number(
    base: Option<u8>,
    value: &Lexer<Token>,
) -> Result<(Option<u8>, Option<ConstantType>, u64), ()> {
    // No suffix here ends with another one, so at most one can match.
    const SUFFIXES: [(&str, ConstantType); 8] = [
        ("u8", ConstantType::U8),
        ("u16", ConstantType::U16),
        ("u32", ConstantType::U32),
        ("u64", ConstantType::U64),
        ("i8", ConstantType::I8),
        ("i16", ConstantType::I16),
        ("i32", ConstantType::I32),
        ("i64", ConstantType::I64),
    ];

    let text = value.slice();
    let (radix, body) = match base {
        // The regexes guarantee a two-character prefix such as `0x`.
        Some(radix) => (radix, &text[2..]),
        None => (10, text),
    };

    let (declared_type, digits) = SUFFIXES
        .iter()
        .find_map(|&(suffix, ty)| body.strip_suffix(suffix).map(|rest| (Some(ty), rest)))
        .unwrap_or((None, body));

    match u64::from_str_radix(digits, u32::from(radix)) {
        Ok(intval) => Ok((base, declared_type, intval)),
        Err(_) => Err(()),
    }
}
/// Returns the literal suffix for an optional declared type: the empty
/// string when there is none, otherwise the type's lowercase name.
fn display_optional_type(otype: &Option<ConstantType>) -> &'static str {
    match otype.as_ref() {
        None => "",
        Some(declared) => match declared {
            ConstantType::U8 => "u8",
            ConstantType::U16 => "u16",
            ConstantType::U32 => "u32",
            ConstantType::U64 => "u64",
            ConstantType::I8 => "i8",
            ConstantType::I16 => "i16",
            ConstantType::I32 => "i32",
            ConstantType::I64 => "i64",
            ConstantType::Void => "void",
        },
    }
}
/// Every spelling of twelve lexes to a `Number` with value 12, carrying
/// the base and type suffix it was written with; the trailing comment is
/// skipped.
#[test]
fn lex_numbers() {
    let mut lexer = Token::lexer("12 0b1100 0o14 0d12 0xc 12u8 0xci64// 9");
    let expected = [
        (None, None),
        (Some(2), None),
        (Some(8), None),
        (Some(10), None),
        (Some(16), None),
        (None, Some(ConstantType::U8)),
        (Some(16), Some(ConstantType::I64)),
    ];

    for (base, declared_type) in expected {
        assert_eq!(
            lexer.next(),
            Some(Ok(Token::Number((base, declared_type, 12))))
        );
    }
    assert_eq!(lexer.next(), None);
}
/// Variables and operators come through in order, while whitespace of
/// all kinds and the trailing line comment are skipped.
#[test]
fn lex_symbols() {
    let mut lexer = Token::lexer("x + \t y * \n z // rest");
    let expected = [
        Token::var("x"),
        Token::Operator('+'),
        Token::var("y"),
        Token::Operator('*'),
        Token::var("z"),
    ];

    for token in expected {
        assert_eq!(lexer.next(), Some(Ok(token)));
    }
    assert_eq!(lexer.next(), None);
}
/// Each token is reported with the source range it was read from, and
/// the comment glued to the end of the last token produces nothing.
#[test]
fn lexer_spans() {
    let mut lexer = Token::lexer("y = x + 1//foo").spanned();
    let expected = [
        (Token::var("y"), 0..1),
        (Token::Equals, 2..3),
        (Token::var("x"), 4..5),
        (Token::Operator('+'), 6..7),
        (Token::Number((None, None, 1)), 8..9),
    ];

    for (token, span) in expected {
        assert_eq!(lexer.next(), Some((Ok(token), span)));
    }
    assert_eq!(lexer.next(), None);
}
/// Spans stay correct across several lines of input, including typed
/// number literals and the `print` keyword.
///
/// Like the other lexer tests, this one also checks that nothing is left
/// once the expected tokens have been consumed, so that stray trailing
/// tokens cannot go unnoticed.
#[test]
fn further_spans() {
    let mut lexer = Token::lexer("x = 2i64 + 2i64;\ny = -x;\nprint y;").spanned();
    let expected = [
        // x = 2i64 + 2i64;
        (Token::var("x"), 0..1),
        (Token::Equals, 2..3),
        (Token::Number((None, Some(ConstantType::I64), 2)), 4..8),
        (Token::Operator('+'), 9..10),
        (Token::Number((None, Some(ConstantType::I64), 2)), 11..15),
        (Token::Semi, 15..16),
        // y = -x;
        (Token::var("y"), 17..18),
        (Token::Equals, 19..20),
        (Token::Operator('-'), 21..22),
        (Token::var("x"), 22..23),
        (Token::Semi, 23..24),
        // print y;
        (Token::Print, 25..30),
        (Token::var("y"), 31..32),
        (Token::Semi, 32..33),
    ];

    for (token, span) in expected {
        assert_eq!(lexer.next(), Some((Ok(token), span)));
    }
    // The input is 33 bytes long, so the lexer must be exhausted here.
    assert_eq!(lexer.next(), None);
}