ngr/src/syntax/tokens.rs

use internment::ArcIntern;
use logos::{Lexer, Logos};
use std::fmt;
use std::num::ParseIntError;
use thiserror::Error;

/// A single token of the input stream; used to help the parsing go down
/// more easily.
///
/// The key way to generate this structure is via the [`Logos`] trait.
/// See the [`logos`] documentation for more information; we use the
/// [`Token::lexer`] function internally.
///
/// The first step in the compilation process is turning the raw string
/// data (in UTF-8, which is its own joy) into a sequence of more sensible
/// tokens. Here, for example, we turn "x=5" into three tokens: a
/// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and
/// then a [`Token::Number`] for the "5". Later on, we'll worry about
/// making sense of those three tokens.
///
/// For now, our list of tokens is relatively straightforward. We'll
/// need/want to extend these later.
///
/// The [`std::fmt::Display`] implementation for [`Token`] should
/// round-trip; if you lex a string generated with the [`std::fmt::Display`]
/// trait, you should get back the exact same token.
///
/// NOTE(review): the `Display` implementation in this file wraps token
/// text in single quotes (e.g. `'='`), so the round-trip presumably
/// applies to the text between the quotes -- confirm the intended contract.
#[derive(Logos, Clone, Debug, PartialEq, Eq)]
pub enum Token {
    // Our first set of tokens are simple, fixed pieces of punctuation
    // that we're going to use to structure NGR programs.
    #[token("=")]
    Equals,

    #[token(";")]
    Semi,

    #[token("(")]
    LeftParen,

    #[token(")")]
    RightParen,

    #[token("<")]
    LessThan,

    #[token(">")]
    GreaterThan,

    // Next we take care of any reserved words; I always like to put
    // these before we start recognizing more complicated regular
    // expressions. I don't think it matters, but it works for me.
    #[token("print")]
    Print,

    // Next are the operators for NGR. We only have 4, now (`+`, `-`,
    // `*` and `/`), but we might extend these later, or even make them
    // user-definable! The callback stores the single matched character.
    #[regex(r"[+\-*/]", |v| v.slice().chars().next())]
    Operator(char),

    /// Numbers capture three things, in this order:
    ///
    /// 1. the base the user wrote the number in, if they gave an explicit
    ///    prefix (`0b` = 2, `0o` = 8, `0d` = 10, `0x` = 16), or `None`
    ///    for an unprefixed literal;
    /// 2. the type the user specified with a suffix (`u8`, `i64`, ...),
    ///    if they did; and
    /// 3. the value we read from the input, converted to a `u64`.
    ///
    /// All five patterns hand the matched text to `parse_number`.
    #[regex(r"0b[01]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(2), v))]
    #[regex(r"0o[0-7]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(8), v))]
    #[regex(r"0d[0-9]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(10), v))]
    #[regex(r"0x[0-9a-fA-F]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(Some(16), v))]
    #[regex(r"[0-9]+(u8|i8|u16|i16|u32|i32|u64|i64)?", |v| parse_number(None, v))]
    Number((Option<u8>, Option<ConstantType>, u64)),

    // Variables; this is a very standard, simple set of characters
    // for variables, but feel free to experiment with more complicated
    // things. I chose to force variables to start with a lower case
    // letter, too. The name is stored as an interned string.
    #[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))]
    Variable(ArcIntern<String>),

    // The attributes below all attach to the final `Error` variant.
    // `#[error]` marks it as the token logos uses for lexing errors.
    #[error]
    // We're actually just going to skip whitespace, though, rather than
    // produce a token for it.
    #[regex(r"[ \t\r\n\f]+", logos::skip)]
    // This is an extremely simple version of comments, just `//` line
    // comments, which are skipped as well. More complicated /* */
    // comments can be harder to implement, and didn't seem worth it at
    // the time.
    #[regex(r"//.*", logos::skip)]
    /// This token represents that some core error happened in lexing;
    /// possibly that something didn't match anything at all.
    Error,
}

impl fmt::Display for Token {
    /// Render the token as its source text wrapped in single quotes
    /// (for example `'='` or `'0x1fu8'`).
    ///
    /// Numbers are printed in the base they were written in, with the
    /// matching prefix, so that the text between the quotes lexes back to
    /// the same token: an explicit `0d` prefix is preserved and is *not*
    /// collapsed into an unprefixed literal, because those two spellings
    /// produce different `Token::Number` values.
    ///
    /// Two cases have no source text and use an angle-bracket description
    /// instead: the error token, and a number whose recorded base is not
    /// one of 2, 8, 10 or 16.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Equals => write!(f, "'='"),
            Token::Semi => write!(f, "';'"),
            Token::LeftParen => write!(f, "'('"),
            Token::RightParen => write!(f, "')'"),
            // Quoted like every other fixed token.
            Token::LessThan => write!(f, "'<'"),
            Token::GreaterThan => write!(f, "'>'"),
            Token::Print => write!(f, "'print'"),
            Token::Operator(c) => write!(f, "'{}'", c),
            Token::Number((None, otype, v)) => write!(f, "'{}{}'", v, display_optional_type(otype)),
            Token::Number((Some(2), otype, v)) => {
                write!(f, "'0b{:b}{}'", v, display_optional_type(otype))
            }
            Token::Number((Some(8), otype, v)) => {
                write!(f, "'0o{:o}{}'", v, display_optional_type(otype))
            }
            Token::Number((Some(10), otype, v)) => {
                // Keep the `0d` prefix: without it the text would lex as
                // an unprefixed number (base `None`), not base `Some(10)`.
                write!(f, "'0d{}{}'", v, display_optional_type(otype))
            }
            Token::Number((Some(16), otype, v)) => {
                write!(f, "'0x{:x}{}'", v, display_optional_type(otype))
            }
            // Any other base has no literal syntax; describe it instead.
            Token::Number((Some(b), opt_type, v)) => {
                write!(
                    f,
                    "Invalidly-based-number<base={},val={},opt_type={:?}>",
                    b, v, opt_type
                )
            }
            Token::Variable(s) => write!(f, "'{}'", s),
            Token::Error => write!(f, "<error>"),
        }
    }
}

/// A sudden and unexpected error in the lexer.
///
/// The `Display` text (via `thiserror`) is `Failed lexing at <offset>`.
#[derive(Debug, Error, PartialEq, Eq)]
pub enum LexerError {
    /// The `usize` here is the offset at which we ran into the problem,
    /// counted from the start of the file.
    // NOTE(review): presumably a byte offset, as lexer spans usually are --
    // confirm against the code that constructs this error.
    #[error("Failed lexing at {0}")]
    LexFailure(usize),
}

#[cfg(test)]
impl Token {
    /// Test-only shorthand: build a [`Token::Variable`] whose name is `s`,
    /// so test expectations do not have to spell out the interning.
    pub(crate) fn var(s: &str) -> Token {
        let name = String::from(s);
        Token::Variable(ArcIntern::new(name))
    }
}

/// The numeric type a constant can be explicitly tagged with, i.e. the
/// `u8` / `i16` / ... suffix the lexer accepts on a number literal.
///
/// Every variant has an explicit `i64` discriminant (`#[repr(i64)]`), and
/// the `TryFrom<i64>` implementation in this file converts those numbers
/// back into variants. Unsigned types occupy 10-13 and signed types 20-23.
// NOTE(review): presumably the numeric encoding exists so a type can be
// passed around as a plain integer elsewhere in the project -- confirm.
#[repr(i64)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ConstantType {
    /// Unsigned 8-bit integer.
    U8 = 10,
    /// Unsigned 16-bit integer.
    U16 = 11,
    /// Unsigned 32-bit integer.
    U32 = 12,
    /// Unsigned 64-bit integer.
    U64 = 13,
    /// Signed 8-bit integer.
    I8 = 20,
    /// Signed 16-bit integer.
    I16 = 21,
    /// Signed 32-bit integer.
    I32 = 22,
    /// Signed 64-bit integer.
    I64 = 23,
}

impl From<ConstantType> for cranelift_codegen::ir::Type {
    /// Pick the Cranelift integer type with the same bit width as `value`.
    ///
    /// The signed and unsigned variants of one width map to the same
    /// Cranelift type, so signedness is not carried over by this
    /// conversion.
    fn from(value: ConstantType) -> Self {
        use cranelift_codegen::ir::types;

        match value {
            ConstantType::U8 | ConstantType::I8 => types::I8,
            ConstantType::U16 | ConstantType::I16 => types::I16,
            ConstantType::U32 | ConstantType::I32 => types::I32,
            ConstantType::U64 | ConstantType::I64 => types::I64,
        }
    }
}

impl ConstantType {
    /// Reports whether this is one of the signed integer types
    /// (`I8`, `I16`, `I32`, `I64`); every unsigned type yields `false`.
    pub fn is_signed(&self) -> bool {
        // Spelled out exhaustively so a new variant forces a decision here.
        match self {
            ConstantType::I8
            | ConstantType::I16
            | ConstantType::I32
            | ConstantType::I64 => true,
            ConstantType::U8
            | ConstantType::U16
            | ConstantType::U32
            | ConstantType::U64 => false,
        }
    }
}

/// The error produced when an `i64` cannot be converted into a
/// [`ConstantType`] by the `TryFrom<i64>` implementation in this file.
#[derive(Debug, Error, PartialEq)]
pub enum InvalidConstantType {
    /// The given number is not the discriminant of any [`ConstantType`]
    /// variant; the offending number is carried along for reporting.
    #[error("Unrecognized constant {0} for constant type")]
    Value(i64),
}

impl TryFrom<i64> for ConstantType {
    type Error = InvalidConstantType;

    /// Recover a [`ConstantType`] from its numeric discriminant.
    ///
    /// # Errors
    ///
    /// Returns [`InvalidConstantType::Value`] carrying `value` when no
    /// variant has that discriminant.
    fn try_from(value: i64) -> Result<Self, Self::Error> {
        // Every variant, so the lookup reuses the discriminants declared on
        // the enum instead of repeating the numbers here.
        const KNOWN: [ConstantType; 8] = [
            ConstantType::U8,
            ConstantType::U16,
            ConstantType::U32,
            ConstantType::U64,
            ConstantType::I8,
            ConstantType::I16,
            ConstantType::I32,
            ConstantType::I64,
        ];

        KNOWN
            .iter()
            .copied()
            .find(|candidate| *candidate as i64 == value)
            .ok_or(InvalidConstantType::Value(value))
    }
}

/// Turn the text of a number literal into the payload of a
/// [`Token::Number`]: the base it was written in, the type suffix (if
/// any), and its value.
///
/// `base` is `Some(radix)` when the literal starts with a two-character
/// base prefix (`0b`, `0o`, `0d`, `0x`), which is skipped before parsing,
/// and `None` for a plain literal, which is read as decimal. `base` is
/// returned unchanged as the first element of the result. A trailing
/// `u8`/`u16`/`u32`/`u64`/`i8`/`i16`/`i32`/`i64` is removed and reported
/// as the second element.
///
/// # Errors
///
/// Returns the [`ParseIntError`] from [`u64::from_str_radix`] if the
/// remaining digits cannot be read as a `u64` in the chosen radix (for
/// instance, when the value is too large).
fn parse_number(
    base: Option<u8>,
    value: &Lexer<Token>,
) -> Result<(Option<u8>, Option<ConstantType>, u64), ParseIntError> {
    // No entry is a suffix of another, so at most one of these can match.
    const TYPE_SUFFIXES: [(&str, ConstantType); 8] = [
        ("u8", ConstantType::U8),
        ("u16", ConstantType::U16),
        ("u32", ConstantType::U32),
        ("u64", ConstantType::U64),
        ("i8", ConstantType::I8),
        ("i16", ConstantType::I16),
        ("i32", ConstantType::I32),
        ("i64", ConstantType::I64),
    ];

    let lexeme = value.slice();
    let (radix, body) = match base {
        // An explicit base means the text begins with a two-byte prefix.
        Some(explicit) => (u32::from(explicit), &lexeme[2..]),
        None => (10, lexeme),
    };

    let (declared_type, digits) = TYPE_SUFFIXES
        .iter()
        .find_map(|&(suffix, kind)| body.strip_suffix(suffix).map(|rest| (Some(kind), rest)))
        .unwrap_or((None, body));

    u64::from_str_radix(digits, radix).map(|intval| (base, declared_type, intval))
}

/// Give the literal suffix for an optional constant type: the lower-case
/// type name (`"u8"`, `"i64"`, ...) when a type is present, and the empty
/// string when it is not.
fn display_optional_type(otype: &Option<ConstantType>) -> &'static str {
    otype.as_ref().map_or("", |kind| match *kind {
        ConstantType::U8 => "u8",
        ConstantType::U16 => "u16",
        ConstantType::U32 => "u32",
        ConstantType::U64 => "u64",
        ConstantType::I8 => "i8",
        ConstantType::I16 => "i16",
        ConstantType::I32 => "i32",
        ConstantType::I64 => "i64",
    })
}

/// Number literals in every supported base, with and without a type
/// suffix, lex to the expected `Token::Number` payloads, and a trailing
/// line comment produces no token.
#[test]
fn lex_numbers() {
    let expected = vec![
        Token::Number((None, None, 12)),
        Token::Number((Some(2), None, 12)),
        Token::Number((Some(8), None, 12)),
        Token::Number((Some(10), None, 12)),
        Token::Number((Some(16), None, 12)),
        Token::Number((None, Some(ConstantType::U8), 12)),
        Token::Number((Some(16), Some(ConstantType::I64), 12)),
    ];

    let mut tokens = Token::lexer("12 0b1100 0o14 0d12 0xc 12u8 0xci64// 9");
    for want in expected {
        assert_eq!(tokens.next(), Some(want));
    }
    // Everything after `//` is a comment, so the stream ends here.
    assert_eq!(tokens.next(), None);
}

/// Variables and operators are recognised while spaces, tabs, newlines
/// and a trailing line comment are skipped.
#[test]
fn lex_symbols() {
    let expected = vec![
        Token::var("x"),
        Token::Operator('+'),
        Token::var("y"),
        Token::Operator('*'),
        Token::var("z"),
    ];

    let mut tokens = Token::lexer("x + \t y * \n z // rest");
    for want in expected {
        assert_eq!(tokens.next(), Some(want));
    }
    assert_eq!(tokens.next(), None);
}

/// Each token from the spanned lexer is paired with the range of the
/// input it was read from.
#[test]
fn lexer_spans() {
    let expected: Vec<(Token, std::ops::Range<usize>)> = vec![
        (Token::var("y"), 0..1),
        (Token::Equals, 2..3),
        (Token::var("x"), 4..5),
        (Token::Operator('+'), 6..7),
        (Token::Number((None, None, 1)), 8..9),
    ];

    let mut spanned = Token::lexer("y = x + 1//foo").spanned();
    for want in expected {
        assert_eq!(spanned.next(), Some(want));
    }
    assert_eq!(spanned.next(), None);
}