Files
ngr/src/syntax/tokens.rs
Adam Wick 1fbfd0c2d2 📜 Add better documentation across the compiler. (#3)
These changes pay particular attention to API endpoints, to try to
ensure that any rustdocs generated are detailed and sensible. A good
next step, eventually, might be to include doctest examples, as well.
For the moment, it's not clear that they would provide a lot of value,
though.

In addition, this does a couple refactors to simplify the code base in
ways that make things clearer or, at least, briefer.
2023-05-13 15:00:08 -05:00

174 lines
6.4 KiB
Rust

use internment::ArcIntern;
use logos::{Lexer, Logos};
use std::fmt;
use std::num::ParseIntError;
use thiserror::Error;
/// A single token of the input stream; used to help the parsing go down
/// more easily.
///
/// The key way to generate this structure is via the [`Logos`] trait.
/// See the [`logos`] documentation for more information; we use the
/// [`Token::lexer`] function internally.
///
/// The first step in the compilation process is turning the raw string
/// data (in UTF-8, which is its own joy) in to a sequence of more sensible
/// tokens. Here, for example, we turn "x=5" into three tokens: a
/// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and
/// then a [`Token::Number`] for the "5". Later on, we'll worry about
/// making sense of those three tokens.
///
/// For now, our list of tokens is relatively straightforward. We'll
/// need/want to extend these later.
///
/// The [`std::fmt::Display`] implementation for [`Token`] should
/// round-trip; if you lex a string generated with the [`std::fmt::Display`]
/// trait, you should get back the exact same token.
#[derive(Logos, Clone, Debug, PartialEq, Eq)]
pub enum Token {
// Our first set of tokens are simple characters that we're
// going to use to structure NGR programs.
#[token("=")]
Equals,
#[token(";")]
Semi,
#[token("(")]
LeftParen,
#[token(")")]
RightParen,
// Next we take of any reserved words; I always like to put
// these before we start recognizing more complicated regular
// expressions. I don't think it matters, but it works for me.
#[token("print")]
Print,
// Next are the operators for NGR. We only have 4, now, but
// we might extend these later, or even make them user-definable!
#[regex(r"[+\-*/]", |v| v.slice().chars().next())]
Operator(char),
/// Numbers capture both the value we read from the input,
/// converted to an `i64`, as well as the base the user used
/// to write the number, if they did so.
#[regex(r"0b[01]+", |v| parse_number(Some(2), v))]
#[regex(r"0o[0-7]+", |v| parse_number(Some(8), v))]
#[regex(r"0d[0-9]+", |v| parse_number(Some(10), v))]
#[regex(r"0x[0-9a-fA-F]+", |v| parse_number(Some(16), v))]
#[regex(r"[0-9]+", |v| parse_number(None, v))]
Number((Option<u8>, i64)),
// Variables; this is a very standard, simple set of characters
// for variables, but feel free to experiment with more complicated
// things. I chose to force variables to start with a lower case
// letter, too.
#[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))]
Variable(ArcIntern<String>),
// the next token will be an error token
#[error]
// we're actually just going to skip whitespace, though
#[regex(r"[ \t\r\n\f]+", logos::skip)]
// this is an extremely simple version of comments, just line
// comments. More complicated /* */ comments can be harder to
// implement, and didn't seem worth it at the time.
#[regex(r"//.*", logos::skip)]
/// This token represents that some core error happened in lexing;
/// possibly that something didn't match anything at all.
Error,
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Token::Equals => write!(f, "'='"),
Token::Semi => write!(f, "';'"),
Token::LeftParen => write!(f, "'('"),
Token::RightParen => write!(f, "')'"),
Token::Print => write!(f, "'print'"),
Token::Operator(c) => write!(f, "'{}'", c),
Token::Number((None, v)) => write!(f, "'{}'", v),
Token::Number((Some(2), v)) => write!(f, "'0b{:b}'", v),
Token::Number((Some(8), v)) => write!(f, "'0o{:o}'", v),
Token::Number((Some(10), v)) => write!(f, "'{}'", v),
Token::Number((Some(16), v)) => write!(f, "'0x{:x}'", v),
Token::Number((Some(b), v)) => {
write!(f, "Invalidly-based-number<base={},val={}>", b, v)
}
Token::Variable(s) => write!(f, "'{}'", s),
Token::Error => write!(f, "<error>"),
}
}
}
/// A sudden and unexpected error in the lexer.
#[derive(Debug, Error, PartialEq, Eq)]
pub enum LexerError {
/// The `usize` here is the offset that we ran into the problem, given
/// from the start of the file.
#[error("Failed lexing at {0}")]
LexFailure(usize),
}
#[cfg(test)]
impl Token {
/// Create a variable token with the given name. Very handy for
/// testing.
pub(crate) fn var(s: &str) -> Token {
Token::Variable(ArcIntern::new(s.to_string()))
}
}
/// Parse a number in the given base, return a pair of the base and the
/// parsed number. This is just a helper used for all of the number
/// regular expression cases, which kicks off to the obvious Rust
/// standard library function.
fn parse_number(
base: Option<u8>,
value: &Lexer<Token>,
) -> Result<(Option<u8>, i64), ParseIntError> {
let (radix, strval) = match base {
None => (10, value.slice()),
Some(radix) => (radix, &value.slice()[2..]),
};
let intval = i64::from_str_radix(strval, radix as u32)?;
Ok((base, intval))
}
#[test]
fn lex_numbers() {
let mut lex0 = Token::lexer("12 0b1100 0o14 0d12 0xc // 9");
assert_eq!(lex0.next(), Some(Token::Number((None, 12))));
assert_eq!(lex0.next(), Some(Token::Number((Some(2), 12))));
assert_eq!(lex0.next(), Some(Token::Number((Some(8), 12))));
assert_eq!(lex0.next(), Some(Token::Number((Some(10), 12))));
assert_eq!(lex0.next(), Some(Token::Number((Some(16), 12))));
assert_eq!(lex0.next(), None);
}
#[test]
fn lex_symbols() {
let mut lex0 = Token::lexer("x + \t y * \n z // rest");
assert_eq!(lex0.next(), Some(Token::var("x")));
assert_eq!(lex0.next(), Some(Token::Operator('+')));
assert_eq!(lex0.next(), Some(Token::var("y")));
assert_eq!(lex0.next(), Some(Token::Operator('*')));
assert_eq!(lex0.next(), Some(Token::var("z")));
assert_eq!(lex0.next(), None);
}
#[test]
fn lexer_spans() {
let mut lex0 = Token::lexer("y = x + 1//foo").spanned();
assert_eq!(lex0.next(), Some((Token::var("y"), 0..1)));
assert_eq!(lex0.next(), Some((Token::Equals, 2..3)));
assert_eq!(lex0.next(), Some((Token::var("x"), 4..5)));
assert_eq!(lex0.next(), Some((Token::Operator('+'), 6..7)));
assert_eq!(lex0.next(), Some((Token::Number((None, 1)), 8..9)));
assert_eq!(lex0.next(), None);
}