📜 Add better documentation across the compiler. #3
@@ -19,7 +19,7 @@ cranelift-module = "0.94.0"
|
|||||||
cranelift-native = "0.94.0"
|
cranelift-native = "0.94.0"
|
||||||
cranelift-object = "0.94.0"
|
cranelift-object = "0.94.0"
|
||||||
internment = { version = "0.7.0", default-features = false, features = ["arc"] }
|
internment = { version = "0.7.0", default-features = false, features = ["arc"] }
|
||||||
lalrpop-util = "^0.19.7"
|
lalrpop-util = "^0.20.0"
|
||||||
lazy_static = "^1.4.0"
|
lazy_static = "^1.4.0"
|
||||||
logos = "^0.12.0"
|
logos = "^0.12.0"
|
||||||
pretty = { version = "^0.11.2", features = ["termcolor"] }
|
pretty = { version = "^0.11.2", features = ["termcolor"] }
|
||||||
@@ -30,4 +30,4 @@ tempfile = "^3.5.0"
|
|||||||
thiserror = "^1.0.30"
|
thiserror = "^1.0.30"
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
lalrpop = "^0.19.7"
|
lalrpop = "^0.20.0"
|
||||||
|
|||||||
@@ -8,7 +8,8 @@
|
|||||||
//!
|
//!
|
||||||
//! * Turning the string into a series of language-specific [`Token`]s.
|
//! * Turning the string into a series of language-specific [`Token`]s.
|
||||||
//! * Taking those tokens, and computing a basic syntax tree from them,
|
//! * Taking those tokens, and computing a basic syntax tree from them,
|
||||||
//! using our [`parser`].
|
//! using our parser ([`ProgramParser`] or [`StatementParser`], generated
|
||||||
|
//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
|
||||||
//! * Validating the tree we have parsed, using the [`validate`] module,
|
//! * Validating the tree we have parsed, using the [`validate`] module,
|
||||||
//! returning any warnings or errors we have found.
|
//! returning any warnings or errors we have found.
|
||||||
//! * Simplifying the tree we have parsed, using the [`simplify`] module,
|
//! * Simplifying the tree we have parsed, using the [`simplify`] module,
|
||||||
@@ -37,7 +38,7 @@ pub mod simplify;
|
|||||||
mod tokens;
|
mod tokens;
|
||||||
lalrpop_mod!(
|
lalrpop_mod!(
|
||||||
#[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)]
|
#[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)]
|
||||||
pub parser,
|
parser,
|
||||||
"/syntax/parser.rs"
|
"/syntax/parser.rs"
|
||||||
);
|
);
|
||||||
mod pretty;
|
mod pretty;
|
||||||
@@ -45,7 +46,7 @@ pub mod validate;
|
|||||||
|
|
||||||
pub use crate::syntax::ast::*;
|
pub use crate::syntax::ast::*;
|
||||||
pub use crate::syntax::location::Location;
|
pub use crate::syntax::location::Location;
|
||||||
use crate::syntax::parser::ProgramParser;
|
pub use crate::syntax::parser::{ProgramParser, StatementParser};
|
||||||
pub use crate::syntax::tokens::{LexerError, Token};
|
pub use crate::syntax::tokens::{LexerError, Token};
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
use ::pretty::{Arena, Pretty};
|
use ::pretty::{Arena, Pretty};
|
||||||
@@ -56,8 +57,6 @@ use proptest::{prop_assert, prop_assert_eq};
|
|||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
use self::parser::StatementParser;
|
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
pub enum ParserError {
|
pub enum ParserError {
|
||||||
#[error("Invalid token")]
|
#[error("Invalid token")]
|
||||||
@@ -82,7 +81,7 @@ impl ParserError {
|
|||||||
ParseError::InvalidToken { location } => {
|
ParseError::InvalidToken { location } => {
|
||||||
ParserError::InvalidToken(Location::new(file_idx, location))
|
ParserError::InvalidToken(Location::new(file_idx, location))
|
||||||
}
|
}
|
||||||
ParseError::UnrecognizedEOF { location, expected } => {
|
ParseError::UnrecognizedEof { location, expected } => {
|
||||||
ParserError::UnrecognizedEOF(Location::new(file_idx, location), expected)
|
ParserError::UnrecognizedEOF(Location::new(file_idx, location), expected)
|
||||||
}
|
}
|
||||||
ParseError::UnrecognizedToken {
|
ParseError::UnrecognizedToken {
|
||||||
|
|||||||
@@ -1,14 +1,32 @@
|
|||||||
|
//! The parser for NGR!
|
||||||
|
//!
|
||||||
|
//! This file contains the grammar for the NGR language; a grammar is a nice,
|
||||||
|
//! machine-readable way to describe how your language's syntax works. For
|
||||||
|
//! example, here we describe a program as a series of statements, statements
|
||||||
|
//! as either variable binding or print statements, etc. As the grammar gets
|
||||||
|
//! more complicated, using tools like [`lalrpop`] becomes even more important.
|
||||||
|
//! (Although, at some point, things can become so complicated that you might
|
||||||
|
//! eventually want to leave lalrpop behind.)
|
||||||
|
//!
|
||||||
use crate::syntax::{LexerError, Location};
|
use crate::syntax::{LexerError, Location};
|
||||||
use crate::syntax::ast::{Program,Statement,Expression,Value};
|
use crate::syntax::ast::{Program,Statement,Expression,Value};
|
||||||
use crate::syntax::tokens::Token;
|
use crate::syntax::tokens::Token;
|
||||||
use internment::ArcIntern;
|
use internment::ArcIntern;
|
||||||
|
|
||||||
|
// one cool thing about lalrpop: we can pass arguments. in this case, the
|
||||||
|
// file index of the file we're parsing. we combine this with the file offset
|
||||||
|
// that Logos gives us to make a [`crate::syntax::Location`].
|
||||||
grammar(file_idx: usize);
|
grammar(file_idx: usize);
|
||||||
|
|
||||||
|
// this is a slightly odd way to describe this, but: consider this section
|
||||||
|
// as describing the stuff that is external to the lalrpop grammar that it
|
||||||
|
// needs to know to do its job.
|
||||||
extern {
|
extern {
|
||||||
type Location = usize;
|
type Location = usize; // Logos, our lexer, implements locations as
|
||||||
|
// offsets from the start of the file.
|
||||||
type Error = LexerError;
|
type Error = LexerError;
|
||||||
|
|
||||||
|
// here we redeclare all of the tokens.
|
||||||
enum Token {
|
enum Token {
|
||||||
"=" => Token::Equals,
|
"=" => Token::Equals,
|
||||||
";" => Token::Semi,
|
";" => Token::Semi,
|
||||||
@@ -22,57 +40,123 @@ extern {
|
|||||||
"*" => Token::Operator('*'),
|
"*" => Token::Operator('*'),
|
||||||
"/" => Token::Operator('/'),
|
"/" => Token::Operator('/'),
|
||||||
|
|
||||||
|
// the previous items just match their tokens, and if you try
|
||||||
|
// to name and use "their value", you get their source location.
|
||||||
|
// For these, we want "their value" to be their actual contents,
|
||||||
|
// which is why we put their types in angle brackets.
|
||||||
"<num>" => Token::Number((<Option<u8>>,<i64>)),
|
"<num>" => Token::Number((<Option<u8>>,<i64>)),
|
||||||
"<var>" => Token::Variable(<ArcIntern<String>>),
|
"<var>" => Token::Variable(<ArcIntern<String>>),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub Program: Program = {
|
pub Program: Program = {
|
||||||
|
// a program is just a set of statements
|
||||||
<stmts:Statements> => Program {
|
<stmts:Statements> => Program {
|
||||||
statements: stmts
|
statements: stmts
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Statements: Vec<Statement> = {
|
Statements: Vec<Statement> = {
|
||||||
|
// a list of statements is either a set of statements followed by another
|
||||||
|
// statement (note, here, that you can name the result of a sub-parse
|
||||||
|
// using <name: subrule>) ...
|
||||||
<mut stmts:Statements> <stmt:Statement> => {
|
<mut stmts:Statements> <stmt:Statement> => {
|
||||||
stmts.push(stmt);
|
stmts.push(stmt);
|
||||||
stmts
|
stmts
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// ... or it's nothing. This may feel like an awkward way to define
|
||||||
|
// lists of things -- and it is a bit awkward -- but there are actual
|
||||||
|
// technical reasons that you want to (a) use recursion to define
|
||||||
|
// these, and (b) use *left* recursion, specifically. That's why, in
|
||||||
|
// this file, all of the recursive cases are to the left, like they
|
||||||
|
// are above.
|
||||||
|
//
|
||||||
|
// the details of why left recursion is better are actually pretty
|
||||||
|
// fiddly and in the weeds, and if you're interested you should look
|
||||||
|
// up LALR parsers versus LL parsers; both their differences and how
|
||||||
|
// they're constructed, as they're kind of neat.
|
||||||
|
//
|
||||||
|
// but if you're just writing grammars with lalrpop, then you should
|
||||||
|
// just remember that you should always use left recursion, and be
|
||||||
|
// done with it.
|
||||||
=> {
|
=> {
|
||||||
Vec::new()
|
Vec::new()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub Statement: Statement = {
|
pub Statement: Statement = {
|
||||||
|
// A statement can be a variable binding. Note, here, that we use this
|
||||||
|
// funny @L thing to get the source location before the variable, so that
|
||||||
|
// we can say that this statement spans across everything.
|
||||||
<l:@L> <v:"<var>"> "=" <e:Expression> ";" => Statement::Binding(Location::new(file_idx, l), v.to_string(), e),
|
<l:@L> <v:"<var>"> "=" <e:Expression> ";" => Statement::Binding(Location::new(file_idx, l), v.to_string(), e),
|
||||||
|
|
||||||
|
// Alternatively, a statement can just be a print statement.
|
||||||
"print" <l:@L> <v:"<var>"> ";" => Statement::Print(Location::new(file_idx, l), v.to_string()),
|
"print" <l:@L> <v:"<var>"> ";" => Statement::Print(Location::new(file_idx, l), v.to_string()),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Expressions! Expressions are a little fiddly, because we're going to
|
||||||
|
// use a little bit of a trick to make sure that we get operator precedence
|
||||||
|
// right. The trick works by creating a top-level `Expression` grammar entry
|
||||||
|
// that just points to the thing with the *weakest* precedence. In this case,
|
||||||
|
// we have addition, subtraction, multiplication, and division, so addition
|
||||||
|
// and subtraction have the weakest precedence.
|
||||||
|
//
|
||||||
|
// Then, as we go down the precedence tree, each item will recurse (left!)
|
||||||
|
// to other items at the same precedence level. The right hand operator, for
|
||||||
|
// binary operators (which is all of ours, at the moment) will then be one
|
||||||
|
// level stronger precedence. In addition, we'll let people just fall through
|
||||||
|
// to the next level; so if there isn't an addition or subtraction, we'll just
|
||||||
|
// fall through to the multiplication/division case.
|
||||||
|
//
|
||||||
|
// Finally, at the bottom, we'll have the core expressions (like constants,
|
||||||
|
// variables, etc.) as well as a parenthesized version of `Expression`, which
|
||||||
|
// gets us right up top again.
|
||||||
|
//
|
||||||
|
// Understanding why this works to solve all your operator precedence problems
|
||||||
|
// is a little hard to give an easy intuition for, but for myself it helped
|
||||||
|
// to run through a few examples. Consider thinking about how you want to
|
||||||
|
// parse something like "1 + 2 * 3", for example, versus "1 + 2 + 3" or
|
||||||
|
// "1 * 2 + 3", and hopefully that'll help.
|
||||||
Expression: Expression = {
|
Expression: Expression = {
|
||||||
AdditiveExpression,
|
AdditiveExpression,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// we group addition and subtraction under the heading "additive"
|
||||||
AdditiveExpression: Expression = {
|
AdditiveExpression: Expression = {
|
||||||
<e1:AdditiveExpression> <l:@L> "+" <e2:MultiplicativeExpression> => Expression::Primitive(Location::new(file_idx, l), "+".to_string(), vec![e1, e2]),
|
<e1:AdditiveExpression> <l:@L> "+" <e2:MultiplicativeExpression> => Expression::Primitive(Location::new(file_idx, l), "+".to_string(), vec![e1, e2]),
|
||||||
<e1:AdditiveExpression> <l:@L> "-" <e2:MultiplicativeExpression> => Expression::Primitive(Location::new(file_idx, l), "-".to_string(), vec![e1, e2]),
|
<e1:AdditiveExpression> <l:@L> "-" <e2:MultiplicativeExpression> => Expression::Primitive(Location::new(file_idx, l), "-".to_string(), vec![e1, e2]),
|
||||||
MultiplicativeExpression,
|
MultiplicativeExpression,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// similarly, we group multiplication and division under "multiplicative"
|
||||||
MultiplicativeExpression: Expression = {
|
MultiplicativeExpression: Expression = {
|
||||||
<e1:MultiplicativeExpression> <l:@L> "*" <e2:AtomicExpression> => Expression::Primitive(Location::new(file_idx, l), "*".to_string(), vec![e1, e2]),
|
<e1:MultiplicativeExpression> <l:@L> "*" <e2:AtomicExpression> => Expression::Primitive(Location::new(file_idx, l), "*".to_string(), vec![e1, e2]),
|
||||||
<e1:MultiplicativeExpression> <l:@L> "/" <e2:AtomicExpression> => Expression::Primitive(Location::new(file_idx, l), "/".to_string(), vec![e1, e2]),
|
<e1:MultiplicativeExpression> <l:@L> "/" <e2:AtomicExpression> => Expression::Primitive(Location::new(file_idx, l), "/".to_string(), vec![e1, e2]),
|
||||||
AtomicExpression,
|
AtomicExpression,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// finally, we describe our lowest-level expressions as "atomic", because
|
||||||
|
// they cannot be further divided into parts
|
||||||
AtomicExpression: Expression = {
|
AtomicExpression: Expression = {
|
||||||
|
// just a variable reference
|
||||||
<l:@L> <v:"<var>"> => Expression::Reference(Location::new(file_idx, l), v.to_string()),
|
<l:@L> <v:"<var>"> => Expression::Reference(Location::new(file_idx, l), v.to_string()),
|
||||||
|
// just a number
|
||||||
<l:@L> <n:"<num>"> => {
|
<l:@L> <n:"<num>"> => {
|
||||||
let val = Value::Number(n.0, n.1);
|
let val = Value::Number(n.0, n.1);
|
||||||
Expression::Value(Location::new(file_idx, l), val)
|
Expression::Value(Location::new(file_idx, l), val)
|
||||||
},
|
},
|
||||||
|
// a tricky case: also just a number, but using a negative sign. an
|
||||||
|
// alternative way to do this -- and we may do this eventually -- is
|
||||||
|
// to implement a unary negation expression. this has the odd effect
|
||||||
|
// that the user never actually writes down a negative number; they just
|
||||||
|
// write positive numbers which are immediately sent to a negation
|
||||||
|
// primitive!
|
||||||
<l:@L> "-" <n:"<num>"> => {
|
<l:@L> "-" <n:"<num>"> => {
|
||||||
let val = Value::Number(n.0, -n.1);
|
let val = Value::Number(n.0, -n.1);
|
||||||
Expression::Value(Location::new(file_idx, l), val)
|
Expression::Value(Location::new(file_idx, l), val)
|
||||||
},
|
},
|
||||||
|
// finally, let people parenthesize expressions and get back to a
|
||||||
|
// lower precedence
|
||||||
"(" <e:Expression> ")" => e,
|
"(" <e:Expression> ")" => e,
|
||||||
}
|
}
|
||||||
@@ -4,8 +4,30 @@ use std::fmt;
|
|||||||
use std::num::ParseIntError;
|
use std::num::ParseIntError;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// A single token of the input stream; used to help the parsing go down
|
||||||
|
/// more easily.
|
||||||
|
///
|
||||||
|
/// The key way to generate this structure is via the [`Logos`] trait.
|
||||||
|
/// See the [`logos`] documentation for more information; we use the
|
||||||
|
/// [`Token::lexer`] function internally.
|
||||||
|
///
|
||||||
|
/// The first step in the compilation process is turning the raw string
|
||||||
|
/// data (in UTF-8, which is its own joy) into a sequence of more sensible
|
||||||
|
/// tokens. Here, for example, we turn "x=5" into three tokens: a
|
||||||
|
/// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and
|
||||||
|
/// then a [`Token::Number`] for the "5". Later on, we'll worry about
|
||||||
|
/// making sense of those three tokens.
|
||||||
|
///
|
||||||
|
/// For now, our list of tokens is relatively straightforward. We'll
|
||||||
|
/// need/want to extend these later.
|
||||||
|
///
|
||||||
|
/// The [`std::fmt::Display`] implementation for [`Token`] should
|
||||||
|
/// round-trip; if you lex a string generated with the [`std::fmt::Display`]
|
||||||
|
/// trait, you should get back the exact same token.
|
||||||
#[derive(Logos, Clone, Debug, PartialEq, Eq)]
|
#[derive(Logos, Clone, Debug, PartialEq, Eq)]
|
||||||
pub enum Token {
|
pub enum Token {
|
||||||
|
// Our first set of tokens are simple characters that we're
|
||||||
|
// going to use to structure NGR programs.
|
||||||
#[token("=")]
|
#[token("=")]
|
||||||
Equals,
|
Equals,
|
||||||
|
|
||||||
@@ -18,12 +40,20 @@ pub enum Token {
|
|||||||
#[token(")")]
|
#[token(")")]
|
||||||
RightParen,
|
RightParen,
|
||||||
|
|
||||||
|
// Next we take care of any reserved words; I always like to put
|
||||||
|
// these before we start recognizing more complicated regular
|
||||||
|
// expressions. I don't think it matters, but it works for me.
|
||||||
#[token("print")]
|
#[token("print")]
|
||||||
Print,
|
Print,
|
||||||
|
|
||||||
|
// Next are the operators for NGR. We only have 4, now, but
|
||||||
|
// we might extend these later, or even make them user-definable!
|
||||||
#[regex(r"[+\-*/]", |v| v.slice().chars().next())]
|
#[regex(r"[+\-*/]", |v| v.slice().chars().next())]
|
||||||
Operator(char),
|
Operator(char),
|
||||||
|
|
||||||
|
/// Numbers capture both the value we read from the input,
|
||||||
|
/// converted to an `i64`, as well as the base the user used
|
||||||
|
/// to write the number, if they did so.
|
||||||
#[regex(r"0b[01]+", |v| parse_number(Some(2), v))]
|
#[regex(r"0b[01]+", |v| parse_number(Some(2), v))]
|
||||||
#[regex(r"0o[0-7]+", |v| parse_number(Some(8), v))]
|
#[regex(r"0o[0-7]+", |v| parse_number(Some(8), v))]
|
||||||
#[regex(r"0d[0-9]+", |v| parse_number(Some(10), v))]
|
#[regex(r"0d[0-9]+", |v| parse_number(Some(10), v))]
|
||||||
@@ -31,12 +61,23 @@ pub enum Token {
|
|||||||
#[regex(r"[0-9]+", |v| parse_number(None, v))]
|
#[regex(r"[0-9]+", |v| parse_number(None, v))]
|
||||||
Number((Option<u8>, i64)),
|
Number((Option<u8>, i64)),
|
||||||
|
|
||||||
|
// Variables; this is a very standard, simple set of characters
|
||||||
|
// for variables, but feel free to experiment with more complicated
|
||||||
|
// things. I chose to force variables to start with a lower case
|
||||||
|
// letter, too.
|
||||||
#[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))]
|
#[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))]
|
||||||
Variable(ArcIntern<String>),
|
Variable(ArcIntern<String>),
|
||||||
|
|
||||||
|
// the next token will be an error token
|
||||||
#[error]
|
#[error]
|
||||||
|
// we're actually just going to skip whitespace, though
|
||||||
#[regex(r"[ \t\r\n\f]+", logos::skip)]
|
#[regex(r"[ \t\r\n\f]+", logos::skip)]
|
||||||
|
// this is an extremely simple version of comments, just line
|
||||||
|
// comments. More complicated /* */ comments can be harder to
|
||||||
|
// implement, and didn't seem worth it at the time.
|
||||||
#[regex(r"//.*", logos::skip)]
|
#[regex(r"//.*", logos::skip)]
|
||||||
|
/// This token represents that some core error happened in lexing;
|
||||||
|
/// possibly that something didn't match anything at all.
|
||||||
Error,
|
Error,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -63,19 +104,28 @@ impl fmt::Display for Token {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A sudden and unexpected error in the lexer.
|
||||||
#[derive(Debug, Error, PartialEq, Eq)]
|
#[derive(Debug, Error, PartialEq, Eq)]
|
||||||
pub enum LexerError {
|
pub enum LexerError {
|
||||||
|
/// The `usize` here is the offset at which we ran into the problem, given
|
||||||
|
/// from the start of the file.
|
||||||
#[error("Failed lexing at {0}")]
|
#[error("Failed lexing at {0}")]
|
||||||
LexFailure(usize),
|
LexFailure(usize),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
impl Token {
|
impl Token {
|
||||||
|
/// Create a variable token with the given name. Very handy for
|
||||||
|
/// testing.
|
||||||
pub(crate) fn var(s: &str) -> Token {
|
pub(crate) fn var(s: &str) -> Token {
|
||||||
Token::Variable(ArcIntern::new(s.to_string()))
|
Token::Variable(ArcIntern::new(s.to_string()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Parse a number in the given base, return a pair of the base and the
|
||||||
|
/// parsed number. This is just a helper used for all of the number
|
||||||
|
/// regular expression cases, which kicks off to the obvious Rust
|
||||||
|
/// standard library function.
|
||||||
fn parse_number(
|
fn parse_number(
|
||||||
base: Option<u8>,
|
base: Option<u8>,
|
||||||
value: &Lexer<Token>,
|
value: &Lexer<Token>,
|
||||||
|
|||||||
Reference in New Issue
Block a user