From d0c7e02928be836ae3abc98d5e549a5579b2d93b Mon Sep 17 00:00:00 2001 From: Adam Wick Date: Sun, 7 May 2023 21:41:09 -0700 Subject: [PATCH] Commentary in the lexer and parser. --- Cargo.toml | 4 +- src/syntax.rs | 11 +++-- src/syntax/parser.lalrpop | 86 ++++++++++++++++++++++++++++++++++++++- src/syntax/tokens.rs | 50 +++++++++++++++++++++++ 4 files changed, 142 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 59b8033..0451d2e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ cranelift-module = "0.94.0" cranelift-native = "0.94.0" cranelift-object = "0.94.0" internment = { version = "0.7.0", default-features = false, features = ["arc"] } -lalrpop-util = "^0.19.7" +lalrpop-util = "^0.20.0" lazy_static = "^1.4.0" logos = "^0.12.0" pretty = { version = "^0.11.2", features = ["termcolor"] } @@ -30,4 +30,4 @@ tempfile = "^3.5.0" thiserror = "^1.0.30" [build-dependencies] -lalrpop = "^0.19.7" +lalrpop = "^0.20.0" diff --git a/src/syntax.rs b/src/syntax.rs index e6f0fbc..79ca709 100644 --- a/src/syntax.rs +++ b/src/syntax.rs @@ -8,7 +8,8 @@ //! //! * Turning the string into a series of language-specific [`Token`]s. //! * Taking those tokens, and computing a basic syntax tree from them, -//! using our [`parser`]. +//! using our parser ([`ProgramParser`] or [`StatementParser`], generated +//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)). //! * Validating the tree we have parsed, using the [`validate`] module, //! returning any warnings or errors we have found. //! 
* Simplifying the tree we have parsed, using the [`simplify`] module, @@ -37,7 +38,7 @@ pub mod simplify; mod tokens; lalrpop_mod!( #[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)] - pub parser, + parser, "/syntax/parser.rs" ); mod pretty; @@ -45,7 +46,7 @@ pub mod validate; pub use crate::syntax::ast::*; pub use crate::syntax::location::Location; -use crate::syntax::parser::ProgramParser; +pub use crate::syntax::parser::{ProgramParser, StatementParser}; pub use crate::syntax::tokens::{LexerError, Token}; #[cfg(test)] use ::pretty::{Arena, Pretty}; @@ -56,8 +57,6 @@ use proptest::{prop_assert, prop_assert_eq}; use std::str::FromStr; use thiserror::Error; -use self::parser::StatementParser; - #[derive(Debug, Error)] pub enum ParserError { #[error("Invalid token")] @@ -82,7 +81,7 @@ impl ParserError { ParseError::InvalidToken { location } => { ParserError::InvalidToken(Location::new(file_idx, location)) } - ParseError::UnrecognizedEOF { location, expected } => { + ParseError::UnrecognizedEof { location, expected } => { ParserError::UnrecognizedEOF(Location::new(file_idx, location), expected) } ParseError::UnrecognizedToken { diff --git a/src/syntax/parser.lalrpop b/src/syntax/parser.lalrpop index 85b21b3..3d8de29 100644 --- a/src/syntax/parser.lalrpop +++ b/src/syntax/parser.lalrpop @@ -1,14 +1,32 @@ +//! The parser for NGR! +//! +//! This file contains the grammar for the NGR language; a grammar is a nice, +//! machine-readable way to describe how your language's syntax works. For +//! example, here we describe a program as a series of statements, statements +//! as either variable binding or print statements, etc. As the grammar gets +//! more complicated, using tools like [`lalrpop`] becomes even more important. +//! (Although, at some point, things can become so complicated that you might +//! eventually want to leave lalrpop behind.) +//! 
use crate::syntax::{LexerError, Location}; use crate::syntax::ast::{Program,Statement,Expression,Value}; use crate::syntax::tokens::Token; use internment::ArcIntern; +// one cool thing about lalrpop: we can pass arguments. in this case, the +// file index of the file we're parsing. we combine this with the file offset +// that Logos gives us to make a [`crate::syntax::Location`]. grammar(file_idx: usize); +// this is a slightly odd way to describe this, but: consider this section +// as describing the stuff that is external to the lalrpop grammar that it +// needs to know to do its job. extern { - type Location = usize; + type Location = usize; // Logos, our lexer, implements locations as + // offsets from the start of the file. type Error = LexerError; + // here we redeclare all of the tokens. enum Token { "=" => Token::Equals, ";" => Token::Semi, @@ -22,57 +40,123 @@ extern { "*" => Token::Operator('*'), "/" => Token::Operator('/'), + // the previous items just match their tokens, and if you try + // to name and use "their value", you get their source location. + // For these, we want "their value" to be their actual contents, + // which is why we put their types in angle brackets. "" => Token::Number((>,)), "" => Token::Variable(>), } } pub Program: Program = { + // a program is just a set of statements => Program { statements: stmts } } Statements: Vec = { + // a statement is either a set of statements followed by another + // statement (note, here, that you can name the result of a sub-parse + // using ) ... => { stmts.push(stmt); stmts }, + + // ... or it's nothing. This may feel like an awkward way to define + // lists of things -- and it is a bit awkward -- but there are actual + // technical reasons that you want to (a) use recursion to define + // these, and (b) use *left* recursion, specifically. That's why, in + // this file, all of the recursive cases are to the left, like they + // are above. 
+ // + // the details of why left recursion is better is actually pretty + // fiddly and in the weeds, and if you're interested you should look + // up LALR parsers versus LL parsers; both their differences and how + // they're constructed, as they're kind of neat. + // + // but if you're just writing grammars with lalrpop, then you should + // just remember that you should always use left recursion, and be + // done with it. => { Vec::new() } } pub Statement: Statement = { + // A statement can be a variable binding. Note, here, that we use this + // funny @L thing to get the source location before the variable, so that + // we can say that this statement spans across everything. "> "=" ";" => Statement::Binding(Location::new(file_idx, l), v.to_string(), e), + + // Alternatively, a statement can just be a print statement. "print" "> ";" => Statement::Print(Location::new(file_idx, l), v.to_string()), } +// Expressions! Expressions are a little fiddly, because we're going to +// use a little bit of a trick to make sure that we get operator precedence +// right. The trick works by creating a top-level `Expression` grammar entry +// that just points to the thing with the *weakest* precedence. In this case, +// we have addition, subtraction, multiplication, and division, so addition +// and subtraction have the weakest precedence. +// +// Then, as we go down the precedence tree, each item will recurse (left!) +// to other items at the same precedence level. The right hand operator, for +// binary operators (which is all of ours, at the moment) will then be one +// level stronger precedence. In addition, we'll let people just fall through +// to the next level; so if there isn't an addition or subtraction, we'll just +// fall through to the multiplication/division case. +// +// Finally, at the bottom, we'll have the core expressions (like constants, +// variables, etc.) as well as a parenthesized version of `Expression`, which +// gets us right up top again. 
+// +// Understanding why this works to solve all your operator precedence problems +// is a little hard to give an easy intuition for, but for myself it helped +// to run through a few examples. Consider thinking about how you want to +// parse something like "1 + 2 * 3", for example, versus "1 + 2 + 3" or +// "1 * 2 + 3", and hopefully that'll help. Expression: Expression = { AdditiveExpression, } +// we group addition and subtraction under the heading "additive" AdditiveExpression: Expression = { "+" => Expression::Primitive(Location::new(file_idx, l), "+".to_string(), vec![e1, e2]), "-" => Expression::Primitive(Location::new(file_idx, l), "-".to_string(), vec![e1, e2]), MultiplicativeExpression, } +// similarly, we group multiplication and division under "multiplicative" MultiplicativeExpression: Expression = { "*" => Expression::Primitive(Location::new(file_idx, l), "*".to_string(), vec![e1, e2]), "/" => Expression::Primitive(Location::new(file_idx, l), "/".to_string(), vec![e1, e2]), AtomicExpression, } +// finally, we describe our lowest-level expressions as "atomic", because +// they cannot be further divided into parts AtomicExpression: Expression = { + // just a variable reference "> => Expression::Reference(Location::new(file_idx, l), v.to_string()), + // just a number "> => { let val = Value::Number(n.0, n.1); Expression::Value(Location::new(file_idx, l), val) }, + // a tricky case: also just a number, but using a negative sign. an + // alternative way to do this -- and we may do this eventually -- is + // to implement a unary negation expression. this has the odd effect + // that the user never actually writes down a negative number; they just + // write positive numbers which are immediately sent to a negation + // primitive! 
"-" "> => { let val = Value::Number(n.0, -n.1); Expression::Value(Location::new(file_idx, l), val) }, + // finally, let people parenthesize expressions and get back to a + // lower precedence "(" ")" => e, } \ No newline at end of file diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index 78d0c8a..161520a 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -4,8 +4,30 @@ use std::fmt; use std::num::ParseIntError; use thiserror::Error; +/// A single token of the input stream; used to help the parsing go down +/// more easily. +/// +/// The key way to generate this structure is via the [`Logos`] trait. +/// See the [`logos`] documentation for more information; we use the +/// [`Token::lexer`] function internally. +/// +/// The first step in the compilation process is turning the raw string +/// data (in UTF-8, which is its own joy) into a sequence of more sensible +/// tokens. Here, for example, we turn "x=5" into three tokens: a +/// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and +/// then a [`Token::Number`] for the "5". Later on, we'll worry about +/// making sense of those three tokens. +/// +/// For now, our list of tokens is relatively straightforward. We'll +/// need/want to extend these later. +/// +/// The [`std::fmt::Display`] implementation for [`Token`] should +/// round-trip; if you lex a string generated with the [`std::fmt::Display`] +/// trait, you should get back the exact same token. #[derive(Logos, Clone, Debug, PartialEq, Eq)] pub enum Token { + // Our first set of tokens are simple characters that we're + // going to use to structure NGR programs. #[token("=")] Equals, @@ -18,12 +40,20 @@ pub enum Token { #[token(")")] RightParen, + // Next we take care of any reserved words; I always like to put + // these before we start recognizing more complicated regular + // expressions. I don't think it matters, but it works for me. #[token("print")] Print, + // Next are the operators for NGR. 
We only have 4, now, but + // we might extend these later, or even make them user-definable! #[regex(r"[+\-*/]", |v| v.slice().chars().next())] Operator(char), + /// Numbers capture both the value we read from the input, + /// converted to an `i64`, as well as the base the user used + /// to write the number, if they did so. #[regex(r"0b[01]+", |v| parse_number(Some(2), v))] #[regex(r"0o[0-7]+", |v| parse_number(Some(8), v))] #[regex(r"0d[0-9]+", |v| parse_number(Some(10), v))] @@ -31,12 +61,23 @@ pub enum Token { #[regex(r"[0-9]+", |v| parse_number(None, v))] Number((Option, i64)), + // Variables; this is a very standard, simple set of characters + // for variables, but feel free to experiment with more complicated + // things. I chose to force variables to start with a lower case + // letter, too. #[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))] Variable(ArcIntern), + // the next token will be an error token #[error] + // we're actually just going to skip whitespace, though #[regex(r"[ \t\r\n\f]+", logos::skip)] + // this is an extremely simple version of comments, just line + // comments. More complicated /* */ comments can be harder to + // implement, and didn't seem worth it at the time. #[regex(r"//.*", logos::skip)] + /// This token represents that some core error happened in lexing; + /// possibly that something didn't match anything at all. Error, } @@ -63,19 +104,28 @@ impl fmt::Display for Token { } } +/// A sudden and unexpected error in the lexer. #[derive(Debug, Error, PartialEq, Eq)] pub enum LexerError { + /// The `usize` here is the offset that we ran into the problem, given + /// from the start of the file. #[error("Failed lexing at {0}")] LexFailure(usize), } #[cfg(test)] impl Token { + /// Create a variable token with the given name. Very handy for + /// testing. 
pub(crate) fn var(s: &str) -> Token { Token::Variable(ArcIntern::new(s.to_string())) } } +/// Parse a number in the given base, returning a pair of the base and the +/// parsed number. This is just a helper used for all of the number +/// regular expression cases, which kicks off to the obvious Rust +/// standard library function. fn parse_number( base: Option, value: &Lexer,