2023-05-13 12:34:48 -07:00
4 changed files with 142 additions and 9 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,7 +19,7 @@ cranelift-module = "0.94.0"
 cranelift-native = "0.94.0"
 cranelift-object = "0.94.0"
 internment = { version = "0.7.0", default-features = false, features = ["arc"] }
-lalrpop-util = "^0.19.7"
+lalrpop-util = "^0.20.0"
 lazy_static = "^1.4.0"
 logos = "^0.12.0"
 pretty = { version = "^0.11.2", features = ["termcolor"] }
@@ -30,4 +30,4 @@ tempfile = "^3.5.0"
 thiserror = "^1.0.30"
 [build-dependencies]
-lalrpop = "^0.19.7"
+lalrpop = "^0.20.0"
--- a/src/syntax.rs
+++ b/src/syntax.rs
@@ -8,7 +8,8 @@
 //! 
 //!   * Turning the string into a series of language-specific [`Token`]s.
 //!   * Taking those tokens, and computing a basic syntax tree from them,
-//!     using our [`parser`].
+//!     using our parser ([`ProgramParser`] or [`StatementParser`], generated
 //!     by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
 //!   * Validating the tree we have parsed, using the [`validate`] module,
 //!     returning any warnings or errors we have found.
 //!   * Simplifying the tree we have parsed, using the [`simplify`] module,
@@ -37,7 +38,7 @@ pub mod simplify;
 mod tokens;
 lalrpop_mod!(
    #[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)]
-    pub parser,
+    parser,
    "/syntax/parser.rs"
 );
 mod pretty;
@@ -45,7 +46,7 @@ pub mod validate;
 pub use crate::syntax::ast::*;
 pub use crate::syntax::location::Location;
-use crate::syntax::parser::ProgramParser;
+pub use crate::syntax::parser::{ProgramParser, StatementParser};
 pub use crate::syntax::tokens::{LexerError, Token};
 #[cfg(test)]
 use ::pretty::{Arena, Pretty};
@@ -56,8 +57,6 @@ use proptest::{prop_assert, prop_assert_eq};
 use std::str::FromStr;
 use thiserror::Error;
 use self::parser::StatementParser;
 #[derive(Debug, Error)]
 pub enum ParserError {
    #[error("Invalid token")]
@@ -82,7 +81,7 @@ impl ParserError {
            ParseError::InvalidToken { location } => {
                ParserError::InvalidToken(Location::new(file_idx, location))
            }
-            ParseError::UnrecognizedEOF { location, expected } => {
+            ParseError::UnrecognizedEof { location, expected } => {
                ParserError::UnrecognizedEOF(Location::new(file_idx, location), expected)
            }
            ParseError::UnrecognizedToken {
--- a/src/syntax/parser.lalrpop
+++ b/src/syntax/parser.lalrpop
@@ -1,14 +1,32 @@
 //! The parser for NGR!
 //!
 //! This file contains the grammar for the NGR language; a grammar is a nice,
 //! machine-readable way to describe how your language's syntax works. For
 //! example, here we describe a program as a series of statements, statements
 //! as either variable binding or print statements, etc. As the grammar gets
 //! more complicated, using tools like [`lalrpop`] becomes even more important.
 //! (Although, at some point, things can become so complicated that you might
 //! eventually want to leave lalrpop behind.)
 //!
 use crate::syntax::{LexerError, Location};
 use crate::syntax::ast::{Program,Statement,Expression,Value};
 use crate::syntax::tokens::Token;
 use internment::ArcIntern;
 // one cool thing about lalrpop: we can pass arguments. in this case, the
 // file index of the file we're parsing. we combine this with the file offset
 // that Logos gives us to make a [`crate::syntax::Location`].
 grammar(file_idx: usize);
 // this is a slightly odd way to describe this, but: consider this section
 // as describing the stuff that is external to the lalrpop grammar that it
 // needs to know to do its job.
 extern {
-    type Location = usize;
+    type Location = usize; // Logos, our lexer, implements locations as
                           // offsets from the start of the file.
    type Error = LexerError;
    // here we redeclare all of the tokens.
    enum Token {
        "=" => Token::Equals,
        ";" => Token::Semi,
@@ -22,57 +40,123 @@ extern {
        "*" => Token::Operator('*'),
        "/" => Token::Operator('/'),
        // the previous items just match their tokens, and if you try
        // to name and use "their value", you get their source location.
        // For these, we want "their value" to be their actual contents,
        // which is why we put their types in angle brackets.
        "<num>" => Token::Number((<Option<u8>>,<i64>)),
        "<var>" => Token::Variable(<ArcIntern<String>>),
    }
 }
 pub Program: Program = {
    // a program is just a set of statements
    <stmts:Statements> => Program {
        statements: stmts
    }
 }
 Statements: Vec<Statement> = {
    // a list of statements is either a list of statements followed by another
    // statement (note, here, that you can name the result of a sub-parse
    // using <name: subrule>) ...
    <mut stmts:Statements> <stmt:Statement> => {
        stmts.push(stmt);
        stmts
    },
    // ... or it's nothing. This may feel like an awkward way to define
    // lists of things -- and it is a bit awkward -- but there are actual
    // technical reasons that you want to (a) use recursion to define
    // these, and (b) use *left* recursion, specifically. That's why, in
    // this file, all of the recursive cases are to the left, like they
    // are above.
    //
    // the details of why left recursion is better are actually pretty
    // fiddly and in the weeds, and if you're interested you should look
    // up LALR parsers versus LL parsers; both their differences and how
    // they're constructed, as they're kind of neat.
    //
    // but if you're just writing grammars with lalrpop, then you should
    // just remember that you should always use left recursion, and be
    // done with it. 
    => {
        Vec::new()
    }
 }
 pub Statement: Statement = {
    // A statement can be a variable binding. Note, here, that we use this
    // funny @L thing to get the source location before the variable, so that
    // we can say that this statement spans across everything.
    <l:@L> <v:"<var>"> "=" <e:Expression> ";" => Statement::Binding(Location::new(file_idx, l), v.to_string(), e),
    // Alternatively, a statement can just be a print statement.
    "print" <l:@L> <v:"<var>"> ";" => Statement::Print(Location::new(file_idx, l), v.to_string()),
 }
 // Expressions! Expressions are a little fiddly, because we're going to
 // use a little bit of a trick to make sure that we get operator precedence
 // right. The trick works by creating a top-level `Expression` grammar entry
 // that just points to the thing with the *weakest* precedence. In this case,
 // we have addition, subtraction, multiplication, and division, so addition
 // and subtraction have the weakest precedence.
 //
 // Then, as we go down the precedence tree, each item will recurse (left!)
 // to other items at the same precedence level. The right hand operator, for
 // binary operators (which is all of ours, at the moment) will then be one
 // level stronger precedence. In addition, we'll let people just fall through
 // to the next level; so if there isn't an addition or subtraction, we'll just
 // fall through to the multiplication/division case.
 //
 // Finally, at the bottom, we'll have the core expressions (like constants,
 // variables, etc.) as well as a parenthesized version of `Expression`, which
 // gets us right up top again.
 //
 // Understanding why this works to solve all your operator precedence problems
 // is a little hard to give an easy intuition for, but for myself it helped
 // to run through a few examples. Consider thinking about how you want to
 // parse something like "1 + 2 * 3", for example, versus "1 + 2 + 3" or
 // "1 * 2 + 3", and hopefully that'll help.
 Expression: Expression = {
    AdditiveExpression,
 }
 // we group addition and subtraction under the heading "additive"
 AdditiveExpression: Expression = {
    <e1:AdditiveExpression> <l:@L> "+" <e2:MultiplicativeExpression> => Expression::Primitive(Location::new(file_idx, l), "+".to_string(), vec![e1, e2]),
    <e1:AdditiveExpression> <l:@L> "-" <e2:MultiplicativeExpression> => Expression::Primitive(Location::new(file_idx, l), "-".to_string(), vec![e1, e2]),
    MultiplicativeExpression,
 }
 // similarly, we group multiplication and division under "multiplicative"
 MultiplicativeExpression: Expression = {
    <e1:MultiplicativeExpression> <l:@L> "*" <e2:AtomicExpression> => Expression::Primitive(Location::new(file_idx, l), "*".to_string(), vec![e1, e2]),
    <e1:MultiplicativeExpression> <l:@L> "/" <e2:AtomicExpression> => Expression::Primitive(Location::new(file_idx, l), "/".to_string(), vec![e1, e2]),
    AtomicExpression,
 }
 // finally, we describe our lowest-level expressions as "atomic", because
 // they cannot be further divided into parts
 AtomicExpression: Expression = {
    // just a variable reference
    <l:@L> <v:"<var>"> => Expression::Reference(Location::new(file_idx, l), v.to_string()),
    // just a number
    <l:@L> <n:"<num>"> => {
        let val = Value::Number(n.0, n.1);
        Expression::Value(Location::new(file_idx, l), val)
    },
    // a tricky case: also just a number, but using a negative sign. an
    // alternative way to do this -- and we may do this eventually -- is
    // to implement a unary negation expression. this has the odd effect
    // that the user never actually writes down a negative number; they just
    // write positive numbers which are immediately sent to a negation
    // primitive!
    <l:@L> "-" <n:"<num>"> => {
        let val = Value::Number(n.0, -n.1);
        Expression::Value(Location::new(file_idx, l), val)
    },
    // finally, let people parenthesize expressions and get back to a
    // lower precedence
    "(" <e:Expression> ")" => e,
 }
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -4,8 +4,30 @@ use std::fmt;
 use std::num::ParseIntError;
 use thiserror::Error;
 /// A single token of the input stream; used to help the parsing go down
 /// more easily.
 /// 
 /// The key way to generate this structure is via the [`Logos`] trait.
 /// See the [`logos`] documentation for more information; we use the
 /// [`Token::lexer`] function internally.
 /// 
 /// The first step in the compilation process is turning the raw string
 /// data (in UTF-8, which is its own joy) into a sequence of more sensible
 /// tokens. Here, for example, we turn "x=5" into three tokens: a
 /// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and
 /// then a [`Token::Number`] for the "5". Later on, we'll worry about
 /// making sense of those three tokens.
 /// 
 /// For now, our list of tokens is relatively straightforward. We'll
 /// need/want to extend these later.
 /// 
 /// The [`std::fmt::Display`] implementation for [`Token`] should
 /// round-trip; if you lex a string generated with the [`std::fmt::Display`]
 /// trait, you should get back the exact same token.
 #[derive(Logos, Clone, Debug, PartialEq, Eq)]
 pub enum Token {
    // Our first set of tokens are simple characters that we're
    // going to use to structure NGR programs. 
    #[token("=")]
    Equals,
@@ -18,12 +40,20 @@ pub enum Token {
    #[token(")")]
    RightParen,
    // Next we take care of any reserved words; I always like to put
    // these before we start recognizing more complicated regular
    // expressions. I don't think it matters, but it works for me.
    #[token("print")]
    Print,
    // Next are the operators for NGR. We only have 4, now, but
    // we might extend these later, or even make them user-definable!
    #[regex(r"[+\-*/]", |v| v.slice().chars().next())]
    Operator(char),
    /// Numbers capture both the value we read from the input,
    /// converted to an `i64`, as well as the base the user used
    /// to write the number, if they did so.
    #[regex(r"0b[01]+", |v| parse_number(Some(2), v))]
    #[regex(r"0o[0-7]+", |v| parse_number(Some(8), v))]
    #[regex(r"0d[0-9]+", |v| parse_number(Some(10), v))]
@@ -31,12 +61,23 @@ pub enum Token {
    #[regex(r"[0-9]+", |v| parse_number(None, v))]
    Number((Option<u8>, i64)),
    // Variables; this is a very standard, simple set of characters
    // for variables, but feel free to experiment with more complicated
    // things. I chose to force variables to start with a lower case
    // letter, too.
    #[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))]
    Variable(ArcIntern<String>),
    // the next token will be an error token
    #[error]
    // we're actually just going to skip whitespace, though
    #[regex(r"[ \t\r\n\f]+", logos::skip)]
    // this is an extremely simple version of comments, just line
    // comments. More complicated /* */ comments can be harder to
    // implement, and didn't seem worth it at the time. 
    #[regex(r"//.*", logos::skip)]
    /// This token represents that some core error happened in lexing;
    /// possibly that something didn't match anything at all.
    Error,
 }
@@ -63,19 +104,28 @@ impl fmt::Display for Token {
    }
 }
 /// A sudden and unexpected error in the lexer.
 #[derive(Debug, Error, PartialEq, Eq)]
 pub enum LexerError {
    /// The `usize` here is the offset at which we ran into the problem,
    /// measured from the start of the file.
    #[error("Failed lexing at {0}")]
    LexFailure(usize),
 }
 #[cfg(test)]
 impl Token {
    /// Create a variable token with the given name. Very handy for
    /// testing.
    pub(crate) fn var(s: &str) -> Token {
        Token::Variable(ArcIntern::new(s.to_string()))
    }
 }
 /// Parse a number in the given base, return a pair of the base and the
 /// parsed number. This is just a helper used for all of the number
 /// regular expression cases, which kicks off to the obvious Rust
 /// standard library function.
 fn parse_number(
    base: Option<u8>,
    value: &Lexer<Token>,