//! The parser for NGR!
//!
//! This file contains the grammar for the NGR language; a grammar is a nice,
//! machine-readable way to describe how your language's syntax works. For
//! example, here we describe a program as a series of statements, statements
//! as either variable binding or print statements, etc. As the grammar gets
//! more complicated, using tools like [`lalrpop`] becomes even more important.
//! (Although, at some point, things can become so complicated that you might
//! eventually want to leave lalrpop behind.)
//!

// NOTE(review): every `<...>` element in this grammar (symbol binders, generic
// type arguments, and the names of the two value-carrying terminals) was
// rebuilt from the names the action code refers to. The binder names are
// forced by that action code; the choice of `@L` for start positions and `@R`
// for end positions, and the exact placement of those markers, is a
// reconstruction and should be confirmed against version control.

use crate::syntax::{LexerError, Location};
use crate::syntax::ast::{Program,TopLevel,Statement,Expression,Value,Name};
use crate::syntax::tokens::{ConstantType, Token};
use internment::ArcIntern;

// one cool thing about lalrpop: we can pass arguments. in this case, the
// file index of the file we're parsing. we combine this with the file offset
// that Logos gives us to make a [`crate::syntax::Location`].
grammar(file_idx: usize);

// this is a slightly odd way to describe this, but: consider this section
// as describing the stuff that is external to the lalrpop grammar that it
// needs to know to do its job.
extern {
    type Location = usize; // Logos, our lexer, implements locations as
                           // offsets from the start of the file.
    type Error = LexerError;

    // here we redeclare all of the tokens.
    enum Token {
        "=" => Token::Equals,
        ";" => Token::Semi,
        "," => Token::Comma,
        "(" => Token::LeftParen,
        ")" => Token::RightParen,
        "<" => Token::LessThan,
        ">" => Token::GreaterThan,
        "{" => Token::OpenBrace,
        "}" => Token::CloseBrace,

        "function" => Token::Function,
        "print" => Token::Print,

        "+" => Token::Operator('+'),
        "-" => Token::Operator('-'),
        "*" => Token::Operator('*'),
        "/" => Token::Operator('/'),

        // the previous items just match their tokens, and if you try
        // to name and use "their value", you get their source location.
        // For these, we want "their value" to be their actual contents,
        // which is why we put their types in angle brackets.
        //
        // NOTE(review): the payload types below are inferred from this file's
        // imports (`ConstantType`, `ArcIntern`) and from the three-field use
        // `Value::Number(n.0, n.1, n.2)`; confirm against the `Token` enum.
        "<num>" => Token::Number((<Option<u8>>,<Option<ConstantType>>,<u64>)),
        "<var>" => Token::Variable(<ArcIntern<String>>),
    }
}

pub Program: Program = {
    // a program is just a sequence of top-level items
    <items:ProgramTopLevel> => Program { items }
}

// Zero or more top-level items, collected left-recursively into a vector.
ProgramTopLevel: Vec<TopLevel> = {
    <mut rest:ProgramTopLevel> <t:TopLevel> => {
        rest.push(t);
        rest
    },
    => Vec::new(),
}

// A top-level item is either a function definition or a plain statement.
pub TopLevel: TopLevel = {
    <f:Function> => f,
    <s:Statement> => TopLevel::Statement(s),
}

// "function", an optional name, a parenthesized argument list (which may end
// in a trailing comma), and then the expression that forms the body.
Function: TopLevel = {
    "function" <opt_name:OptionalName> "(" <args:Arguments> OptionalComma ")" <exp:Expression> =>
        TopLevel::Function(opt_name, args, exp),
}

OptionalName: Option<Name> = {
    <name_start: @L> <v:"<var>"> <name_end: @R> =>
        Some(Name::new(v, Location::new(file_idx, name_start..name_end))),
    => None,
}

// NOTE(review): no "," separator between arguments is visible in the rule as
// it was found, so none is introduced here; confirm that this is intended.
Arguments: Vec<Name> = {
    <mut args:Arguments> <arg:Argument> => {
        args.push(arg);
        args
    },
    => Vec::new(),
}

Argument: Name = {
    <name_start: @L> <v:"<var>"> <name_end: @R> =>
        Name::new(v, Location::new(file_idx, name_start..name_end)),
}

OptionalComma: () = {
    => (),
    "," => (),
}

Statements: Vec<Statement> = {
    // a statement is either a set of statements followed by another
    // statement (note, here, that you can name the result of a sub-parse
    // using <name: subrule>) ...
    <mut stmts:Statements> <stmt:Statement> => {
        stmts.push(stmt);
        stmts
    },

    // ... or it's nothing. This may feel like an awkward way to define
    // lists of things -- and it is a bit awkward -- but there are actual
    // technical reasons that you want to (a) use recursion to define
    // these, and (b) use *left* recursion, specifically. That's why, in
    // this file, all of the recursive cases are to the left, like they
    // are above.
    //
    // the details of why left recursion is better is actually pretty
    // fiddly and in the weeds, and if you're interested you should look
    // up LALR parsers versus LL parsers; both their differences and how
    // they're constructed, as they're kind of neat.
    //
    // but if you're just writing grammars with lalrpop, then you should
    // just remember that you should always use left recursion, and be
    // done with it.
    => {
        Vec::new()
    }
}

Statement: Statement = {
    // A statement can be a variable binding. Note, here, that we use this
    // funny @L thing to get the source location before the variable, so that
    // we can say that this statement spans across everything.
    //
    // NOTE(review): `le` is taken after the ";" so the statement's span
    // covers the terminator; confirm that this matches the intended span.
    <ls: @L> <v:"<var>"> <var_end: @R> "=" <e:Expression> ";" <le: @R> =>
        Statement::Binding(
            Location::new(file_idx, ls..le),
            Name::new(v, Location::new(file_idx, ls..var_end)),
            e,
        ),

    // A statement can just be a print statement.
    <ls: @L> "print" <name_start: @L> <v:"<var>"> <name_end: @R> ";" <le: @R> =>
        Statement::Print(
            Location::new(file_idx, ls..le),
            Name::new(v, Location::new(file_idx, name_start..name_end)),
        ),
}

// Expressions! Expressions are a little fiddly, because we're going to
// use a little bit of a trick to make sure that we get operator precedence
// right. The trick works by creating a top-level `Expression` grammar entry
// that just points to the thing with the *weakest* precedence. In this case,
// we have addition, subtraction, multiplication, and division, so addition
// and subtraction have the weakest precedence.
//
// Then, as we go down the precedence tree, each item will recurse (left!)
// to other items at the same precedence level. The right hand operator, for
// binary operators (which is all of ours, at the moment) will then be one
// level stronger precedence. In addition, we'll let people just fall through
// to the next level; so if there isn't an addition or subtraction, we'll just
// fall through to the multiplication/division case.
//
// Finally, at the bottom, we'll have the core expressions (like constants,
// variables, etc.) as well as a parenthesized version of `Expression`, which
// gets us right up top again.
//
// Understanding why this works to solve all your operator precedence problems
// is a little hard to give an easy intuition for, but for myself it helped
// to run through a few examples. Consider thinking about how you want to
// parse something like "1 + 2 * 3", for example, versus "1 + 2 + 3" or
// "1 * 2 + 3", and hopefully that'll help.
Expression: Expression = {
    AdditiveExpression,
}

// we group addition and subtraction under the heading "additive"
AdditiveExpression: Expression = {
    <ls: @L> <e1:AdditiveExpression> "+" <e2:MultiplicativeExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, ls..le), "+".to_string(), vec![e1, e2]),
    <ls: @L> <e1:AdditiveExpression> "-" <e2:MultiplicativeExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, ls..le), "-".to_string(), vec![e1, e2]),
    MultiplicativeExpression,
}

// similarly, we group multiplication and division under "multiplicative"
MultiplicativeExpression: Expression = {
    <ls: @L> <e1:MultiplicativeExpression> "*" <e2:UnaryExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, ls..le), "*".to_string(), vec![e1, e2]),
    <ls: @L> <e1:MultiplicativeExpression> "/" <e2:UnaryExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, ls..le), "/".to_string(), vec![e1, e2]),
    UnaryExpression,
}

// prefix forms: arithmetic negation, and a cast written as `<type> expr`.
// Both recurse into `UnaryExpression` so that they can be stacked.
UnaryExpression: Expression = {
    <l: @L> "-" <e:UnaryExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, l..le), "-".to_string(), vec![e]),
    <l: @L> "<" <v:"<var>"> ">" <e:UnaryExpression> <le: @R> =>
        Expression::Cast(Location::new(file_idx, l..le), v.to_string(), Box::new(e)),
    AtomicExpression,
}

// finally, we describe our lowest-level expressions as "atomic", because
// they cannot be further divided into parts
AtomicExpression: Expression = {
    // just a variable reference
    <l: @L> <v:"<var>"> <end: @R> =>
        Expression::Reference(Location::new(file_idx, l..end), v.to_string()),

    // just a number
    <l: @L> <n:"<num>"> <end: @R> =>
        Expression::Value(Location::new(file_idx, l..end), Value::Number(n.0, n.1, n.2)),

    // this expression could actually be a block!
    //
    // NOTE(review): the contents between the braces were not recoverable;
    // `Statements` is assumed here. The action is still a stub, so parsing
    // any block expression panics.
    "{" <stmts:Statements> "}" => unimplemented!(),

    // finally, let people parenthesize expressions and get back to a
    // lower precedence
    "(" <e:Expression> ")" => e,
}