ngr/src/syntax/parser.lalrpop

//! The parser for NGR!
//!
//! This file contains the grammar for the NGR language; a grammar is a nice,
//! machine-readable way to describe how your language's syntax works. For
//! example, here we describe a program as a series of statements, statements
//! as either variable binding or print statements, etc. As the grammar gets
//! more complicated, using tools like [`lalrpop`] becomes even more important.
//! (Although, at some point, things can become so complicated that you might
//! eventually want to leave lalrpop behind.)
//!
use crate::syntax::{LexerError, Location};
use crate::syntax::ast::{Program,TopLevel,Statement,Expression,Value,Name};
use crate::syntax::tokens::{ConstantType, Token};
use internment::ArcIntern;

// One cool thing about lalrpop: we can pass arguments to the generated
// parser. In this case we pass the index of the file we're parsing, and
// combine it with the file offsets that Logos gives us to build a
// [`crate::syntax::Location`] for every piece of syntax we recognize.
grammar(file_idx: usize);

// this is a slightly odd way to describe this, but: consider this section
// as describing the stuff that is external to the lalrpop grammar that it
// needs to know to do its job.
extern {
    type Location = usize; // Logos, our lexer, implements locations as
                           // offsets from the start of the file.
    // errors that originate outside the grammar itself are reported to the
    // parser as `LexerError`s.
    type Error = LexerError;

    // here we redeclare all of the tokens. The string on the left is the
    // name we use for the token inside this grammar; the pattern on the
    // right is the `Token` value that the lexer actually produces.
    enum Token {
        // punctuation
        "=" => Token::Equals,
        ";" => Token::Semi,
        "," => Token::Comma,
        "(" => Token::LeftParen,
        ")" => Token::RightParen,
        "<" => Token::LessThan,
        ">" => Token::GreaterThan,
        "{" => Token::OpenBrace,
        "}" => Token::CloseBrace,

        // keywords
        "function" => Token::Function,
        "print" => Token::Print,

        // arithmetic operators; these all share the single
        // `Token::Operator` variant, distinguished by the character inside.
        "+" => Token::Operator('+'),
        "-" => Token::Operator('-'),
        "*" => Token::Operator('*'),
        "/" => Token::Operator('/'),

        // the previous items just match their tokens, and if you try
        // to name and use "their value", you get their source location.
        // For these, we want "their value" to be their actual contents,
        // which is why we put their types in angle brackets.
        //
        // NOTE(review): the three fields of a number are passed straight
        // through to `Value::Number` below; presumably they are the base,
        // the declared constant type, and the value -- confirm against the
        // definition of `Token::Number`.
        "<num>" => Token::Number((<Option<u8>>,<Option<ConstantType>>,<u64>)),
        "<var>" => Token::Variable(<ArcIntern<String>>),
    }
}

pub Program: Program = {
    // a whole program is simply the list of its top-level items (function
    // definitions and statements), in the order they appear in the file
    <top_levels:ProgramTopLevel> => Program {
        items: top_levels,
    }
}

ProgramTopLevel: Vec<TopLevel> = {
    // the empty program is a perfectly good program ...
    => Vec::new(),

    // ... and anything longer is "everything we have parsed so far" (note
    // the left recursion) with one more top-level item added to the end.
    <mut parsed:ProgramTopLevel> <next:TopLevel> => {
        parsed.push(next);
        parsed
    },
}

pub TopLevel: TopLevel = {
    // function definitions already come back as a `TopLevel`, so we can
    // just pass them through untouched ...
    Function,
    // ... but a bare statement still needs to be wrapped up.
    Statement => TopLevel::Statement(<>),
}

Function: TopLevel = {
    // the `function` keyword, an optional name, a parenthesized parameter
    // list (which may be followed by a single trailing comma), and finally
    // the expression that serves as the function's body.
    "function" <name:OptionalName> "(" <params:Arguments> OptionalComma ")" <body:Expression> =>
        TopLevel::Function(name, params, body),
}

OptionalName: Option<Name> = {
    // a name, if one was written. We take the start of the span from `@L`
    // (the position where the upcoming token begins) and the end from `@R`
    // (the position where the token we just consumed ended), so that the
    // location covers exactly the name and none of the whitespace after it.
    <name_start: @L> <v:"<var>"> <name_end: @R> =>
        Some(Name::new(v, Location::new(file_idx, name_start..name_end))),

    // no name at all is also fine
    => None,
}

Arguments: Vec<Name> = {
    // no parameters at all ...
    => Vec::new(),

    // ... or the parameters we have already seen (left recursion again),
    // followed by one more.
    //
    // NOTE(review): parameters are simply written next to each other here;
    // there is no "," between them, even though `Function` allows a
    // trailing comma after the list. That looks unintentional -- confirm
    // whether the list was meant to be comma-separated.
    <mut params:Arguments> <next:Argument> => {
        params.push(next);
        params
    },
}

Argument: Name = {
    // a single parameter is just a variable name, tagged with where we
    // found it. `@L` gives us the offset at which the name starts and `@R`
    // the offset at which it ends, so the location is exactly the name
    // itself rather than the name plus any whitespace that follows it.
    <name_start: @L> <v:"<var>"> <name_end: @R> =>
        Name::new(v, Location::new(file_idx, name_start..name_end)),
}

OptionalComma: () = {
    // either there is a comma here, which we consume and throw away ...
    "," => (),
    // ... or there isn't one, which is just as good.
    => (),
}

Statements: Vec<Statement> = {
    // A list of statements is either nothing at all ...
    => Vec::new(),

    // ... or a (shorter) list of statements followed by one more statement.
    // Note, here, that you can name the result of a sub-parse using
    // <name: subrule>, and that marking the name `mut` lets the action code
    // modify it in place.
    //
    // This may feel like an awkward way to define lists of things -- and it
    // is a bit awkward -- but there are actual technical reasons that you
    // want to (a) use recursion to define these, and (b) use *left*
    // recursion, specifically. That's why, in this file, the recursive
    // reference always comes first in the rule, as it does here.
    //
    // The details of why left recursion is better are pretty fiddly and in
    // the weeds; if you're interested, look up LALR parsers versus LL
    // parsers -- both how they differ and how they're constructed -- as
    // they're kind of neat.
    //
    // But if you're just writing grammars with lalrpop, then you should
    // just remember to always use left recursion, and be done with it.
    <mut so_far:Statements> <next:Statement> => {
        so_far.push(next);
        so_far
    },
}

Statement: Statement = {
    // A statement can be a variable binding: `name = expression;`.
    //
    // Note, here, the funny @L and @R things. `@L` gives us the source
    // offset at which the next token starts, and `@R` gives us the offset
    // at which the previous token ended. So we grab `@L` before the first
    // token and `@R` after the last one, and the resulting range covers
    // exactly the text we matched -- no more, no less. (Using `@L` for the
    // end would instead run the span up to the start of whatever token
    // comes next, dragging in any whitespace in between.)
    <ls: @L> <v:"<var>"> <var_end: @R> "=" <e:Expression> ";" <le: @R> =>
        Statement::Binding(
            // the whole statement, from the name through the semicolon
            Location::new(file_idx, ls..le),
            // just the name being bound
            Name::new(v, Location::new(file_idx, ls..var_end)),
            e,
        ),

    // A statement can also be a print statement: `print name;`.
    <ls: @L> "print" <name_start: @L> <v:"<var>"> <name_end: @R> ";" <le: @R> =>
        Statement::Print(
            // the whole statement, from `print` through the semicolon
            Location::new(file_idx, ls..le),
            // just the name being printed
            Name::new(v, Location::new(file_idx, name_start..name_end)),
        ),
}

// Expressions! Expressions are a little fiddly, because we're going to
// use a little bit of a trick to make sure that we get operator precedence
// right. The trick works by creating a top-level `Expression` grammar entry
// that just points to the thing with the *weakest* precedence. In this case,
// we have addition, subtraction, multiplication, and division, so addition
// and subtraction have the weakest precedence.
//
// Then, as we go down the precedence tree, each item will recurse (left!)
// to other items at the same precedence level. The right hand operator, for
// binary operators (which is all of ours, at the moment) will then be one
// level stronger precedence. In addition, we'll let people just fall through
// to the next level; so if there isn't an addition or subtraction, we'll just
// fall through to the multiplication/division case.
//
// Finally, at the bottom, we'll have the core expressions (like constants,
// variables, etc.) as well as a parenthesized version of `Expression`, which
// gets us right up top again.
//
// Understanding why this works to solve all your operator precedence problems
// is a little hard to give an easy intuition for, but for myself it helped
// to run through a few examples. Consider thinking about how you want to
// parse something like "1 + 2 * 3", for example, versus "1 + 2 + 3" or
// "1 * 2 + 3", and hopefully that'll help.
// the entry point for expressions: start at the weakest-binding level and
// let each level fall through to the next-strongest one.
Expression: Expression = AdditiveExpression;

// we group addition and subtraction under the heading "additive"
AdditiveExpression: Expression = {
    // `a + b`. The left operand recurses at this same level, which makes
    // the operator left-associative; the right operand must bind more
    // tightly. The span runs from the start of the left operand (`@L`) to
    // the end of the right operand (`@R`).
    <ls: @L> <e1:AdditiveExpression> "+" <e2:MultiplicativeExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, ls..le), "+".to_string(), vec![e1, e2]),
    // `a - b`, built exactly the same way.
    <ls: @L> <e1:AdditiveExpression> "-" <e2:MultiplicativeExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, ls..le), "-".to_string(), vec![e1, e2]),
    // no addition or subtraction here; fall through to the next level.
    MultiplicativeExpression,
}

// similarly, we group multiplication and division under "multiplicative"
MultiplicativeExpression: Expression = {
    // `a * b`. As with addition, the left operand recurses at this level
    // (so the operator is left-associative) and the right operand comes
    // from the next-stronger level. The span runs from the start of the
    // left operand (`@L`) to the end of the right operand (`@R`).
    <ls: @L> <e1:MultiplicativeExpression> "*" <e2:UnaryExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, ls..le), "*".to_string(), vec![e1, e2]),
    // `a / b`, built exactly the same way.
    <ls: @L> <e1:MultiplicativeExpression> "/" <e2:UnaryExpression> <le: @R> =>
        Expression::Primitive(Location::new(file_idx, ls..le), "/".to_string(), vec![e1, e2]),
    // no multiplication or division here; fall through to the next level.
    UnaryExpression,
}

// prefix operators bind more tightly than any of the binary operators. They
// recurse on themselves, so that they can be stacked (as in `--x`).
UnaryExpression: Expression = {
    // negation: this is the same "-" primitive as subtraction, but with
    // only one argument. The span runs from the start of the operator
    // (`@L`) to the end of the operand (`@R`).
    <l: @L> "-" <e:UnaryExpression> <le: @R> =>
       Expression::Primitive(Location::new(file_idx, l..le), "-".to_string(), vec![e]),
    // a cast, written `<name> expression`; the name between the angle
    // brackets is handed to `Expression::Cast` as a plain string.
    <l: @L> "<" <v:"<var>"> ">" <e:UnaryExpression> <le: @R> =>
       Expression::Cast(Location::new(file_idx, l..le), v.to_string(), Box::new(e)),
    // no prefix operator here; fall through to the next level.
    AtomicExpression,
}

// finally, we describe our lowest-level expressions as "atomic", because
// they cannot be further divided into parts
AtomicExpression: Expression = {
    // just a variable reference. The span runs from the start of the token
    // (`@L`) to the end of that same token (`@R`), so it does not include
    // any whitespace that follows.
    <l: @L> <v:"<var>"> <end: @R> => Expression::Reference(Location::new(file_idx, l..end), v.to_string()),
    // just a number; the three parts of the token are passed along as-is.
    <l: @L> <n:"<num>"> <end: @R> => Expression::Value(Location::new(file_idx, l..end), Value::Number(n.0, n.1, n.2)),
    // this expression could actually be a block!
    //
    // NOTE(review): the grammar accepts blocks, but the action is still a
    // stub, so successfully parsing any `{ ... }` expression panics. This
    // should build a real AST node (or report a parse error) once the AST
    // has a place to put blocks.
    "{" Statements "}" => unimplemented!(),
    // finally, let people parenthesize expressions and get back to a
    // lower precedence
    "(" <e:Expression> ")" => e,
}