// ngr/src/syntax.rs
//! NGR Parsing: Reading input, turning it into sense (or errors).
//!
//! This module implements the front end of the compiler, which is responsible for
//! reading in NGR syntax as a string, turning it into a series of reasonable Rust
//! structures for us to manipulate, and doing some validation while it's at it.
//!
//! The core flow for this work is:
//!
//! * Turning the string into a series of language-specific [`Token`]s.
//! * Taking those tokens, and computing a basic syntax tree from them,
//! using our parser ([`ProgramParser`] or [`TopLevelParser`], generated
//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
//! * Validating the tree we have parsed, using [`Program::validate`],
//! returning any warnings or errors we have found.
//!
//! In addition to all of this, we make sure that the structures defined in this
//! module are all:
//!
//! * Instances of [`Pretty`](::pretty::Pretty), so that you can print stuff back
//! out that can be read by a human.
//! * Instances of [`Arbitrary`](proptest::prelude::Arbitrary), so they can be
//! used in `proptest`-based property testing. There are built-in tests in
//! the library, for example, to make sure that the pretty-printing round-trips.
//! * Can be evaluated using an `eval` function, for comparison with later
//! versions of the function downstream.
use codespan_reporting::{diagnostic::Diagnostic, files::SimpleFiles};
use lalrpop_util::lalrpop_mod;
use logos::Logos;
pub mod arbitrary;
mod ast;
pub mod eval;
mod location;
mod tokens;
lalrpop_mod!(
#[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)]
parser,
"/syntax/parser.rs"
);
pub mod pretty;
mod validate;
#[cfg(test)]
use crate::syntax::arbitrary::GenerationEnvironment;
pub use crate::syntax::ast::*;
pub use crate::syntax::location::Location;
pub use crate::syntax::parser::{ProgramParser, TopLevelParser};
pub use crate::syntax::tokens::{LexerError, Token};
use lalrpop_util::ParseError;
#[cfg(test)]
use proptest::{arbitrary::Arbitrary, prop_assert};
use std::ops::Range;
#[cfg(test)]
use std::str::FromStr;
use thiserror::Error;
/// One of the many errors that can occur when processing text input.
///
/// If you get one of these and want to display it to the user, we strongly
/// suggest using the [`From`] implementation to turn this into a [`Diagnostic`],
/// and then printing it via [`codespan_reporting`].
#[derive(Debug, Error)]
pub enum ParserError {
    /// Raised by the lexer when we see some text that doesn't make
    /// any sense in the language. Carries the location of the bad text.
    #[error("Invalid token")]
    InvalidToken(Location),
    /// Raised when we're parsing the file and run into an EOF in a
    /// place we really weren't expecting. Carries the EOF's location and
    /// the descriptions of the tokens the parser would have accepted there.
    #[error("Unrecognized EOF")]
    UnrecognizedEOF(Location, Vec<String>),
    /// Raised when we're parsing the file, and run into a token in a
    /// place we weren't expecting it. Carries the token's location, the
    /// token itself, and what the parser would have accepted instead.
    #[error("Unrecognized token")]
    UnrecognizedToken(Location, Token, Vec<String>),
    /// Raised when we were expecting the end of the file, but instead
    /// got another token. Carries the surplus token and its location.
    #[error("Extra token")]
    ExtraToken(Location, Token),
    /// Raised when the lexer just had some sort of internal problem
    /// and just gave up. Carries the location of the failing span.
    #[error("Lexing failure")]
    LexFailure(Location),
    /// Raised when we tried to reference a file, or add a file, to our
    /// file database, and the database ran into a problem.
    #[error("File database error")]
    FileDatabaseError(#[from] codespan_reporting::files::Error),
    /// Raised when the OS is having problems giving us data.
    #[error("Read error")]
    ReadError(#[from] std::io::Error),
}
impl ParserError {
    /// Convert one of lalrpop's parser errors into one of our own, which we can more
    /// easily implement translation into [`Diagnostic`].
    ///
    /// The mapping is nearly one-to-one; the main work here is wrapping lalrpop's
    /// raw byte offsets (which it got from the lexer) into [`Location`]s tagged
    /// with `file_idx`, so later [`Diagnostic`]s can point into the right file.
    fn convert(file_idx: usize, err: ParseError<usize, Token, ParserError>) -> Self {
        // Small helper: wrap a lalrpop byte span in our Location type.
        let span = |start: usize, end: usize| Location::new(file_idx, start..end);
        match err {
            // Point-like errors get a one-byte-wide span at the offset.
            ParseError::InvalidToken { location } => {
                Self::InvalidToken(span(location, location + 1))
            }
            ParseError::UnrecognizedEof { location, expected } => {
                Self::UnrecognizedEOF(span(location, location + 1), expected)
            }
            ParseError::UnrecognizedToken {
                token: (lo, tok, hi),
                expected,
            } => Self::UnrecognizedToken(span(lo, hi), tok, expected),
            ParseError::ExtraToken {
                token: (lo, tok, hi),
            } => Self::ExtraToken(span(lo, hi), tok),
            // User errors are already our own type; pass them straight through.
            ParseError::User { error } => error,
        }
    }
}
/// Render the parser's "expected …" token list as a suffix for an error
/// message, or an empty string if there were no expectations.
///
/// English list punctuation varies with length — nothing for 0, no comma
/// for 1 or 2, Oxford-comma list for 3+ — so each case is spelled out.
fn display_expected(expected: &[String]) -> String {
    match expected {
        [] => String::new(),
        [only] => format!("; expected {}", only),
        [first, second] => format!("; expected {} or {}", first, second),
        // comma_separate leaves a trailing ", ", so no separator is
        // needed between it and the final "or …" clause.
        [init @ .., last] => format!("; expected {}or {}", comma_separate(init), last),
    }
}
/// Given a list of strings, comma separate (with a space) them, as in an
/// English list.
/// Join the given strings into an English-style comma list, with a
/// trailing ", " after every item (including the last), so that a caller
/// can append a final "or …" clause directly.
fn comma_separate(strings: &[String]) -> String {
    strings.iter().fold(String::new(), |mut out, item| {
        out.push_str(item);
        out.push_str(", ");
        out
    })
}
impl<'a> From<&'a ParserError> for Diagnostic<usize> {
    /// Turn a [`ParserError`] into a [`Diagnostic`] suitable for rendering
    /// with [`codespan_reporting`], attaching a label at the error's
    /// [`Location`] whenever we have one.
    fn from(value: &ParserError) -> Self {
        match value {
            // this was just a token we didn't understand
            ParserError::InvalidToken(location) => location
                .labelled_error("extremely odd token")
                .with_message("encountered extremely confusing token"),
            // unexpected EOF! The input ended while the grammar still wanted
            // more, so say so (the original message read "expected enf of
            // file", which was both a typo and backwards).
            ParserError::UnrecognizedEOF(location, expected) => location.error().with_message(
                format!("unexpected end of file{}", display_expected(expected)),
            ),
            // encountered a token where it shouldn't be
            ParserError::UnrecognizedToken(loc, token, expected) => {
                let expected_str =
                    format!("unexpected token {}{}", token, display_expected(expected));
                let unexpected_str = format!("unexpected token {}", token);
                Diagnostic::error()
                    .with_message(expected_str)
                    .with_labels(vec![loc.primary_label().with_message(unexpected_str)])
            }
            // we get this when another token shows up where we were expecting EOF
            ParserError::ExtraToken(loc, token) => {
                let expected_str =
                    format!("unexpected token {} after the expected end of file", token);
                let unexpected_str = format!("unexpected token {}", token);
                Diagnostic::error()
                    .with_message(expected_str)
                    .with_labels(vec![loc.primary_label().with_message(unexpected_str)])
            }
            // simple lexer errors
            ParserError::LexFailure(location) => {
                location.error().with_message("unexpected character")
            }
            // errors from below us carry their own message; just forward it
            ParserError::FileDatabaseError(e) => Diagnostic::error().with_message(e.to_string()),
            ParserError::ReadError(e) => Diagnostic::error().with_message(e.to_string()),
        }
    }
}
impl Program {
    /// Parse the given file, adding it to the database as part of the process.
    ///
    /// Reads `file_name` from disk and registers its contents with
    /// `file_database` so that later [`Diagnostic`]s can reference it. Use this
    /// when you haven't seen the file before; if it's already in the database
    /// and you know its index, call [`Program::parse`] directly instead. If
    /// you get an error, we strongly suggest converting it to a [`Diagnostic`]
    /// and reporting it to the user via [`codespan_reporting`].
    pub fn parse_file(
        file_database: &mut SimpleFiles<String, String>,
        file_name: &str,
    ) -> Result<Self, ParserError> {
        let contents = std::fs::read_to_string(file_name)?;
        let handle = file_database.add(file_name.to_string(), contents);
        let stored = file_database.get(handle)?;
        Self::parse(handle, stored.source())
    }

    /// Parse a block of text you have in memory, using the given index for [`Location`]s.
    ///
    /// The `file_idx` is only used to tag [`Location`]s; a nonsensical index
    /// won't break parsing, but [`codespan_reporting`] may have some nasty
    /// things to say when you later try to render an error against it.
    pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> {
        // Lex lazily, converting each logos result into the triple lalrpop wants.
        let tokens = Token::lexer(buffer)
            .spanned()
            .map(|entry| permute_lexer_result(file_idx, entry));
        ProgramParser::new()
            .parse(file_idx, tokens)
            .map_err(|err| ParserError::convert(file_idx, err))
    }
}
impl TopLevel {
    /// Parse a top-level item that you have in memory, using the given index for [`Location`]s.
    ///
    /// Mirrors [`Program::parse`], but accepts exactly one top-level item,
    /// which makes it handy for a REPL; prefer [`Program::parse`] when reading
    /// in whole files. As there, a bad `file_idx` only bites when you try to
    /// render diagnostics, not during parsing itself.
    pub fn parse(file_idx: usize, buffer: &str) -> Result<TopLevel, ParserError> {
        // Lex lazily, converting each logos result into the triple lalrpop wants.
        let tokens = Token::lexer(buffer)
            .spanned()
            .map(|entry| permute_lexer_result(file_idx, entry));
        TopLevelParser::new()
            .parse(file_idx, tokens)
            .map_err(|err| ParserError::convert(file_idx, err))
    }
}
/// Adapt one `(token result, byte span)` pair from the logos lexer into the
/// `(start, token, end)` triple that lalrpop expects, turning lexer failures
/// into [`ParserError::LexFailure`] at the failing span.
fn permute_lexer_result(
    file_idx: usize,
    result: (Result<Token, ()>, Range<usize>),
) -> Result<(usize, Token, usize), ParserError> {
    match result {
        (Ok(token), span) => Ok((span.start, token, span.end)),
        (Err(()), span) => Err(ParserError::LexFailure(Location::new(file_idx, span))),
    }
}
#[cfg(test)]
impl FromStr for Program {
    type Err = ParserError;
    // Test-only convenience: parse with a dummy file index of 0 so tests can
    // write `Program::from_str(src)` without setting up a file database.
    // (Index 0 is fine here because tests never render diagnostics from it.)
    fn from_str(s: &str) -> Result<Program, ParserError> {
        Program::parse(0, s)
    }
}
#[test]
fn order_of_operations() {
    // `*` must bind tighter than `+`, so this has to parse as `x = 1 + (2 * 3)`.
    let muladd1 = "x = 1 + 2 * 3;";
    let testfile = 0;
    assert_eq!(
        Program::from_str(muladd1).unwrap(),
        Program {
            items: vec![TopLevel::Expression(Expression::Binding(
                Location::new(testfile, 0..1),
                Name::manufactured("x"),
                // Outer call: the `+` primitive, located at the `+` token (offset 6).
                Box::new(Expression::Call(
                    Location::new(testfile, 6..7),
                    Box::new(Expression::Primitive(
                        Location::new(testfile, 6..7),
                        Name::manufactured("+")
                    )),
                    vec![
                        Expression::Value(
                            Location::new(testfile, 4..5),
                            Value::Number(None, None, 1),
                        ),
                        // Inner call: `2 * 3`, nested as the second addend,
                        // which is what proves `*` bound tighter than `+`.
                        Expression::Call(
                            Location::new(testfile, 10..11),
                            Box::new(Expression::Primitive(
                                Location::new(testfile, 10..11),
                                Name::manufactured("*")
                            )),
                            vec![
                                Expression::Value(
                                    Location::new(testfile, 8..9),
                                    Value::Number(None, None, 2),
                                ),
                                Expression::Value(
                                    Location::new(testfile, 12..13),
                                    Value::Number(None, None, 3),
                                ),
                            ]
                        )
                    ]
                ))
            ))],
        }
    );
}
proptest::proptest! {
    // Property: any program produced by our Arbitrary generator should pass
    // semantic validation without errors (warnings are ignored here).
    #[test]
    fn random_syntaxes_validate(program: Program) {
        let (errors, _) = program.validate();
        prop_assert!(errors.is_empty());
    }
    // Property: any generated program either evaluates successfully or fails
    // only with an arithmetic MathFailure — never any other kind of error.
    #[test]
    fn generated_run_or_overflow(program in Program::arbitrary_with(GenerationEnvironment::new(false))) {
        use crate::eval::{EvalError, PrimOpError};
        prop_assert!(matches!(program.eval(), Ok(_) | Err(EvalError::PrimOp(PrimOpError::MathFailure(_)))));
    }
}