//! NGR Parsing: Reading input, turning it into sense (or errors).
//!
//! This module implements the front end of the compiler, which is responsible for
//! reading in NGR syntax as a string, turning it into a series of reasonable Rust
//! structures for us to manipulate, and doing some validation while it's at it.
//!
//! The core flow for this work is:
//!
//! * Turning the string into a series of language-specific [`Token`]s.
//! * Taking those tokens, and computing a basic syntax tree from them,
//!   using our parser ([`ProgramParser`] or [`TopLevelParser`], generated
//!   by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
//! * Validating the tree we have parsed, using [`Program::validate`],
//!   returning any warnings or errors we have found.
//!
//! In addition to all of this, we make sure that the structures defined in this
//! module are all:
//!
//! * Instances of [`Pretty`](::pretty::Pretty), so that you can print stuff back
//!   out that can be read by a human.
//! * Instances of [`Arbitrary`](proptest::prelude::Arbitrary), so they can be
//!   used in `proptest`-based property testing. There are built-in tests in
//!   the library, for example, to make sure that the pretty-printing round-trips.
//! * Able to be evaluated using an `eval` function, for comparison with later
//!   versions of the function downstream.
use codespan_reporting::{diagnostic::Diagnostic, files::SimpleFiles}; use lalrpop_util::lalrpop_mod; use logos::Logos; pub mod arbitrary; mod ast; pub mod eval; mod location; mod tokens; lalrpop_mod!( #[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)] parser, "/syntax/parser.rs" ); pub mod pretty; mod validate; #[cfg(test)] use crate::syntax::arbitrary::GenerationEnvironment; pub use crate::syntax::ast::*; pub use crate::syntax::location::Location; pub use crate::syntax::parser::{ProgramParser, TopLevelParser}; pub use crate::syntax::tokens::{LexerError, Token}; use lalrpop_util::ParseError; #[cfg(test)] use proptest::{arbitrary::Arbitrary, prop_assert}; use std::ops::Range; #[cfg(test)] use std::str::FromStr; use thiserror::Error; /// One of the many errors that can occur when processing text input. /// /// If you get one of these and want to display it to the user, we strongly /// suggest using the [`From`] implementation to turn this into a [`Diagnostic`], /// and then printing it via [`codespan_reporting`]. #[derive(Debug, Error)] pub enum ParserError { /// Raised by the lexer when we see some text that doesn't make /// any sense in the language. #[error("Invalid token")] InvalidToken(Location), /// Raised when we're parsing the file and run into an EOF in a /// place we really weren't expecting. #[error("Unrecognized EOF")] UnrecognizedEOF(Location, Vec), /// Raised when we're parsing the file, and run into a token in a /// place we weren't expecting it. #[error("Unrecognized token")] UnrecognizedToken(Location, Token, Vec), /// Raised when we were expecting the end of the file, but instead /// got another token. #[error("Extra token")] ExtraToken(Location, Token), /// Raised when the lexer just had some sort of internal problem /// and just gave up. #[error("Lexing failure")] LexFailure(Location), /// Raised when we tried to reference a file, or add a file, to our /// file database, and the database ran into a problem. 
#[error("File database error")] FileDatabaseError(#[from] codespan_reporting::files::Error), /// Raised when the OS is having problems giving us data. #[error("Read error")] ReadError(#[from] std::io::Error), } impl ParserError { /// Convert one of lalrpop's parser errors into one of our own, which we can more /// easily implement translation into [`Diagnostic`]. /// /// This function is relatively straightforward, because we match the errors pretty /// closely. The major thing we do here is convert [`lalrpop`]'s notion of a location, /// which is just an offset that it got from the lexer, into an actual location that /// we can use in our [`Diagnostic`]s. fn convert(file_idx: usize, err: ParseError) -> Self { match err { ParseError::InvalidToken { location } => { ParserError::InvalidToken(Location::new(file_idx, location..location + 1)) } ParseError::UnrecognizedEof { location, expected } => ParserError::UnrecognizedEOF( Location::new(file_idx, location..location + 1), expected, ), ParseError::UnrecognizedToken { token: (start, token, end), expected, } => { ParserError::UnrecognizedToken(Location::new(file_idx, start..end), token, expected) } ParseError::ExtraToken { token: (start, token, end), } => ParserError::ExtraToken(Location::new(file_idx, start..end), token), ParseError::User { error } => error, } } } /// This is just a nice little function to print out what we expected, if /// we had some expectations. Because English is a little wonky, there's /// some odd stuff with whether we get 0, 1, 2, or more, and it's nice to /// just split that bit of logic out. fn display_expected(expected: &[String]) -> String { match expected.len() { 0 => "".to_string(), 1 => format!("; expected {}", expected[0]), 2 => format!("; expected {} or {}", expected[0], expected[1]), n => format!( "; expected {}or {}", comma_separate(&expected[0..n - 1]), expected[n - 1] ), } } /// Given a list of strings, comma separate (with a space) them, as in an /// English list. 
fn comma_separate(strings: &[String]) -> String {
    // Deliberately appends ", " after EVERY element, including the last:
    // `display_expected` relies on that trailing separator to provide the
    // space before its final "or {}". Don't "simplify" this to `join`.
    let mut result = String::new();
    for s in strings.iter() {
        result.push_str(s);
        result.push_str(", ");
    }
    result
}

impl<'a> From<&'a ParserError> for Diagnostic<usize> {
    fn from(value: &ParserError) -> Self {
        match value {
            // this was just a token we didn't understand
            ParserError::InvalidToken(location) => location
                .labelled_error("extremely odd token")
                .with_message("encountered extremely confusing token"),
            // unexpected EOF! (we ran out of input while still wanting tokens)
            ParserError::UnrecognizedEOF(location, expected) => location.error().with_message(
                format!("unexpected end of file{}", display_expected(expected)),
            ),
            // encountered a token where it shouldn't be
            ParserError::UnrecognizedToken(loc, token, expected) => {
                let expected_str =
                    format!("unexpected token {}{}", token, display_expected(expected));
                let unexpected_str = format!("unexpected token {}", token);
                Diagnostic::error()
                    .with_message(expected_str)
                    .with_labels(vec![loc.primary_label().with_message(unexpected_str)])
            }
            // I think we get this when we get a token, but were expecting EOF
            ParserError::ExtraToken(loc, token) => {
                let expected_str =
                    format!("unexpected token {} after the expected end of file", token);
                let unexpected_str = format!("unexpected token {}", token);
                Diagnostic::error()
                    .with_message(expected_str)
                    .with_labels(vec![loc.primary_label().with_message(unexpected_str)])
            }
            // simple lexer errors
            ParserError::LexFailure(location) => {
                location.error().with_message("unexpected character")
            }
            ParserError::FileDatabaseError(e) => Diagnostic::error().with_message(e.to_string()),
            ParserError::ReadError(e) => Diagnostic::error().with_message(e.to_string()),
        }
    }
}

impl Program {
    /// Parse the given file, adding it to the database as part of the process.
    ///
    /// This operation reads the file from disk and adds it to the database for future
    /// reference. If you get an error, we strongly suggest conversion to [`Diagnostic`]
    /// and then reporting it to the user via [`codespan_reporting`]. You should use
    /// this function if you're pretty sure that you've never seen this file before,
    /// and [`Program::parse`] if you have and know its index and already have it in
    /// memory.
    pub fn parse_file(
        file_database: &mut SimpleFiles<String, String>,
        file_name: &str,
    ) -> Result<Program, ParserError> {
        let file_contents = std::fs::read_to_string(file_name)?;
        let file_handle = file_database.add(file_name.to_string(), file_contents);
        // Re-fetch the contents through the database so the parsed `Location`s
        // line up with what the database will report later.
        let file_db_info = file_database.get(file_handle)?;
        Program::parse(file_handle, file_db_info.source())
    }

    /// Parse a block of text you have in memory, using the given index for [`Location`]s.
    ///
    /// If you use a nonsensical file index, everything will work fine until you try to
    /// report an error, at which point [`codespan_reporting`] may have some nasty things
    /// to say to you.
    pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> {
        let lexer = Token::lexer(buffer)
            .spanned()
            .map(|x| permute_lexer_result(file_idx, x));
        ProgramParser::new()
            .parse(file_idx, lexer)
            .map_err(|e| ParserError::convert(file_idx, e))
    }
}

impl TopLevel {
    /// Parse a top-level item that you have in memory, using the given index for [`Location`]s.
    ///
    /// As with [`Program::parse`], if you use a bad file index, you'll get weird behaviors
    /// when you try to print errors, but things should otherwise work fine. This function
    /// will only parse a single statement, which is useful in the REPL, but probably shouldn't
    /// be used when reading in whole files.
pub fn parse(file_idx: usize, buffer: &str) -> Result<TopLevel, ParserError> {
    let lexer = Token::lexer(buffer)
        .spanned()
        .map(|x| permute_lexer_result(file_idx, x));
    TopLevelParser::new()
        .parse(file_idx, lexer)
        .map_err(|e| ParserError::convert(file_idx, e))
}
}

/// Repackage one item of `logos` lexer output into the shape `lalrpop` expects.
///
/// `logos` gives us `(token-or-error, byte range)` pairs; `lalrpop` wants
/// `(start, token, end)` triples, with failures surfaced through our own error
/// type. We tag lexing failures with the file index here so they carry a real
/// [`Location`] for diagnostics.
fn permute_lexer_result(
    file_idx: usize,
    result: (Result<Token, ()>, Range<usize>),
) -> Result<(usize, Token, usize), ParserError> {
    let (token, range) = result;
    match token {
        Ok(v) => Ok((range.start, v, range.end)),
        Err(()) => Err(ParserError::LexFailure(Location::new(file_idx, range))),
    }
}

#[cfg(test)]
impl FromStr for Program {
    type Err = ParserError;

    /// Parse a program from a string, using file index 0.
    ///
    /// Test-only convenience: the fake file index is fine as long as nobody
    /// tries to render a diagnostic against a real file database.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Program::parse(0, s)
    }
}

#[test]
fn order_of_operations() {
    // "*" must bind tighter than "+": 1 + 2 * 3 parses as (+ 1 (* 2 3)).
    let muladd1 = "x = 1 + 2 * 3;";
    let testfile = 0;
    assert_eq!(
        Program::from_str(muladd1).unwrap(),
        Program {
            items: vec![TopLevel::Expression(Expression::Binding(
                Location::new(testfile, 0..1),
                Name::manufactured("x"),
                Box::new(Expression::Call(
                    Location::new(testfile, 6..7),
                    Box::new(Expression::Primitive(
                        Location::new(testfile, 6..7),
                        Name::manufactured("+")
                    )),
                    vec![
                        Expression::Value(
                            Location::new(testfile, 4..5),
                            Value::Number(None, None, 1),
                        ),
                        Expression::Call(
                            Location::new(testfile, 10..11),
                            Box::new(Expression::Primitive(
                                Location::new(testfile, 10..11),
                                Name::manufactured("*")
                            )),
                            vec![
                                Expression::Value(
                                    Location::new(testfile, 8..9),
                                    Value::Number(None, None, 2),
                                ),
                                Expression::Value(
                                    Location::new(testfile, 12..13),
                                    Value::Number(None, None, 3),
                                ),
                            ]
                        )
                    ]
                ))
            ))],
        }
    );
}

proptest::proptest! {
    // Any program our generator produces should pass its own validator.
    #[test]
    fn random_syntaxes_validate(program: Program) {
        let (errors, _) = program.validate();
        prop_assert!(errors.is_empty());
    }

    // Generated programs either evaluate cleanly or fail only with math
    // (e.g. overflow) errors — never with any other kind of eval failure.
    #[test]
    fn generated_run_or_overflow(program in Program::arbitrary_with(GenerationEnvironment::new(false))) {
        use crate::eval::{EvalError, PrimOpError};
        prop_assert!(matches!(program.eval(), Ok(_) | Err(EvalError::PrimOp(PrimOpError::MathFailure(_)))));
    }
}