337 lines
13 KiB
Rust
337 lines
13 KiB
Rust
//! NGR Parsing: Reading input, turning it into sense (or errors).
|
|
//!
|
|
//! This module implements the front end of the compiler, which is responsible for
|
|
//! reading in NGR syntax as a string, turning it into a series of reasonable Rust
|
|
//! structures for us to manipulate, and doing some validation while it's at it.
|
|
//!
|
|
//! The core flow for this work is:
|
|
//!
|
|
//! * Turning the string into a series of language-specific [`Token`]s.
|
|
//! * Taking those tokens, and computing a basic syntax tree from them,
|
|
//! using our parser ([`ProgramParser`] or [`TopLevelParser`], generated
|
|
//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
|
|
//! * Validating the tree we have parsed, using [`Program::validate`],
|
|
//! returning any warnings or errors we have found.
|
|
//!
|
|
//! In addition to all of this, we make sure that the structures defined in this
|
|
//! module are all:
|
|
//!
|
|
//! * Instances of [`Pretty`](::pretty::Pretty), so that you can print stuff back
|
|
//! out that can be read by a human.
|
|
//! * Instances of [`Arbitrary`](proptest::prelude::Arbitrary), so they can be
|
|
//! used in `proptest`-based property testing. There are built-in tests in
|
|
//! the library, for example, to make sure that the pretty-printing round-trips.
|
|
//! * Can be evaluated using an `eval` function, for comparison with later
|
|
//! versions of the function downstream.
|
|
use codespan_reporting::{diagnostic::Diagnostic, files::SimpleFiles};
|
|
use lalrpop_util::lalrpop_mod;
|
|
use logos::Logos;
|
|
|
|
pub mod arbitrary;
|
|
mod ast;
|
|
pub mod eval;
|
|
mod location;
|
|
mod tokens;
|
|
lalrpop_mod!(
|
|
#[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)]
|
|
parser,
|
|
"/syntax/parser.rs"
|
|
);
|
|
pub mod pretty;
|
|
mod validate;
|
|
|
|
#[cfg(test)]
|
|
use crate::syntax::arbitrary::GenerationEnvironment;
|
|
pub use crate::syntax::ast::*;
|
|
pub use crate::syntax::location::Location;
|
|
pub use crate::syntax::parser::{ProgramParser, TopLevelParser};
|
|
pub use crate::syntax::tokens::{LexerError, Token};
|
|
use lalrpop_util::ParseError;
|
|
#[cfg(test)]
|
|
use proptest::{arbitrary::Arbitrary, prop_assert};
|
|
use std::ops::Range;
|
|
#[cfg(test)]
|
|
use std::str::FromStr;
|
|
use thiserror::Error;
|
|
|
|
/// One of the many errors that can occur when processing text input.
///
/// If you get one of these and want to display it to the user, we strongly
/// suggest using the [`From`] implementation to turn this into a [`Diagnostic`],
/// and then printing it via [`codespan_reporting`].
///
/// Each syntax-level variant carries the [`Location`] where the problem was
/// found; variants that come from the parser's expectation machinery also
/// carry the list of token names it would have accepted at that point.
#[derive(Debug, Error)]
pub enum ParserError {
    /// Raised by the lexer when we see some text that doesn't make
    /// any sense in the language.
    #[error("Invalid token")]
    InvalidToken(Location),

    /// Raised when we're parsing the file and run into an EOF in a
    /// place we really weren't expecting. The `Vec<String>` holds the
    /// names of the tokens the parser would have accepted instead.
    #[error("Unrecognized EOF")]
    UnrecognizedEOF(Location, Vec<String>),

    /// Raised when we're parsing the file, and run into a token in a
    /// place we weren't expecting it. Carries the offending [`Token`]
    /// and the names of the tokens that would have been acceptable.
    #[error("Unrecognized token")]
    UnrecognizedToken(Location, Token, Vec<String>),

    /// Raised when we were expecting the end of the file, but instead
    /// got another token.
    #[error("Extra token")]
    ExtraToken(Location, Token),

    /// Raised when the lexer just had some sort of internal problem
    /// and just gave up.
    #[error("Lexing failure")]
    LexFailure(Location),

    /// Raised when we tried to reference a file, or add a file, to our
    /// file database, and the database ran into a problem.
    #[error("File database error")]
    FileDatabaseError(#[from] codespan_reporting::files::Error),

    /// Raised when the OS is having problems giving us data.
    #[error("Read error")]
    ReadError(#[from] std::io::Error),
}
|
|
|
|
impl ParserError {
|
|
/// Convert one of lalrpop's parser errors into one of our own, which we can more
|
|
/// easily implement translation into [`Diagnostic`].
|
|
///
|
|
/// This function is relatively straightforward, because we match the errors pretty
|
|
/// closely. The major thing we do here is convert [`lalrpop`]'s notion of a location,
|
|
/// which is just an offset that it got from the lexer, into an actual location that
|
|
/// we can use in our [`Diagnostic`]s.
|
|
fn convert(file_idx: usize, err: ParseError<usize, Token, ParserError>) -> Self {
|
|
match err {
|
|
ParseError::InvalidToken { location } => {
|
|
ParserError::InvalidToken(Location::new(file_idx, location..location + 1))
|
|
}
|
|
ParseError::UnrecognizedEof { location, expected } => ParserError::UnrecognizedEOF(
|
|
Location::new(file_idx, location..location + 1),
|
|
expected,
|
|
),
|
|
ParseError::UnrecognizedToken {
|
|
token: (start, token, end),
|
|
expected,
|
|
} => {
|
|
ParserError::UnrecognizedToken(Location::new(file_idx, start..end), token, expected)
|
|
}
|
|
ParseError::ExtraToken {
|
|
token: (start, token, end),
|
|
} => ParserError::ExtraToken(Location::new(file_idx, start..end), token),
|
|
ParseError::User { error } => error,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Render the parser's list of expected tokens as a human-readable suffix.
///
/// Because English list grammar is a little wonky — zero, one, two, and
/// three-plus items all read differently — the small decision table lives
/// here in one place rather than at every error-reporting site.
fn display_expected(expected: &[String]) -> String {
    match expected {
        [] => String::new(),
        [only] => format!("; expected {}", only),
        [first, second] => format!("; expected {} or {}", first, second),
        // NOTE: no space before "or" — comma_separate leaves a trailing
        // ", " that supplies both the Oxford comma and the space.
        [init @ .., last] => format!("; expected {}or {}", comma_separate(init), last),
    }
}

/// Given a list of strings, comma separate (with a space) them, as in an
/// English list.
///
/// Deliberately leaves a trailing ", " after the final item; the caller
/// ([`display_expected`]) relies on it to splice in the final "or X".
fn comma_separate(strings: &[String]) -> String {
    strings.iter().fold(String::new(), |mut out, s| {
        out.push_str(s);
        out.push_str(", ");
        out
    })
}
|
|
|
|
impl<'a> From<&'a ParserError> for Diagnostic<usize> {
|
|
fn from(value: &ParserError) -> Self {
|
|
match value {
|
|
// this was just a token we didn't understand
|
|
ParserError::InvalidToken(location) => location
|
|
.labelled_error("extremely odd token")
|
|
.with_message("encountered extremely confusing token"),
|
|
|
|
// unexpected EOF!
|
|
ParserError::UnrecognizedEOF(location, expected) => location.error().with_message(
|
|
format!("expected enf of file{}", display_expected(expected)),
|
|
),
|
|
|
|
// encountered a token where it shouldn't be
|
|
ParserError::UnrecognizedToken(loc, token, expected) => {
|
|
let expected_str =
|
|
format!("unexpected token {}{}", token, display_expected(expected));
|
|
let unexpected_str = format!("unexpected token {}", token);
|
|
|
|
Diagnostic::error()
|
|
.with_message(expected_str)
|
|
.with_labels(vec![loc.primary_label().with_message(unexpected_str)])
|
|
}
|
|
|
|
// I think we get this when we get a token, but were expected EOF
|
|
ParserError::ExtraToken(loc, token) => {
|
|
let expected_str =
|
|
format!("unexpected token {} after the expected end of file", token);
|
|
let unexpected_str = format!("unexpected token {}", token);
|
|
|
|
Diagnostic::error()
|
|
.with_message(expected_str)
|
|
.with_labels(vec![loc.primary_label().with_message(unexpected_str)])
|
|
}
|
|
|
|
// simple lexer errors
|
|
ParserError::LexFailure(location) => {
|
|
location.error().with_message("unexpected character")
|
|
}
|
|
|
|
ParserError::FileDatabaseError(e) => Diagnostic::error().with_message(e.to_string()),
|
|
|
|
ParserError::ReadError(e) => Diagnostic::error().with_message(e.to_string()),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Program {
|
|
/// Parse the given file, adding it to the database as part of the process.
|
|
///
|
|
/// This operation reads the file from disk and adds it to the database for future
|
|
/// reference. If you get an error, we strongly suggest conversion to [`Diagnostic`]
|
|
/// and then reporting it to the user via [`codespan_reporting`]. You should use
|
|
/// this function if you're pretty sure that you've never seen this file before,
|
|
/// and [`Program::parse`] if you have and know its index and already have it in
|
|
/// memory.
|
|
pub fn parse_file(
|
|
file_database: &mut SimpleFiles<String, String>,
|
|
file_name: &str,
|
|
) -> Result<Self, ParserError> {
|
|
let file_contents = std::fs::read_to_string(file_name)?;
|
|
let file_handle = file_database.add(file_name.to_string(), file_contents);
|
|
let file_db_info = file_database.get(file_handle)?;
|
|
Program::parse(file_handle, file_db_info.source())
|
|
}
|
|
|
|
/// Parse a block of text you have in memory, using the given index for [`Location`]s.
|
|
///
|
|
/// If you use a nonsensical file index, everything will work fine until you try to
|
|
/// report an error, at which point [`codespan_reporting`] may have some nasty things
|
|
/// to say to you.
|
|
pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> {
|
|
let lexer = Token::lexer(buffer)
|
|
.spanned()
|
|
.map(|x| permute_lexer_result(file_idx, x));
|
|
ProgramParser::new()
|
|
.parse(file_idx, lexer)
|
|
.map_err(|e| ParserError::convert(file_idx, e))
|
|
}
|
|
}
|
|
|
|
impl TopLevel {
|
|
/// Parse a top-level item that you have in memory, using the given index for [`Location`]s.
|
|
///
|
|
/// As with [`Program::parse`], if you use a bad file index, you'll get weird behaviors
|
|
/// when you try to print errors, but things should otherwise work fine. This function
|
|
/// will only parse a single statement, which is useful in the REPL, but probably shouldn't
|
|
/// be used when reading in whole files.
|
|
pub fn parse(file_idx: usize, buffer: &str) -> Result<TopLevel, ParserError> {
|
|
let lexer = Token::lexer(buffer)
|
|
.spanned()
|
|
.map(|x| permute_lexer_result(file_idx, x));
|
|
TopLevelParser::new()
|
|
.parse(file_idx, lexer)
|
|
.map_err(|e| ParserError::convert(file_idx, e))
|
|
}
|
|
}
|
|
|
|
fn permute_lexer_result(
|
|
file_idx: usize,
|
|
result: (Result<Token, ()>, Range<usize>),
|
|
) -> Result<(usize, Token, usize), ParserError> {
|
|
let (token, range) = result;
|
|
|
|
match token {
|
|
Ok(v) => Ok((range.start, v, range.end)),
|
|
Err(()) => Err(ParserError::LexFailure(Location::new(file_idx, range))),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
impl FromStr for Program {
    type Err = ParserError;

    /// Test-only convenience: parse a string as a whole program, using a
    /// dummy file index of 0 (fine, since tests never render diagnostics).
    fn from_str(s: &str) -> Result<Program, ParserError> {
        Self::parse(0, s)
    }
}
|
|
|
|
// Check operator precedence: "x = 1 + 2 * 3;" must parse as x = 1 + (2 * 3),
// i.e. `*` binds more tightly than `+`. The expected tree is spelled out in
// full, with Locations giving byte spans into the source string.
#[test]
fn order_of_operations() {
    let muladd1 = "x = 1 + 2 * 3;";
    let testfile = 0;
    assert_eq!(
        Program::from_str(muladd1).unwrap(),
        Program {
            // Binding of "x" (span 0..1) to the whole right-hand side.
            items: vec![TopLevel::Expression(Expression::Binding(
                Location::new(testfile, 0..1),
                Name::manufactured("x"),
                // Outermost call: the "+" at span 6..7.
                Box::new(Expression::Call(
                    Location::new(testfile, 6..7),
                    Box::new(Expression::Primitive(
                        Location::new(testfile, 6..7),
                        Name::manufactured("+")
                    )),
                    vec![
                        // Left operand of "+": the literal 1 at span 4..5.
                        Expression::Value(
                            Location::new(testfile, 4..5),
                            Value::Number(None, None, 1),
                        ),
                        // Right operand of "+": the nested "*" call at
                        // span 10..11 — this nesting is the precedence check.
                        Expression::Call(
                            Location::new(testfile, 10..11),
                            Box::new(Expression::Primitive(
                                Location::new(testfile, 10..11),
                                Name::manufactured("*")
                            )),
                            vec![
                                Expression::Value(
                                    Location::new(testfile, 8..9),
                                    Value::Number(None, None, 2),
                                ),
                                Expression::Value(
                                    Location::new(testfile, 12..13),
                                    Value::Number(None, None, 3),
                                ),
                            ]
                        )
                    ]
                ))
            ))],
        }
    );
}
|
|
|
|
proptest::proptest! {
    // Any syntax tree produced by the Arbitrary instance should pass
    // Program::validate with no errors (warnings are ignored here).
    #[test]
    fn random_syntaxes_validate(program: Program) {
        let (errors, _) = program.validate();
        prop_assert!(errors.is_empty());
    }

    // Generated programs should evaluate without failure, with one carve-out:
    // arithmetic overflow (EvalError::PrimOp(MathFailure)) is an acceptable
    // outcome since the generator does not constrain operand magnitudes.
    #[test]
    fn generated_run_or_overflow(program in Program::arbitrary_with(GenerationEnvironment::new(false))) {
        use crate::eval::{EvalError, PrimOpError};
        prop_assert!(matches!(program.eval(), Ok(_) | Err(EvalError::PrimOp(PrimOpError::MathFailure(_)))));
    }
}
|