Add the rest of the documentation around syntax.

This commit is contained in:
2023-05-12 16:42:48 -07:00
parent 309983ef3e
commit 62e27398be
4 changed files with 127 additions and 2 deletions

View File

@@ -10,7 +10,7 @@
//! * Taking those tokens, and computing a basic syntax tree from them, //! * Taking those tokens, and computing a basic syntax tree from them,
//! using our parser ([`ProgramParser`] or [`StatementParser`], generated //! using our parser ([`ProgramParser`] or [`StatementParser`], generated
//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)). //! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
//! * Validating the tree we have parsed, using the [`validate`] module, //! * Validating the tree we have parsed, using [`Program::validate`],
//! returning any warnings or errors we have found. //! returning any warnings or errors we have found.
//! //!
//! In addition to all of this, we make sure that the structures defined in this //! In addition to all of this, we make sure that the structures defined in this
@@ -38,7 +38,7 @@ lalrpop_mod!(
"/syntax/parser.rs" "/syntax/parser.rs"
); );
mod pretty; mod pretty;
pub mod validate; mod validate;
pub use crate::syntax::ast::*; pub use crate::syntax::ast::*;
pub use crate::syntax::location::Location; pub use crate::syntax::location::Location;
@@ -53,25 +53,56 @@ use proptest::{prop_assert, prop_assert_eq};
use std::str::FromStr; use std::str::FromStr;
use thiserror::Error; use thiserror::Error;
/// One of the many errors that can occur when processing text input.
///
/// If you get one of these and want to display it to the user, we strongly
/// suggest using the [`From`] implementation to turn this into a [`Diagnostic`],
/// and then printing it via [`codespan_reporting`].
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum ParserError { pub enum ParserError {
/// Raised by the lexer when we see some text that doesn't make
/// any sense in the language.
#[error("Invalid token")] #[error("Invalid token")]
InvalidToken(Location), InvalidToken(Location),
/// Raised when we're parsing the file and run into an EOF in a
/// place we really weren't expecting.
#[error("Unrecognized EOF")] #[error("Unrecognized EOF")]
UnrecognizedEOF(Location, Vec<String>), UnrecognizedEOF(Location, Vec<String>),
/// Raised when we're parsing the file, and run into a token in a
/// place we weren't expecting it.
#[error("Unrecognized token")] #[error("Unrecognized token")]
UnrecognizedToken(Location, Location, Token, Vec<String>), UnrecognizedToken(Location, Location, Token, Vec<String>),
/// Raised when we were expecting the end of the file, but instead
/// got another token.
#[error("Extra token")] #[error("Extra token")]
ExtraToken(Location, Token, Location), ExtraToken(Location, Token, Location),
/// Raised when the lexer had some sort of internal problem
/// and gave up.
#[error("Lexing failure")] #[error("Lexing failure")]
LexFailure(Location), LexFailure(Location),
/// Raised when we tried to reference a file, or add a file, to our
/// file database, and the database ran into a problem.
#[error("File database error")] #[error("File database error")]
FileDatabaseError(#[from] codespan_reporting::files::Error), FileDatabaseError(#[from] codespan_reporting::files::Error),
/// Raised when the OS is having problems giving us data.
#[error("Read error")] #[error("Read error")]
ReadError(#[from] std::io::Error), ReadError(#[from] std::io::Error),
} }
impl ParserError { impl ParserError {
/// Convert one of lalrpop's parser errors into one of our own, which we can more
/// easily translate into a [`Diagnostic`].
///
/// This function is relatively straightforward, because we match the errors pretty
/// closely. The major thing we do here is convert [`lalrpop`]'s notion of a location,
/// which is just an offset that it got from the lexer, into an actual location that
/// we can use in our [`Diagnostic`]s.
fn convert(file_idx: usize, err: ParseError<usize, Token, LexerError>) -> Self { fn convert(file_idx: usize, err: ParseError<usize, Token, LexerError>) -> Self {
match err { match err {
ParseError::InvalidToken { location } => { ParseError::InvalidToken { location } => {
@@ -105,6 +136,10 @@ impl ParserError {
} }
} }
/// This is just a nice little function to print out what we expected, if
/// we had some expectations. Because English is a little wonky, there's
/// some odd stuff with whether we get 0, 1, 2, or more, and it's nice to
/// just split that bit of logic out.
fn display_expected(expected: &[String]) -> String { fn display_expected(expected: &[String]) -> String {
match expected.len() { match expected.len() {
0 => "".to_string(), 0 => "".to_string(),
@@ -118,6 +153,8 @@ fn display_expected(expected: &[String]) -> String {
} }
} }
/// Given a list of strings, join them with a comma and a space, as in an
/// English list.
fn comma_separate(strings: &[String]) -> String { fn comma_separate(strings: &[String]) -> String {
let mut result = String::new(); let mut result = String::new();
@@ -189,6 +226,14 @@ impl<'a> From<&'a ParserError> for Diagnostic<usize> {
} }
impl Program { impl Program {
/// Parse the given file, adding it to the database as part of the process.
///
/// This operation reads the file from disk and adds it to the database for future
/// reference. If you get an error, we strongly suggest conversion to [`Diagnostic`]
/// and then reporting it to the user via [`codespan_reporting`]. You should use
/// this function if you're pretty sure that you've never seen this file before,
/// and [`Program::parse`] if you have and know its index and already have it in
/// memory.
pub fn parse_file( pub fn parse_file(
file_database: &mut SimpleFiles<String, String>, file_database: &mut SimpleFiles<String, String>,
file_name: &str, file_name: &str,
@@ -199,6 +244,11 @@ impl Program {
Program::parse(file_handle, file_db_info.source()) Program::parse(file_handle, file_db_info.source())
} }
/// Parse a block of text you have in memory, using the given index for [`Location`]s.
///
/// If you use a nonsensical file index, everything will work fine until you try to
/// report an error, at which point [`codespan_reporting`] may have some nasty things
/// to say to you.
pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> { pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> {
let lexer = Token::lexer(buffer) let lexer = Token::lexer(buffer)
.spanned() .spanned()
@@ -210,6 +260,12 @@ impl Program {
} }
impl Statement { impl Statement {
/// Parse a statement that you have in memory, using the given index for [`Location`]s.
///
/// As with [`Program::parse`], if you use a bad file index, you'll get weird behaviors
/// when you try to print errors, but things should otherwise work fine. This function
/// will only parse a single statement, which is useful in the REPL, but probably shouldn't
/// be used when reading in whole files.
pub fn parse(file_idx: usize, buffer: &str) -> Result<Statement, ParserError> { pub fn parse(file_idx: usize, buffer: &str) -> Result<Statement, ParserError> {
let lexer = Token::lexer(buffer) let lexer = Token::lexer(buffer)
.spanned() .spanned()

View File

@@ -1,12 +1,32 @@
use crate::syntax::Location; use crate::syntax::Location;
/// The set of valid binary operators.
pub static BINARY_OPERATORS: &[&str] = &["+", "-", "*", "/"]; pub static BINARY_OPERATORS: &[&str] = &["+", "-", "*", "/"];
/// A structure representing a parsed program.
///
/// One `Program` is associated with exactly one input file, and the
/// vector is arranged in exactly the same order as the parsed file.
/// Because this is the syntax layer, the program is guaranteed to be
/// syntactically valid, but may be nonsense. There could be attempts
/// to use unbound variables, for example, until after someone runs
/// `validate` and it comes back without errors.
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]
pub struct Program { pub struct Program {
pub statements: Vec<Statement>, pub statements: Vec<Statement>,
} }
/// A parsed statement.
///
/// Statements are guaranteed to be syntactically valid, but may be
/// complete nonsense at the semantic level. Which is to say, all the
/// print statements were correctly formatted, and all the variables
/// referenced are definitely valid symbols, but they may not have
/// been defined or anything.
///
/// Note that equivalence testing on statements is independent of
/// source location; it is testing if the two statements say the same
/// thing, not if they are the exact same statement.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub enum Statement { pub enum Statement {
Binding(Location, String, Expression), Binding(Location, String, Expression),
@@ -28,6 +48,12 @@ impl PartialEq for Statement {
} }
} }
/// An expression in the underlying syntax.
///
/// Like statements, these expressions are guaranteed to have been
/// formatted correctly, but may not actually make any sense. Also
/// like Statements, the [`PartialEq`] implementation does not take
/// source positions into account.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub enum Expression { pub enum Expression {
Value(Location, Value), Value(Location, Value),
@@ -54,7 +80,9 @@ impl PartialEq for Expression {
} }
} }
/// A value from the source syntax
#[derive(Clone, Debug, PartialEq, Eq)] #[derive(Clone, Debug, PartialEq, Eq)]
pub enum Value { pub enum Value {
/// The value of the number, and an optional base that it was written in
Number(Option<u8>, i64), Number(Option<u8>, i64),
} }

View File

@@ -4,11 +4,23 @@ use crate::eval::{EvalEnvironment, EvalError, Value};
use crate::syntax::{Expression, Program, Statement}; use crate::syntax::{Expression, Program, Statement};
impl Program { impl Program {
/// Evaluate the program, returning either an error or what it prints out when run.
///
/// Doing this evaluation is particularly useful for testing, to ensure that if we
/// modify a program in some way it does the same thing on both sides of the
/// transformation. It's also sometimes just nice to know what a program will be
/// doing.
///
/// Note that the errors here are slightly more strict than we enforce at runtime.
/// For example, we check for overflow and underflow errors during evaluation, and
/// we don't check for those in the compiled code.
pub fn eval(&self) -> Result<String, EvalError> { pub fn eval(&self) -> Result<String, EvalError> {
let mut env = EvalEnvironment::empty(); let mut env = EvalEnvironment::empty();
let mut stdout = String::new(); let mut stdout = String::new();
for stmt in self.statements.iter() { for stmt in self.statements.iter() {
// at this point, evaluation is pretty simple. just walk through each
// statement, in order, and record printouts as we come to them.
match stmt { match stmt {
Statement::Binding(_, name, value) => { Statement::Binding(_, name, value) => {
let actual_value = value.eval(&env)?; let actual_value = value.eval(&env)?;
@@ -40,6 +52,7 @@ impl Expression {
let mut arg_values = Vec::with_capacity(args.len()); let mut arg_values = Vec::with_capacity(args.len());
for arg in args.iter() { for arg in args.iter() {
// yay, recursion! makes this pretty straightforward
arg_values.push(arg.eval(env)?); arg_values.push(arg.eval(env)?);
} }

View File

@@ -2,6 +2,13 @@ use crate::syntax::{Expression, Location, Program, Statement};
use codespan_reporting::diagnostic::Diagnostic; use codespan_reporting::diagnostic::Diagnostic;
use std::collections::HashMap; use std::collections::HashMap;
/// An error we found while validating the input program.
///
/// These errors indicate that we should stop trying to compile
/// the program, because it's just fundamentally broken in a way
/// that we're not going to be able to work through. As with most
/// of these errors, we recommend converting this to a [`Diagnostic`]
/// and using [`codespan_reporting`] to present it to the user.
pub enum Error { pub enum Error {
UnboundVariable(Location, String), UnboundVariable(Location, String),
} }
@@ -16,6 +23,13 @@ impl From<Error> for Diagnostic<usize> {
} }
} }
/// A problem we found validating the input that isn't critical.
///
/// These are things that the user might want to do something about,
/// but we can keep going without it being a problem. As with most of
/// these things, if you want to present this information to the user,
/// the best way to do so is via [`From`] and [`Diagnostic`], and then
/// interactions via [`codespan_reporting`].
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub enum Warning { pub enum Warning {
ShadowedVariable(Location, Location, String), ShadowedVariable(Location, Location, String),
@@ -37,6 +51,11 @@ impl From<Warning> for Diagnostic<usize> {
} }
impl Program { impl Program {
/// Validate that the program makes semantic sense, not just syntactic sense.
///
/// This checks for things like references to variables that don't exist, for
/// example, and generates warnings for things that are inadvisable but not
/// actually a problem.
pub fn validate(&self) -> (Vec<Error>, Vec<Warning>) { pub fn validate(&self) -> (Vec<Error>, Vec<Warning>) {
let mut errors = vec![]; let mut errors = vec![];
let mut warnings = vec![]; let mut warnings = vec![];
@@ -53,6 +72,15 @@ impl Program {
} }
impl Statement { impl Statement {
/// Validate that the statement makes semantic sense, not just syntactic sense.
///
/// This checks for things like references to variables that don't exist, for
/// example, and generates warnings for things that are inadvisable but not
/// actually a problem. Since statements appear in a broader context, you'll
/// need to provide the set of variables that are bound where this statement
/// occurs. We use a `HashMap` to map these bound variables to the locations
/// where they're bound, because these locations are handy when generating errors
/// and warnings.
pub fn validate( pub fn validate(
&self, &self,
bound_variables: &mut HashMap<String, Location>, bound_variables: &mut HashMap<String, Location>,