Add the rest of the documentation around syntax.

2023-05-12 16:42:48 -07:00
parent 309983ef3e
commit 62e27398be
4 changed files with 127 additions and 2 deletions
--- a/src/syntax.rs
+++ b/src/syntax.rs
@@ -10,7 +10,7 @@
 //!   * Taking those tokens, and computing a basic syntax tree from them,
 //!     using our parser ([`ProgramParser`] or [`StatementParser`], generated
 //!     by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
-//!   * Validating the tree we have parsed, using the [`validate`] module,
+//!   * Validating the tree we have parsed, using [`Program::validate`],
 //!     returning any warnings or errors we have found.
 //!
 //! In addition to all of this, we make sure that the structures defined in this
@@ -38,7 +38,7 @@ lalrpop_mod!(
    "/syntax/parser.rs"
 );
 mod pretty;
-pub mod validate;
+mod validate;
 pub use crate::syntax::ast::*;
 pub use crate::syntax::location::Location;
@@ -53,25 +53,56 @@ use proptest::{prop_assert, prop_assert_eq};
 use std::str::FromStr;
 use thiserror::Error;
 /// One of the many errors that can occur when processing text input.
 ///
 /// If you get one of these and want to display it to the user, we strongly
 /// suggest using the [`From`] implementation to turn this into a [`Diagnostic`],
 /// and then printing it via [`codespan_reporting`].
 ///
 /// The `#[error(...)]` strings below are the short messages used by the
 /// derived `Display` implementation; none of them interpolate the payload
 /// fields, so use the [`Diagnostic`] conversion when you need the details.
 #[derive(Debug, Error)]
 pub enum ParserError {
    /// Raised by the lexer when we see some text that doesn't make
    /// any sense in the language. The payload is the [`Location`]
    /// associated with the error.
    #[error("Invalid token")]
    InvalidToken(Location),
    /// Raised when we're parsing the file and run into an EOF in a
    /// place we really weren't expecting.
    ///
    /// NOTE(review): the `Vec<String>` is presumably the list of things the
    /// parser would have accepted at this point -- confirm against `convert`.
    #[error("Unrecognized EOF")]
    UnrecognizedEOF(Location, Vec<String>),
    /// Raised when we're parsing the file, and run into a token in a
    /// place we weren't expecting it.
    ///
    /// NOTE(review): the two [`Location`]s presumably mark the start and end
    /// of the offending token, and the `Vec<String>` the things the parser
    /// would have accepted instead -- confirm against `convert`.
    #[error("Unrecognized token")]
    UnrecognizedToken(Location, Location, Token, Vec<String>),
    /// Raised when we were expecting the end of the file, but instead
    /// got another token.
    ///
    /// NOTE(review): the [`Location`]s on either side of the token presumably
    /// mark where it starts and ends -- confirm against `convert`.
    #[error("Extra token")]
    ExtraToken(Location, Token, Location),
    /// Raised when the lexer had some sort of internal problem and
    /// simply gave up.
    #[error("Lexing failure")]
    LexFailure(Location),
    /// Raised when we tried to reference a file, or add a file, to our
    /// file database, and the database ran into a problem.
    ///
    /// Thanks to `#[from]`, a [`codespan_reporting::files::Error`] converts
    /// into this variant automatically (for example, via `?`).
    #[error("File database error")]
    FileDatabaseError(#[from] codespan_reporting::files::Error),
    /// Raised when the OS is having problems giving us data.
    ///
    /// Thanks to `#[from]`, a [`std::io::Error`] converts into this variant
    /// automatically (for example, via `?`).
    #[error("Read error")]
    ReadError(#[from] std::io::Error),
 }
 impl ParserError {
    /// Convert one of lalrpop's parser errors into one of our own, for which we can
    /// more easily implement translation into a [`Diagnostic`].
    /// 
    /// This function is relatively straightforward, because we match the errors pretty
    /// closely. The major thing we do here is convert [`lalrpop`]'s notion of a location,
    /// which is just an offset that it got from the lexer, into an actual location that
    /// we can use in our [`Diagnostic`]s.
    fn convert(file_idx: usize, err: ParseError<usize, Token, LexerError>) -> Self {
        match err {
            ParseError::InvalidToken { location } => {
@@ -105,6 +136,10 @@ impl ParserError {
    }
 }
 /// This is just a nice little function to print out what we expected, if
 /// we had some expectations. Because English is a little wonky, there's
 /// some odd stuff with whether we get 0, 1, 2, or more, and it's nice to
 /// just split that bit of logic out.
 fn display_expected(expected: &[String]) -> String {
    match expected.len() {
        0 => "".to_string(),
@@ -118,6 +153,8 @@ fn display_expected(expected: &[String]) -> String {
    }
 }
 /// Given a list of strings, separate them with commas (each followed by a
 /// space), as in an English list.
 fn comma_separate(strings: &[String]) -> String {
    let mut result = String::new();
@@ -189,6 +226,14 @@ impl<'a> From<&'a ParserError> for Diagnostic<usize> {
 }
 impl Program {
    /// Parse the given file, adding it to the database as part of the process.
    /// 
    /// This operation reads the file from disk and adds it to the database for future
    /// reference. If you get an error, we strongly suggest conversion to [`Diagnostic`]
    /// and then reporting it to the user via [`codespan_reporting`]. You should use
    /// this function if you're pretty sure that you've never seen this file before,
    /// and [`Program::parse`] if you have and know its index and already have it in
    /// memory.
    pub fn parse_file(
        file_database: &mut SimpleFiles<String, String>,
        file_name: &str,
@@ -199,6 +244,11 @@ impl Program {
        Program::parse(file_handle, file_db_info.source())
    }
    /// Parse a block of text you have in memory, using the given index for [`Location`]s.
    /// 
    /// If you use a nonsensical file index, everything will work fine until you try to
    /// report an error, at which point [`codespan_reporting`] may have some nasty things
    /// to say to you.
    pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> {
        let lexer = Token::lexer(buffer)
            .spanned()
@@ -210,6 +260,12 @@ impl Program {
 }
 impl Statement {
    /// Parse a statement that you have in memory, using the given index for [`Location`]s.
    /// 
    /// As with [`Program::parse`], if you use a bad file index, you'll get weird behaviors
    /// when you try to print errors, but things should otherwise work fine. This function
    /// will only parse a single statement, which is useful in the REPL, but probably shouldn't
    /// be used when reading in whole files.
    pub fn parse(file_idx: usize, buffer: &str) -> Result<Statement, ParserError> {
        let lexer = Token::lexer(buffer)
            .spanned()
--- a/src/syntax/ast.rs
+++ b/src/syntax/ast.rs
@@ -1,12 +1,32 @@
 use crate::syntax::Location;
 /// The set of valid binary operators, as string slices: `+`, `-`, `*`,
 /// and `/`.
 pub static BINARY_OPERATORS: &[&str] = &["+", "-", "*", "/"];
 /// A structure representing a parsed program.
 ///
 /// One `Program` is associated with exactly one input file, and the
 /// `statements` vector is arranged in exactly the same order as the
 /// parsed file. Because this is the syntax layer, the program is
 /// guaranteed to be syntactically valid, but may be nonsense. There
 /// could be attempts to use unbound variables, for example, until
 /// after someone runs `validate` and it comes back without errors.
 #[derive(Clone, Debug, PartialEq)]
 pub struct Program {
    /// The statements of the program, in the order they appeared in the
    /// parsed input.
    pub statements: Vec<Statement>,
 }
 /// A parsed statement.
 /// 
 /// Statements are guaranteed to be syntactically valid, but may be
 /// complete nonsense at the semantic level. Which is to say, all the
 /// print statements were correctly formatted, and all the variables
 /// referenced are definitely valid symbols, but they may not have
 /// been defined or anything. 
 /// 
 /// Note that equivalence testing on statements is independent of
 /// source location; it is testing if the two statements say the same
 /// thing, not if they are the exact same statement.
 #[derive(Clone, Debug)]
 pub enum Statement {
    Binding(Location, String, Expression),
@@ -28,6 +48,12 @@ impl PartialEq for Statement {
    }
 }
 /// An expression in the underlying syntax.
 /// 
 /// Like statements, these expressions are guaranteed to have been
 /// formatted correctly, but may not actually make any sense. Also
 /// like Statements, the [`PartialEq`] implementation does not take
 /// source positions into account.
 #[derive(Clone, Debug)]
 pub enum Expression {
    Value(Location, Value),
@@ -54,7 +80,9 @@ impl PartialEq for Expression {
    }
 }
 /// A value from the source syntax.
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub enum Value {
    /// A number: first the optional base that it was written in (the
    /// `Option<u8>`), then the value of the number itself (the `i64`).
    Number(Option<u8>, i64),
 }
--- a/src/syntax/eval.rs
+++ b/src/syntax/eval.rs
@@ -4,11 +4,23 @@ use crate::eval::{EvalEnvironment, EvalError, Value};
 use crate::syntax::{Expression, Program, Statement};
 impl Program {
    /// Evaluate the program, returning either an error or what it prints out when run.
    /// 
    /// Doing this evaluation is particularly useful for testing, to ensure that if we
    /// modify a program in some way it does the same thing on both sides of the
    /// transformation. It's also sometimes just nice to know what a program will be
    /// doing.
    /// 
    /// Note that the errors here are slightly more strict than what we enforce at runtime.
    /// For example, we check for overflow and underflow errors during evaluation, and
    /// we don't check for those in the compiled code.
    pub fn eval(&self) -> Result<String, EvalError> {
        let mut env = EvalEnvironment::empty();
        let mut stdout = String::new();
        for stmt in self.statements.iter() {
            // at this point, evaluation is pretty simple. just walk through each
            // statement, in order, and record printouts as we come to them.
            match stmt {
                Statement::Binding(_, name, value) => {
                    let actual_value = value.eval(&env)?;
@@ -40,6 +52,7 @@ impl Expression {
                let mut arg_values = Vec::with_capacity(args.len());
                for arg in args.iter() {
                    // yay, recursion! makes this pretty straightforward
                    arg_values.push(arg.eval(env)?);
                }
--- a/src/syntax/validate.rs
+++ b/src/syntax/validate.rs
@@ -2,6 +2,13 @@ use crate::syntax::{Expression, Location, Program, Statement};
 use codespan_reporting::diagnostic::Diagnostic;
 use std::collections::HashMap;
 /// An error we found while validating the input program.
 ///
 /// These errors indicate that we should stop trying to compile
 /// the program, because it's just fundamentally broken in a way
 /// that we're not going to be able to work through. As with most
 /// of these errors, we recommend converting this to a [`Diagnostic`]
 /// and using [`codespan_reporting`] to present them to the user.
 pub enum Error {
    /// A variable was used without being bound.
    ///
    /// NOTE(review): the fields are presumably the [`Location`] of the
    /// offending reference and the name of the variable -- confirm against
    /// the validation code that constructs this variant.
    UnboundVariable(Location, String),
 }
@@ -16,6 +23,13 @@ impl From<Error> for Diagnostic<usize> {
    }
 }
 /// A problem we found validating the input that isn't critical.
 /// 
 /// These are things that the user might want to do something about,
 /// but we can keep going without it being a problem. As with most of
 /// these things, if you want to present this information to the user,
 /// the best way to do so is via [`From`] and [`Diagnostic`], and then
 /// interactions via [`codespan_reporting`].
 #[derive(Debug, PartialEq, Eq)]
 pub enum Warning {
    ShadowedVariable(Location, Location, String),
@@ -37,6 +51,11 @@ impl From<Warning> for Diagnostic<usize> {
 }
 impl Program {
    /// Validate that the program makes semantic sense, not just syntactic sense.
    /// 
    /// This checks for things like references to variables that don't exist, for
    /// example, and generates warnings for things that are inadvisable but not
    /// actually a problem. 
    pub fn validate(&self) -> (Vec<Error>, Vec<Warning>) {
        let mut errors = vec![];
        let mut warnings = vec![];
@@ -53,6 +72,15 @@ impl Program {
 }
 impl Statement {
    /// Validate that the statement makes semantic sense, not just syntactic sense.
    /// 
    /// This checks for things like references to variables that don't exist, for
    /// example, and generates warnings for things that are inadvisable but not
    /// actually a problem. Since statements appear in a broader context, you'll
    /// need to provide the set of variables that are bound where this statement
    /// occurs. We use a `HashMap` to map these bound variables to the locations
    /// where they're bound, because these locations are handy when generating errors
    /// and warnings.
    pub fn validate(
        &self,
        bound_variables: &mut HashMap<String, Location>,