📜 Add better documentation across the compiler. #3
@@ -10,7 +10,7 @@
|
|||||||
//! * Taking those tokens, and computing a basic syntax tree from them,
|
//! * Taking those tokens, and computing a basic syntax tree from them,
|
||||||
//! using our parser ([`ProgramParser`] or [`StatementParser`], generated
|
//! using our parser ([`ProgramParser`] or [`StatementParser`], generated
|
||||||
//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
|
//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
|
||||||
//! * Validating the tree we have parsed, using the [`validate`] module,
|
//! * Validating the tree we have parsed, using [`Program::validate`],
|
||||||
//! returning any warnings or errors we have found.
|
//! returning any warnings or errors we have found.
|
||||||
//!
|
//!
|
||||||
//! In addition to all of this, we make sure that the structures defined in this
|
//! In addition to all of this, we make sure that the structures defined in this
|
||||||
@@ -38,7 +38,7 @@ lalrpop_mod!(
|
|||||||
"/syntax/parser.rs"
|
"/syntax/parser.rs"
|
||||||
);
|
);
|
||||||
mod pretty;
|
mod pretty;
|
||||||
pub mod validate;
|
mod validate;
|
||||||
|
|
||||||
pub use crate::syntax::ast::*;
|
pub use crate::syntax::ast::*;
|
||||||
pub use crate::syntax::location::Location;
|
pub use crate::syntax::location::Location;
|
||||||
@@ -53,25 +53,56 @@ use proptest::{prop_assert, prop_assert_eq};
|
|||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// One of the many errors that can occur when processing text input.
|
||||||
|
///
|
||||||
|
/// If you get one of these and want to display it to the user, we strongly
|
||||||
|
/// suggest using the [`From`] implementation to turn this into a [`Diagnostic`],
|
||||||
|
/// and then printing it via [`codespan_reporting`].
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
pub enum ParserError {
|
pub enum ParserError {
|
||||||
|
/// Raised by the lexer when we see some text that doesn't make
|
||||||
|
/// any sense in the language.
|
||||||
#[error("Invalid token")]
|
#[error("Invalid token")]
|
||||||
InvalidToken(Location),
|
InvalidToken(Location),
|
||||||
|
|
||||||
|
/// Raised when we're parsing the file and run into an EOF in a
|
||||||
|
/// place we really weren't expecting.
|
||||||
#[error("Unrecognized EOF")]
|
#[error("Unrecognized EOF")]
|
||||||
UnrecognizedEOF(Location, Vec<String>),
|
UnrecognizedEOF(Location, Vec<String>),
|
||||||
|
|
||||||
|
/// Raised when we're parsing the file, and run into a token in a
|
||||||
|
/// place we weren't expecting it.
|
||||||
#[error("Unrecognized token")]
|
#[error("Unrecognized token")]
|
||||||
UnrecognizedToken(Location, Location, Token, Vec<String>),
|
UnrecognizedToken(Location, Location, Token, Vec<String>),
|
||||||
|
|
||||||
|
/// Raised when we were expecting the end of the file, but instead
|
||||||
|
/// got another token.
|
||||||
#[error("Extra token")]
|
#[error("Extra token")]
|
||||||
ExtraToken(Location, Token, Location),
|
ExtraToken(Location, Token, Location),
|
||||||
|
|
||||||
|
/// Raised when the lexer just had some sort of internal problem
|
||||||
|
/// and just gave up.
|
||||||
#[error("Lexing failure")]
|
#[error("Lexing failure")]
|
||||||
LexFailure(Location),
|
LexFailure(Location),
|
||||||
|
|
||||||
|
/// Raised when we tried to reference a file, or add a file, to our
|
||||||
|
/// file database, and the database ran into a problem.
|
||||||
#[error("File database error")]
|
#[error("File database error")]
|
||||||
FileDatabaseError(#[from] codespan_reporting::files::Error),
|
FileDatabaseError(#[from] codespan_reporting::files::Error),
|
||||||
|
|
||||||
|
/// Raised when the OS is having problems giving us data.
|
||||||
#[error("Read error")]
|
#[error("Read error")]
|
||||||
ReadError(#[from] std::io::Error),
|
ReadError(#[from] std::io::Error),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ParserError {
|
impl ParserError {
|
||||||
|
/// Convert one of lalrpop's parser errors into one of our own, which we can more
|
||||||
|
/// easily translate into a [`Diagnostic`].
|
||||||
|
///
|
||||||
|
/// This function is relatively straightforward, because we match the errors pretty
|
||||||
|
/// closely. The major thing we do here is convert [`lalrpop`]'s notion of a location,
|
||||||
|
/// which is just an offset that it got from the lexer, into an actual location that
|
||||||
|
/// we can use in our [`Diagnostic`]s.
|
||||||
fn convert(file_idx: usize, err: ParseError<usize, Token, LexerError>) -> Self {
|
fn convert(file_idx: usize, err: ParseError<usize, Token, LexerError>) -> Self {
|
||||||
match err {
|
match err {
|
||||||
ParseError::InvalidToken { location } => {
|
ParseError::InvalidToken { location } => {
|
||||||
@@ -105,6 +136,10 @@ impl ParserError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This is just a nice little function to print out what we expected, if
|
||||||
|
/// we had some expectations. Because English is a little wonky, there's
|
||||||
|
/// some odd stuff with whether we get 0, 1, 2, or more, and it's nice to
|
||||||
|
/// just split that bit of logic out.
|
||||||
fn display_expected(expected: &[String]) -> String {
|
fn display_expected(expected: &[String]) -> String {
|
||||||
match expected.len() {
|
match expected.len() {
|
||||||
0 => "".to_string(),
|
0 => "".to_string(),
|
||||||
@@ -118,6 +153,8 @@ fn display_expected(expected: &[String]) -> String {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Given a list of strings, separate them with commas (and a space), as in an
|
||||||
|
/// English list.
|
||||||
fn comma_separate(strings: &[String]) -> String {
|
fn comma_separate(strings: &[String]) -> String {
|
||||||
let mut result = String::new();
|
let mut result = String::new();
|
||||||
|
|
||||||
@@ -189,6 +226,14 @@ impl<'a> From<&'a ParserError> for Diagnostic<usize> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Program {
|
impl Program {
|
||||||
|
/// Parse the given file, adding it to the database as part of the process.
|
||||||
|
///
|
||||||
|
/// This operation reads the file from disk and adds it to the database for future
|
||||||
|
/// reference. If you get an error, we strongly suggest conversion to [`Diagnostic`]
|
||||||
|
/// and then reporting it to the user via [`codespan_reporting`]. You should use
|
||||||
|
/// this function if you're pretty sure that you've never seen this file before,
|
||||||
|
/// and [`Program::parse`] if you have and know its index and already have it in
|
||||||
|
/// memory.
|
||||||
pub fn parse_file(
|
pub fn parse_file(
|
||||||
file_database: &mut SimpleFiles<String, String>,
|
file_database: &mut SimpleFiles<String, String>,
|
||||||
file_name: &str,
|
file_name: &str,
|
||||||
@@ -199,6 +244,11 @@ impl Program {
|
|||||||
Program::parse(file_handle, file_db_info.source())
|
Program::parse(file_handle, file_db_info.source())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Parse a block of text you have in memory, using the given index for [`Location`]s.
|
||||||
|
///
|
||||||
|
/// If you use a nonsensical file index, everything will work fine until you try to
|
||||||
|
/// report an error, at which point [`codespan_reporting`] may have some nasty things
|
||||||
|
/// to say to you.
|
||||||
pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> {
|
pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> {
|
||||||
let lexer = Token::lexer(buffer)
|
let lexer = Token::lexer(buffer)
|
||||||
.spanned()
|
.spanned()
|
||||||
@@ -210,6 +260,12 @@ impl Program {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Statement {
|
impl Statement {
|
||||||
|
/// Parse a statement that you have in memory, using the given index for [`Location`]s.
|
||||||
|
///
|
||||||
|
/// As with [`Program::parse`], if you use a bad file index, you'll get weird behaviors
|
||||||
|
/// when you try to print errors, but things should otherwise work fine. This function
|
||||||
|
/// will only parse a single statement, which is useful in the REPL, but probably shouldn't
|
||||||
|
/// be used when reading in whole files.
|
||||||
pub fn parse(file_idx: usize, buffer: &str) -> Result<Statement, ParserError> {
|
pub fn parse(file_idx: usize, buffer: &str) -> Result<Statement, ParserError> {
|
||||||
let lexer = Token::lexer(buffer)
|
let lexer = Token::lexer(buffer)
|
||||||
.spanned()
|
.spanned()
|
||||||
|
|||||||
@@ -1,12 +1,32 @@
|
|||||||
use crate::syntax::Location;
|
use crate::syntax::Location;
|
||||||
|
|
||||||
|
/// The set of valid binary operators.
|
||||||
pub static BINARY_OPERATORS: &[&str] = &["+", "-", "*", "/"];
|
pub static BINARY_OPERATORS: &[&str] = &["+", "-", "*", "/"];
|
||||||
|
|
||||||
|
/// A structure representing a parsed program.
|
||||||
|
///
|
||||||
|
/// One `Program` is associated with exactly one input file, and the
|
||||||
|
/// vector is arranged in exactly the same order as the parsed file.
|
||||||
|
/// Because this is the syntax layer, the program is guaranteed to be
|
||||||
|
/// syntactically valid, but may be nonsense. There could be attempts
|
||||||
|
/// to use unbound variables, for example, until after someone runs
|
||||||
|
/// `validate` and it comes back without errors.
|
||||||
#[derive(Clone, Debug, PartialEq)]
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
pub struct Program {
|
pub struct Program {
|
||||||
pub statements: Vec<Statement>,
|
pub statements: Vec<Statement>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A parsed statement.
|
||||||
|
///
|
||||||
|
/// Statements are guaranteed to be syntactically valid, but may be
|
||||||
|
/// complete nonsense at the semantic level. Which is to say, all the
|
||||||
|
/// print statements were correctly formatted, and all the variables
|
||||||
|
/// referenced are definitely valid symbols, but they may not have
|
||||||
|
/// been defined or anything.
|
||||||
|
///
|
||||||
|
/// Note that equivalence testing on statements is independent of
|
||||||
|
/// source location; it is testing if the two statements say the same
|
||||||
|
/// thing, not if they are the exact same statement.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub enum Statement {
|
pub enum Statement {
|
||||||
Binding(Location, String, Expression),
|
Binding(Location, String, Expression),
|
||||||
@@ -28,6 +48,12 @@ impl PartialEq for Statement {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// An expression in the underlying syntax.
|
||||||
|
///
|
||||||
|
/// Like statements, these expressions are guaranteed to have been
|
||||||
|
/// formatted correctly, but may not actually make any sense. Also
|
||||||
|
/// like Statements, the [`PartialEq`] implementation does not take
|
||||||
|
/// source positions into account.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub enum Expression {
|
pub enum Expression {
|
||||||
Value(Location, Value),
|
Value(Location, Value),
|
||||||
@@ -54,7 +80,9 @@ impl PartialEq for Expression {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A value from the source syntax.
|
||||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||||
pub enum Value {
|
pub enum Value {
|
||||||
|
/// The value of the number, and an optional base that it was written in
|
||||||
Number(Option<u8>, i64),
|
Number(Option<u8>, i64),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,11 +4,23 @@ use crate::eval::{EvalEnvironment, EvalError, Value};
|
|||||||
use crate::syntax::{Expression, Program, Statement};
|
use crate::syntax::{Expression, Program, Statement};
|
||||||
|
|
||||||
impl Program {
|
impl Program {
|
||||||
|
/// Evaluate the program, returning either an error or what it prints out when run.
|
||||||
|
///
|
||||||
|
/// Doing this evaluation is particularly useful for testing, to ensure that if we
|
||||||
|
/// modify a program in some way it does the same thing on both sides of the
|
||||||
|
/// transformation. It's also sometimes just nice to know what a program will be
|
||||||
|
/// doing.
|
||||||
|
///
|
||||||
|
/// Note that the errors here are slightly stricter than what we enforce at runtime.
|
||||||
|
/// For example, we check for overflow and underflow errors during evaluation, and
|
||||||
|
/// we don't check for those in the compiled code.
|
||||||
pub fn eval(&self) -> Result<String, EvalError> {
|
pub fn eval(&self) -> Result<String, EvalError> {
|
||||||
let mut env = EvalEnvironment::empty();
|
let mut env = EvalEnvironment::empty();
|
||||||
let mut stdout = String::new();
|
let mut stdout = String::new();
|
||||||
|
|
||||||
for stmt in self.statements.iter() {
|
for stmt in self.statements.iter() {
|
||||||
|
// at this point, evaluation is pretty simple. just walk through each
|
||||||
|
// statement, in order, and record printouts as we come to them.
|
||||||
match stmt {
|
match stmt {
|
||||||
Statement::Binding(_, name, value) => {
|
Statement::Binding(_, name, value) => {
|
||||||
let actual_value = value.eval(&env)?;
|
let actual_value = value.eval(&env)?;
|
||||||
@@ -40,6 +52,7 @@ impl Expression {
|
|||||||
let mut arg_values = Vec::with_capacity(args.len());
|
let mut arg_values = Vec::with_capacity(args.len());
|
||||||
|
|
||||||
for arg in args.iter() {
|
for arg in args.iter() {
|
||||||
|
// yay, recursion! makes this pretty straightforward
|
||||||
arg_values.push(arg.eval(env)?);
|
arg_values.push(arg.eval(env)?);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,13 @@ use crate::syntax::{Expression, Location, Program, Statement};
|
|||||||
use codespan_reporting::diagnostic::Diagnostic;
|
use codespan_reporting::diagnostic::Diagnostic;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
/// An error we found while validating the input program.
|
||||||
|
///
|
||||||
|
/// These errors indicate that we should stop trying to compile
|
||||||
|
/// the program, because it's just fundamentally broken in a way
|
||||||
|
/// that we're not going to be able to work through. As with most
|
||||||
|
/// of these errors, we recommend converting this to a [`Diagnostic`]
|
||||||
|
/// and using [`codespan_reporting`] to present them to the user.
|
||||||
pub enum Error {
|
pub enum Error {
|
||||||
UnboundVariable(Location, String),
|
UnboundVariable(Location, String),
|
||||||
}
|
}
|
||||||
@@ -16,6 +23,13 @@ impl From<Error> for Diagnostic<usize> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A problem we found validating the input that isn't critical.
|
||||||
|
///
|
||||||
|
/// These are things that the user might want to do something about,
|
||||||
|
/// but we can keep going without it being a problem. As with most of
|
||||||
|
/// these things, if you want to present this information to the user,
|
||||||
|
/// the best way to do so is via [`From`] and [`Diagnostic`], and then
|
||||||
|
/// interactions via [`codespan_reporting`].
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Debug, PartialEq, Eq)]
|
||||||
pub enum Warning {
|
pub enum Warning {
|
||||||
ShadowedVariable(Location, Location, String),
|
ShadowedVariable(Location, Location, String),
|
||||||
@@ -37,6 +51,11 @@ impl From<Warning> for Diagnostic<usize> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Program {
|
impl Program {
|
||||||
|
/// Validate that the program makes semantic sense, not just syntactic sense.
|
||||||
|
///
|
||||||
|
/// This checks for things like references to variables that don't exist, for
|
||||||
|
/// example, and generates warnings for things that are inadvisable but not
|
||||||
|
/// actually a problem.
|
||||||
pub fn validate(&self) -> (Vec<Error>, Vec<Warning>) {
|
pub fn validate(&self) -> (Vec<Error>, Vec<Warning>) {
|
||||||
let mut errors = vec![];
|
let mut errors = vec![];
|
||||||
let mut warnings = vec![];
|
let mut warnings = vec![];
|
||||||
@@ -53,6 +72,15 @@ impl Program {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Statement {
|
impl Statement {
|
||||||
|
/// Validate that the statement makes semantic sense, not just syntactic sense.
|
||||||
|
///
|
||||||
|
/// This checks for things like references to variables that don't exist, for
|
||||||
|
/// example, and generates warnings for things that are inadvisable but not
|
||||||
|
/// actually a problem. Since statements appear in a broader context, you'll
|
||||||
|
/// need to provide the set of variables that are bound where this statement
|
||||||
|
/// occurs. We use a `HashMap` to map these bound variables to the locations
|
||||||
|
/// where they're bound, because these locations are handy when generating errors
|
||||||
|
/// and warnings.
|
||||||
pub fn validate(
|
pub fn validate(
|
||||||
&self,
|
&self,
|
||||||
bound_variables: &mut HashMap<String, Location>,
|
bound_variables: &mut HashMap<String, Location>,
|
||||||
|
|||||||
Reference in New Issue
Block a user