diff --git a/Cargo.toml b/Cargo.toml index 59b8033..0451d2e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ cranelift-module = "0.94.0" cranelift-native = "0.94.0" cranelift-object = "0.94.0" internment = { version = "0.7.0", default-features = false, features = ["arc"] } -lalrpop-util = "^0.19.7" +lalrpop-util = "^0.20.0" lazy_static = "^1.4.0" logos = "^0.12.0" pretty = { version = "^0.11.2", features = ["termcolor"] } @@ -30,4 +30,4 @@ tempfile = "^3.5.0" thiserror = "^1.0.30" [build-dependencies] -lalrpop = "^0.19.7" +lalrpop = "^0.20.0" diff --git a/src/backend.rs b/src/backend.rs index f70a52f..b6b8808 100644 --- a/src/backend.rs +++ b/src/backend.rs @@ -1,3 +1,31 @@ +//! # The compiler backend: generation of machine code, both static and JIT. +//! +//! This module is responsible for taking our intermediate representation from +//! [`crate::ir`] and turning it into Cranelift and then into object code that +//! can either be saved to disk or run in memory. Because the runtime functions +//! for NGR are very closely tied to the compiler implementation, we also include +//! information about these functions as part of the module. +//! +//! ## Using the `Backend` +//! +//! The backend of this compiler can be used in two modes: a static compilation +//! mode, where the goal is to write the compiled object to disk and then link +//! it later, and a JIT mode, where the goal is to write the compiled object to +//! memory and then run it. Both modes use the same `Backend` object, because +//! they share a lot of behaviors. However, you'll want to use different variants +//! based on your goals: +//! +//! * Use `Backend`, constructed via [`Backend::object_file`], +//! if you want to compile to an object file on disk, which you're then going +//! to link to later. +//! * Use `Backend`, constructed via [`Backend::jit`], if you want +//! to do just-in-time compilation and are just going to run things immediately. +//! +//! ## Working with Runtime Functions +//! 
+//! For now, runtime functions are pretty easy to describe, because there's +//! only one. In the future, though, the [`RuntimeFunctions`] object is there to +//! help provide a clean interface to them all. mod error; mod eval; mod into_crane; @@ -16,6 +44,15 @@ use target_lexicon::Triple; const EMPTY_DATUM: [u8; 8] = [0; 8]; +/// An object representing an active backend. +/// +/// Internally, this object holds a bunch of state useful for compiling one +/// or more functions into an object file or memory. It can be passed around, +/// but cannot currently be duplicated because some of that state is not +/// easily duplicated. You should be able to share this across threads, assuming +/// normal Rust safety, but you should be thoughtful about transferring it across +/// processes in a JIT context due to some special cases in the runtime function +/// implementations. pub struct Backend { pub module: M, data_ctx: DataContext, @@ -26,6 +63,12 @@ pub struct Backend { } impl Backend { + /// Create a new JIT backend for compiling NGR into memory. + /// + /// The provided output buffer is not for the compiled code, but for the output + /// of any `print` expressions that are evaluated. If set to `None`, the output + /// will be written to `stdout` as per normal, but if a String buffer is provided, + /// it will be extended by any `print` statements that happen during code execution. pub fn jit(output_buffer: Option) -> Result { let platform = Triple::host(); let isa_builder = isa::lookup(platform.clone())?; @@ -50,12 +93,24 @@ impl Backend { }) } + /// Given a compiled function ID, get a pointer to where that function was written + /// in memory. + /// + /// The data at this pointer should not be mutated unless you really, really, + /// really know what you're doing. It can be run by casting it into a Rust + /// `fn() -> ()`, and then calling it from normal Rust. 
pub fn bytes(&self, function_id: FuncId) -> *const u8 { self.module.get_finalized_function(function_id) } } impl Backend { + /// Generate a backend for compiling into an object file for the given target. + /// + /// This backend will generate a single output file per `Backend` object, although + /// that file may have multiple functions defined within it. Data between those + /// functions (in particular, strings) will be defined once and shared between + /// the different functions. pub fn object_file(platform: Triple) -> Result { let isa_builder = isa::lookup(platform.clone())?; let mut settings_builder = settings::builder(); @@ -76,12 +131,22 @@ impl Backend { }) } + /// Given all the functions defined, return the bytes the object file should contain. pub fn bytes(self) -> Result, BackendError> { self.module.finish().emit().map_err(Into::into) } } impl Backend { + /// Define a string within the current backend. + /// + /// Note that this is a Cranelift [`DataId`], which then must be redeclared inside the + /// context of any functions or data items that want to use it. That being said, the + /// string value will be defined once in the file and then shared by all referencers. + /// + /// This function will automatically add a null character (`'\0'`) to the end of the + /// string, to ensure that strings are null-terminated for interactions with other + /// languages. pub fn define_string(&mut self, s: &str) -> Result { let name = format!("{}", s); let s0 = format!("{}\0", s); @@ -97,6 +162,11 @@ impl Backend { Ok(global_id) } + /// Define a global variable within the current backend. + /// + /// These variables can be shared between functions, and will be exported from the + /// module itself as public data in the case of static compilation. Their initial + /// value will be null. 
pub fn define_variable(&mut self, name: String) -> Result { self.data_ctx.define(Box::new(EMPTY_DATUM)); let id = self @@ -108,6 +178,11 @@ impl Backend { Ok(id) } + /// Get a pointer to the output buffer for `print`ing, or `null`. + /// + /// As suggested, returns `null` in the case where the user has not provided an + /// output buffer; it is your responsibility to check for this case and do + /// something sensible. pub fn output_buffer_ptr(&mut self) -> *mut String { if let Some(str) = self.output_buffer.as_mut() { str as *mut String @@ -116,6 +191,10 @@ impl Backend { } } + /// Get any captured output `print`ed by the program during execution. + /// + /// If an output buffer was not provided, or if the program has not done any + /// printing, then this function will return an empty string. pub fn output(self) -> String { if let Some(s) = self.output_buffer { s diff --git a/src/backend/error.rs b/src/backend/error.rs index 3eb3118..caa9e59 100644 --- a/src/backend/error.rs +++ b/src/backend/error.rs @@ -2,8 +2,27 @@ use crate::backend::runtime::RuntimeFunctionError; use codespan_reporting::diagnostic::Diagnostic; use cranelift_codegen::{isa::LookupError, settings::SetError, CodegenError}; use cranelift_module::ModuleError; +use internment::ArcIntern; use thiserror::Error; +/// An error in the translation to a backend (either the JIT or the static compiler). +/// +/// In general, this is just a nice summary error type for a bunch of downstream +/// errors; the exception are internal errors from builtin functions or variable +/// lookups. +/// +/// Unlike some other errors in the system, the translation to a `Diagnostic` does +/// not necessarily provide a whole lot of value, because we have lost most of the +/// source information by the time we're generating these errors. 
That being said, +/// people who want to provide nicer error messages might consider using the +/// translation through `Diagnostic` anyways, just in case we add more information +/// in the future. +/// +/// Finally, the `PartialEq` for this type is a bit fuzzy. In some cases, it +/// ensures that the errors match exactly. In other cases, though, it just checks to +/// see if the two errors are of the same class; e.g., it will return true if both +/// errors are `BackendError::CodegenError`, regardless of what the specific +/// `CodegenError` is. #[derive(Debug, Error)] pub enum BackendError { #[error("Cranelift module error: {0}")] @@ -11,7 +30,7 @@ pub enum BackendError { #[error("Builtin function error: {0}")] BuiltinError(#[from] RuntimeFunctionError), #[error("Internal variable lookup error")] - VariableLookupFailure, + VariableLookupFailure(ArcIntern), #[error(transparent)] CodegenError(#[from] CodegenError), #[error(transparent)] @@ -31,9 +50,8 @@ impl From for Diagnostic { BackendError::BuiltinError(me) => { Diagnostic::error().with_message(format!("Internal runtime function error: {}", me)) } - BackendError::VariableLookupFailure => { - Diagnostic::error().with_message("Internal variable lookup error!") - } + BackendError::VariableLookupFailure(x) => Diagnostic::error() + .with_message(format!("Internal variable lookup error for {}", x)), BackendError::CodegenError(me) => { Diagnostic::error().with_message(format!("Internal codegen error: {}", me)) } @@ -58,8 +76,12 @@ impl PartialEq for BackendError { _ => false, }, + // because the underlying `CodegenError` doesn't implement `PartialEq`, + // we just check that they're both `CodegenError`s. BackendError::CodegenError(_) => matches!(other, BackendError::CodegenError(_)), + // because the underlying `ModuleError` doesn't implement `PartialEq`, + // we just check that they're both `Cranelift`s. 
BackendError::Cranelift(_) => matches!(other, BackendError::Cranelift(_)), BackendError::LookupError(a) => match other { @@ -72,7 +94,10 @@ impl PartialEq for BackendError { _ => false, }, - BackendError::VariableLookupFailure => other == &BackendError::VariableLookupFailure, + BackendError::VariableLookupFailure(a) => match other { + BackendError::VariableLookupFailure(b) => a == b, + _ => false, + }, BackendError::Write(a) => match other { BackendError::Write(b) => a == b, diff --git a/src/backend/eval.rs b/src/backend/eval.rs index a57c625..e9c88f1 100644 --- a/src/backend/eval.rs +++ b/src/backend/eval.rs @@ -8,6 +8,19 @@ use cranelift_object::ObjectModule; use target_lexicon::Triple; impl Backend { + /// Evaluate the given IR, returning the output it prints. + /// + /// This builds and executes the program using the JIT backend, using a fresh JIT runtime + /// that should be independent of any other runtimes being executed. As such, it should be + /// impossible for a program being executed by this function to interact with another, parallel + /// execution of the function. If you actually want them to interact, you'll need to combine + /// them into the same `Program` before execution. + /// + /// One important note: The runtime used by this function does not currently implement + /// overflow/underflow erroring the same way that other evaluation functions within this + /// library do. So, if you're validating equivalence between them, you'll want to weed + /// out examples that overflow/underflow before checking equivalence. (This is the behavior + /// of the built-in test systems.) pub fn eval(program: Program) -> Result { let mut jitter = Backend::jit(Some(String::new()))?; let function_id = jitter.compile_function("test", program)?; @@ -20,6 +33,20 @@ impl Backend { } impl Backend { + /// Evaluate the given IR, returning the output it prints. 
+ /// + /// This builds the program as a standalone object in a temporary directory, and then links + /// and runs it using the provided runtime system (see `CARGO_MANIFEST_DIR/runtime/`). To + /// do so, it assumes that there is a version of `clang` available in the current PATH. + /// + /// This routine is regularly tested under Windows, Mac, and Linux, and should work across + /// other platforms that support `clang`. + /// + /// One important note: The runtime used by this function does not currently implement + /// overflow/underflow erroring the same way that other evaluation functions within this + /// library do. So, if you're validating equivalence between them, you'll want to weed + /// out examples that overflow/underflow before checking equivalence. (This is the behavior + /// of the built-in test systems.) pub fn eval(program: Program) -> Result { //use pretty::{Arena, Pretty}; //let allocator = Arena::<()>::new(); @@ -40,18 +67,26 @@ impl Backend { if output.status.success() { Ok(std::string::String::from_utf8_lossy(&output.stdout).to_string()) } else { - Err(EvalError::IO(format!( - "Exitted with error code {}", - output.status - ))) + Err(EvalError::ExitCode(output.status)) } } else { - Err(EvalError::IO( + Err(EvalError::RuntimeOutput( std::string::String::from_utf8_lossy(&output.stderr).to_string(), )) } } + /// Link the generated object into an executable. + /// + /// Currently, our runtime system is a single file, and ends up being the function + /// that includes `main`. (It then calls the `gogogo` function which serves as the + /// entry point for our compiled code.) This function thus just uses `clang` to + /// compile the C file with the generated object file to produce the executable. + /// Conveniently, `clang` also sets execute permissions under unix-like file systems. 
+ /// + /// This function assumes that this compilation and linking should run without any + /// output, so changes to the RTS should make 100% sure that they do not generate + /// any compiler warnings. fn link(object_file: &Path, executable_path: &Path) -> Result<(), EvalError> { use std::path::PathBuf; @@ -67,7 +102,7 @@ impl Backend { .output()?; if !output.stderr.is_empty() { - return Err(EvalError::IO( + return Err(EvalError::Linker( std::string::String::from_utf8_lossy(&output.stderr).to_string(), )); } @@ -77,12 +112,17 @@ impl Backend { } proptest::proptest! { + // This is the obvious test to make sure that our static compilation path works + // without error, assuming any possible input ... well, any possible input that + // doesn't involve overflow or underflow. #[test] - fn file_backend_works(program: Program) { + fn static_backend(program: Program) { use crate::eval::PrimOpError; let basic_result = program.eval(); + // windows `printf` is going to terminate lines with "\r\n", so we need to adjust + // our test result here. #[cfg(target_family="windows")] let basic_result = basic_result.map(|x| x.replace('\n', "\r\n")); @@ -92,8 +132,11 @@ proptest::proptest! { } } + // This is the obvious test to make sure that our JIT compilation path works + // without error, assuming any possible input ... well, any possible input that + // doesn't involve overflow or underflow. 
#[test] - fn jit_backend_works(program: Program) { + fn jit_backend(program: Program) { use crate::eval::PrimOpError; let basic_result = program.eval(); diff --git a/src/backend/into_crane.rs b/src/backend/into_crane.rs index 6ceff02..5965a9f 100644 --- a/src/backend/into_crane.rs +++ b/src/backend/into_crane.rs @@ -8,15 +8,31 @@ use cranelift_codegen::ir::{ use cranelift_codegen::isa::CallConv; use cranelift_codegen::Context; use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable}; -use cranelift_module::{FuncId, Linkage, Module, ModuleError}; +use cranelift_module::{FuncId, Linkage, Module}; use internment::ArcIntern; use crate::backend::error::BackendError; use crate::backend::Backend; +/// When we're compiling, we might need to reference some of the strings built into +/// the source code; to do so, we need a `GlobalValue`. Perhaps unexpectedly, given +/// the name, `GlobalValue`s are specific to a single function we're compiling, so +/// we end up computing this table for every function. +/// +/// This is just a handy type alias to avoid a lot of confusion in the functions. type StringTable = HashMap, GlobalValue>; impl Backend { + /// Compile the given `Program` into a function with the given name. + /// + /// At some point, the use of `Program` is going to change; however, for the + /// moment, we have no notion of a function in our language so the whole input + /// is converted into a single output function. The type of the generated + /// function is, essentially, `fn() -> ()`: it takes no arguments and returns + /// no value. + /// + /// The function provided can then be either written to a file (if using a + /// static Cranelift backend) or executed directly (if using the Cranelift JIT). pub fn compile_function( &mut self, function_name: &str, @@ -28,21 +44,47 @@ impl Backend { call_conv: CallConv::SystemV, }; + // this generates the handle for the function that we'll eventually want to + // return to the user. 
For now, we declare all functions defined by this + // function as public/global/exported, although we may want to reconsider + // this decision later. let func_id = self.module .declare_function(function_name, Linkage::Export, &basic_signature)?; - let mut ctx = Context::new(); - ctx.func = - Function::with_name_signature(UserFuncName::user(0, func_id.as_u32()), basic_signature); + // Next we have to generate the compilation context for the rest of this + // function. Currently, we generate a fresh context for every function. + // Since we're only generating one function per `Program`, this makes + // complete sense. However, in the future, we may want to revisit this + // decision. + let mut ctx = Context::new(); + let user_func_name = UserFuncName::user(0, func_id.as_u32()); + ctx.func = Function::with_name_signature(user_func_name, basic_signature); + + // We generate a table of every string that we use in the program, here. + // Cranelift is going to require us to have this in a particular structure + // (`GlobalValue`) so that we can reference them later, and it's going to + // be tricky to generate those on the fly. So we just generate the set we + // need here, and then have it around in the table for later. let string_table = self.build_string_table(&mut ctx.func, &program)?; - let mut variable_table = HashMap::new(); - let mut next_var_num = 1; + + // In the future, we might want to see what runtime functions the function + // we were given uses, and then only include those functions that we care + // about. Presumably, we'd use some sort of lookup table like we do for + // strings. But for now, we only have one runtime function, and we're pretty + // sure we're always going to use it, so we just declare it (and reference + // it) directly. 
let print_func_ref = self.runtime_functions.include_runtime_function( "print", &mut self.module, &mut ctx.func, )?; + + // In the case of the JIT, there may be symbols we've already defined outside + // the context of this particular `Program`, which we might want to reference. + // Just like with strings, generating the `GlobalValue`s we need can potentially + // be a little tricky to do on the fly, so we generate the complete list right + // here and then use it later. let pre_defined_symbols: HashMap = self .defined_symbols .iter() @@ -52,67 +94,88 @@ impl Backend { }) .collect(); + // The last table we're going to need is our local variable table, to store + // variables used in this `Program` but not used outside of it. For whatever + // reason, Cranelift requires us to generate unique indexes for each of our + // variables; we just use a simple incrementing counter for that. + let mut variable_table = HashMap::new(); + let mut next_var_num = 1; + + // Finally (!), we generate the function builder that we're going to use to + // make this function! let mut fctx = FunctionBuilderContext::new(); let mut builder = FunctionBuilder::new(&mut ctx.func, &mut fctx); + + // Make the initial block to put instructions in. Later, when we have control + // flow, we might add more blocks after this one. But, for now, we only have + // the one block. let main_block = builder.create_block(); builder.switch_to_block(main_block); + // Compiling a function is just compiling each of the statements in order. + // At the moment, we do the pattern match for statements here, and then + // directly compile the statements. If/when we add more statement forms, + // this is likely to become more cumbersome, and we'll want to separate + // these off. But for now, given the amount of tables we keep around to track + // state, it's easier to just include them. for stmt in program.statements.drain(..) 
{ match stmt { + // Print statements are fairly easy to compile: we just lookup the + // output buffer, the address of the string to print, and the value + // of whatever variable we're printing. Then we just call print. Statement::Print(ann, var) => { + // Get the output buffer (or null) from our general compilation context. let buffer_ptr = self.output_buffer_ptr(); let buffer_ptr = builder.ins().iconst(types::I64, buffer_ptr as i64); + + // Get a reference to the string we want to print. let local_name_ref = string_table.get(&var).unwrap(); let name_ptr = builder.ins().symbol_value(types::I64, *local_name_ref); - let val = ValueOrRef::Ref(ann, var).into_cranelift( + + // Look up the value for the variable. Because this might be a + // global variable (and that requires special logic), we just turn + // this into an `Expression` and re-use the logic in that implementation. + let val = Expression::Reference(ann, var).into_crane( &mut builder, &variable_table, &pre_defined_symbols, )?; + + // Finally, we can generate the call to print. 
builder .ins() .call(print_func_ref, &[buffer_ptr, name_ptr, val]); } + // Variable binding is a little more complex. Statement::Binding(_, var_name, value) => { - let val = match value { - Expression::Value(_, Value::Number(_, v)) => { - builder.ins().iconst(types::I64, v) - } - - Expression::Reference(_, name) => { - let value_var_num = variable_table.get(&name).unwrap(); - builder.use_var(Variable::new(*value_var_num)) - } - - Expression::Primitive(_, prim, mut vals) => { - let right = vals.pop().unwrap().into_cranelift( - &mut builder, - &variable_table, - &pre_defined_symbols, - )?; - let left = vals.pop().unwrap().into_cranelift( - &mut builder, - &variable_table, - &pre_defined_symbols, - )?; - - match prim { - Primitive::Plus => builder.ins().iadd(left, right), - Primitive::Minus => builder.ins().isub(left, right), - Primitive::Times => builder.ins().imul(left, right), - Primitive::Divide => builder.ins().sdiv(left, right), - } - } - }; + // Kick off to the `Expression` implementation to see what value we're going + // to bind to this variable. + let val = + value.into_crane(&mut builder, &variable_table, &pre_defined_symbols)?; + // Now the question is: is this a local variable, or a global one? if let Some(global_id) = pre_defined_symbols.get(var_name.as_str()) { + // It's a global variable! In this case, we assume that someone has already + // dedicated some space in memory to store this value. We look this location + // up, and then tell Cranelift to store the value there. let val_ptr = builder.ins().symbol_value(types::I64, *global_id); builder.ins().store(MemFlags::new(), val, val_ptr, 0); } else { + // It's a local variable! In this case, we need to allocate a new Cranelift + // `Variable` for this variable, which we do using our `next_var_num` counter. + // (While we're doing this, we also increment `next_var_num`, so that we get + // a fresh `Variable` next time. 
This is one of those very narrow cases in which + // I wish Rust had an increment expression.) let var = Variable::new(next_var_num); - variable_table.insert(var_name, next_var_num); next_var_num += 1; + + // We can add the variable directly to our local variable map; it's `Copy`. + variable_table.insert(var_name, var); + + // Now we tell Cranelift about our new variable, which has type I64 because + // everything we have at this point is of type I64. Once it's declared, we + // define it as having the value we computed above. builder.declare_var(var, types::I64); builder.def_var(var, val); } @@ -120,15 +183,30 @@ impl Backend { } } + // Now that we're done, inject a return instruction (one with no actual value; basically + // the equivalent of Rust's `return;`). We then seal the block (which lets Cranelift + // know that the block is done), and then finalize the function (which lets Cranelift + // know we're done with the function). builder.ins().return_(&[]); builder.seal_block(main_block); builder.finalize(); + // This is a little odd. We want to tell the rest of Cranelift about this function, + // so we register it using the function ID and our builder context. However, the + // result of this function isn't actually super helpful. So we ignore it, unless + // it's an error. let _ = self.module.define_function(func_id, &mut ctx)?; + // done! Ok(func_id) } + // Build the string table for use in referencing strings later. + // + // This function is slightly smart, in that it only puts strings in the table that + // are used by the `Program`. (Thanks to `Program::strings()`!) If the strings have + // been declared globally, via `Backend::define_string()`, we will re-use that data. + // Otherwise, this will define the string for you. 
fn build_string_table( &mut self, func: &mut Function, @@ -149,30 +227,73 @@ impl Backend { } } -impl ValueOrRef { - fn into_cranelift( +impl Expression { + fn into_crane( self, builder: &mut FunctionBuilder, - local_variables: &HashMap, usize>, + local_variables: &HashMap, Variable>, global_variables: &HashMap, - ) -> Result { + ) -> Result { match self { - ValueOrRef::Value(_, value) => match value { - Value::Number(_base, numval) => Ok(builder.ins().iconst(types::I64, numval)), - }, + // Values are pretty straightforward to compile, mostly because we only + // have one type of variable, and it's an integer type. + Expression::Value(_, Value::Number(_, v)) => Ok(builder.ins().iconst(types::I64, v)), - ValueOrRef::Ref(_, name) => { - if let Some(local_num) = local_variables.get(&name) { - return Ok(builder.use_var(Variable::new(*local_num))); + Expression::Reference(_, name) => { + // first we see if this is a local variable (which is nicer, from an + // optimization point of view.) + if let Some(local_var) = local_variables.get(&name) { + return Ok(builder.use_var(*local_var)); } - if let Some(global_id) = global_variables.get(name.as_str()) { - let val_ptr = builder.ins().symbol_value(types::I64, *global_id); + // then we check to see if this is a global reference, which requires us to + // first lookup where the value is stored, and then load it. + if let Some(global_var) = global_variables.get(name.as_ref()) { + let val_ptr = builder.ins().symbol_value(types::I64, *global_var); return Ok(builder.ins().load(types::I64, MemFlags::new(), val_ptr, 0)); } - Err(ModuleError::Undeclared(name.to_string())) + // this should never happen, because we should have made sure that there are + // no unbound variables a long time before this. but still ... + Err(BackendError::VariableLookupFailure(name)) + } + + Expression::Primitive(_, prim, mut vals) => { + // we're going to use `pop`, so we're going to pull and compile the right value ... 
+ let right = + vals.pop() + .unwrap() + .into_crane(builder, local_variables, global_variables)?; + // ... and then the left. + let left = + vals.pop() + .unwrap() + .into_crane(builder, local_variables, global_variables)?; + + // then we just need to tell Cranelift how to do each of our primitives! Much + // like Statements, above, we probably want to eventually shuffle this off into + // a separate function (maybe something off `Primitive`), but for now it's simple + // enough that we just do the `match` here. + match prim { + Primitive::Plus => Ok(builder.ins().iadd(left, right)), + Primitive::Minus => Ok(builder.ins().isub(left, right)), + Primitive::Times => Ok(builder.ins().imul(left, right)), + Primitive::Divide => Ok(builder.ins().sdiv(left, right)), + } } } } } + +// Just to avoid duplication, this just leverages the `From` trait implementation +// for `ValueOrRef` to compile this via the `Expression` logic, above. impl ValueOrRef { fn into_crane( self, builder: &mut FunctionBuilder, local_variables: &HashMap, Variable>, global_variables: &HashMap, ) -> Result { Expression::from(self).into_crane(builder, local_variables, global_variables) } } diff --git a/src/backend/runtime.rs b/src/backend/runtime.rs index 1338a73..a03acf7 100644 --- a/src/backend/runtime.rs +++ b/src/backend/runtime.rs @@ -8,9 +8,14 @@ use std::fmt::Write; use target_lexicon::Triple; use thiserror::Error; +/// An object for querying / using functions built into the runtime. +/// +/// Right now, this is quite a bit of boilerplate for very nebulous +/// value. However, as the number of built-in functions gets large, it's +/// nice to have a single point to register and query them, so here we +/// go. 
pub struct RuntimeFunctions { builtin_functions: HashMap, - _referenced_functions: Vec, } #[derive(Debug, Error, PartialEq)] @@ -19,25 +24,27 @@ pub enum RuntimeFunctionError { CannotFindRuntimeFunction(String), } -extern "C" fn runtime_print(output_buffer: *mut String, name: *const i8, value: i64) { - let cstr = unsafe { CStr::from_ptr(name) }; - let reconstituted = cstr.to_string_lossy(); - - if let Some(output_buffer) = unsafe { output_buffer.as_mut() } { - writeln!(output_buffer, "{} = {}i64", reconstituted, value).unwrap(); - } else { - println!("{} = {}", reconstituted, value); - } -} - impl RuntimeFunctions { + /// Generate a new runtime function table for the given platform, and + /// declare them within the provided Cranelift module. + /// + /// Note that this is very conservative: it assumes that your module + /// will want to use every runtime function. Unless the Cranelift object + /// builder is smart, this might inject a bunch of references (and thus + /// linker requirements) that aren't actually needed by your program. + /// + /// Then again, right now there's exactly one runtime function, so ... + /// not a big deal. pub fn new(platform: &Triple, module: &mut M) -> ModuleResult { let mut builtin_functions = HashMap::new(); - let _referenced_functions = Vec::new(); let string_param = AbiParam::new(types::I64); let int64_param = AbiParam::new(types::I64); + // declare print for Cranelift; it's something we're going to import + // into the current module (it's compiled separately), and takes two + // strings and an integer. (Which ... turn out to all be the same + // underlying type, which is weird but the way it is.) let print_id = module.declare_function( "print", Linkage::Import, @@ -47,14 +54,19 @@ impl RuntimeFunctions { call_conv: CallConv::triple_default(platform), }, )?; + + // Toss this function in our internal dictionary, as well. 
builtin_functions.insert("print".to_string(), print_id); - Ok(RuntimeFunctions { - builtin_functions, - _referenced_functions, - }) + Ok(RuntimeFunctions { builtin_functions }) } + /// Include the named runtime function into the current Function context. + /// + /// This is necessary for every runtime function reference within each + /// function. The returned `FuncRef` can be used in `call` invocations. + /// The only reason for this function to error is if you pass a name that + /// the runtime isn't familiar with. pub fn include_runtime_function( &self, name: &str, @@ -69,7 +81,30 @@ impl RuntimeFunctions { } } + /// Register live, local versions of the runtime functions into the JIT. + /// + /// Note that these implementations are *not* the same as the ones defined + /// in `CARGO_MANIFEST_DIR/runtime/`, for ... reasons. It might be a good + /// change, in the future, to find a way to unify these implementations into + /// one; both to reduce the chance that they deviate, and to reduce overall + /// maintenance burden. pub fn register_jit_implementations(builder: &mut JITBuilder) { builder.symbol("print", runtime_print as *const u8); } } + +// Print! This implementation is used in the JIT compiler, to actually print data. We +// use the `output_buffer` argument as an aid for testing; if it's non-NULL, it's a string +// we extend with the output, so that multiple JIT'd `Program`s can run concurrently +// without stomping over each other's output. If `output_buffer` is NULL, we just print +// to stdout. 
+extern "C" fn runtime_print(output_buffer: *mut String, name: *const i8, value: i64) { + let cstr = unsafe { CStr::from_ptr(name) }; + let reconstituted = cstr.to_string_lossy(); + + if let Some(output_buffer) = unsafe { output_buffer.as_mut() } { + writeln!(output_buffer, "{} = {}i64", reconstituted, value).unwrap(); + } else { + println!("{} = {}", reconstituted, value); + } +} diff --git a/src/bin/ngrc.rs b/src/bin/ngrc.rs index 23a9021..821b0e2 100644 --- a/src/bin/ngrc.rs +++ b/src/bin/ngrc.rs @@ -1,17 +1,7 @@ use clap::Parser; -use codespan_reporting::diagnostic::Diagnostic; -use codespan_reporting::files::SimpleFiles; -use codespan_reporting::term; -use codespan_reporting::term::termcolor::{ColorChoice, StandardStream}; -use cranelift_object::object; - -use ngr::backend::Backend; -use ngr::backend::BackendError; -use ngr::ir::Program as IR; -use ngr::syntax::{ParserError, Program as Syntax}; -use target_lexicon::Triple; -use thiserror::Error; +/// Clap is great! Even though we don't have many command line arguments +/// yet, this is just really neat. 
#[derive(Parser, Debug)] #[clap(author, version, about, long_about = None)] struct CommandLineArguments { @@ -23,76 +13,14 @@ struct CommandLineArguments { file: String, } -#[derive(Debug, Error)] -enum MainError { - #[error(transparent)] - Backend(#[from] BackendError), - #[error("Parser error")] - ParserError(#[from] ParserError), - #[error("IO error")] - IoError(#[from] std::io::Error), - #[error("write error")] - WriteError(#[from] object::write::Error), -} - -impl From for Diagnostic { - fn from(value: MainError) -> Self { - match value { - MainError::Backend(be) => be.into(), - MainError::ParserError(pe) => (&pe).into(), - MainError::IoError(e) => Diagnostic::error().with_message(format!("IO error: {}", e)), - MainError::WriteError(e) => { - Diagnostic::error().with_message(format!("Module write error: {}", e)) - } - } - } -} - -fn compile(file_database: &mut SimpleFiles) -> Result<(), MainError> { - let args = CommandLineArguments::parse(); - - let syntax = Syntax::parse_file(file_database, &args.file)?; - let (mut errors, mut warnings) = syntax.validate(); - let stop = !errors.is_empty(); - let messages = errors - .drain(..) 
- .map(Into::into) - .chain(warnings.drain(..).map(Into::into)); - let writer = StandardStream::stderr(ColorChoice::Auto); - let config = codespan_reporting::term::Config::default(); - - for message in messages { - term::emit(&mut writer.lock(), &config, file_database, &message).unwrap(); - } - - if stop { - return Ok(()); - } - - let ir = IR::from(syntax.simplify()); - let mut backend = Backend::object_file(Triple::host())?; - backend.compile_function("gogogo", ir)?; - let bytes = backend.bytes()?; - std::fs::write(args.output.unwrap_or_else(|| "output.o".to_string()), bytes)?; - Ok(()) -} - fn main() { - let mut file_database = SimpleFiles::new(); + let args = CommandLineArguments::parse(); + let mut compiler = ngr::Compiler::default(); - match compile(&mut file_database) { - Ok(()) => {} - Err(e) => { - let writer = StandardStream::stderr(ColorChoice::Auto); - let config = codespan_reporting::term::Config::default(); + let output_file = args.output.unwrap_or("output.o".to_string()); - term::emit( - &mut writer.lock(), - &config, - &file_database, - &Diagnostic::from(e), - ) - .unwrap(); - } + if let Some(bytes) = compiler.compile(&args.file) { + std::fs::write(&output_file, bytes) + .unwrap_or_else(|x| eprintln!("Could not write to file {}: {}", output_file, x)); } } diff --git a/src/bin/ngri.rs b/src/bin/ngri.rs index b1d74de..0558de4 100644 --- a/src/bin/ngri.rs +++ b/src/bin/ngri.rs @@ -1,130 +1,11 @@ -use codespan_reporting::diagnostic::Diagnostic; -use codespan_reporting::files::SimpleFiles; -use codespan_reporting::term::{self, Config}; -use cranelift_jit::JITModule; -use cranelift_module::ModuleError; -use ngr::backend::{Backend, BackendError}; -use ngr::ir::Program as IR; -use ngr::syntax::{Location, ParserError, Statement}; -use pretty::termcolor::{ColorChoice, StandardStream, WriteColor}; +use ngr::backend::BackendError; use rustyline::error::ReadlineError; use rustyline::DefaultEditor; -use std::collections::HashMap; - -pub struct RunLoop<'a> { - 
file_database: SimpleFiles<&'a str, String>, - jitter: Backend, - variable_binding_sites: HashMap, - gensym_index: usize, - writer: &'a mut dyn WriteColor, - config: Config, -} - -#[allow(clippy::upper_case_acronyms)] -#[derive(Debug, thiserror::Error)] -enum REPLError { - #[error("Error parsing statement: {0}")] - Parser(#[from] ParserError), - #[error("JIT error: {0}")] - JIT(#[from] BackendError), - #[error("Internal cranelift error: {0}")] - Cranelift(#[from] ModuleError), - #[error(transparent)] - Reporting(#[from] codespan_reporting::files::Error), -} - -impl From for Diagnostic { - fn from(value: REPLError) -> Self { - match value { - REPLError::Parser(err) => Diagnostic::from(&err), - REPLError::JIT(err) => Diagnostic::from(err), - REPLError::Cranelift(err) => Diagnostic::bug().with_message(format!("{}", err)), - REPLError::Reporting(err) => Diagnostic::bug().with_message(format!("{}", err)), - } - } -} - -impl<'a> RunLoop<'a> { - pub fn new(writer: &'a mut dyn WriteColor, config: Config) -> Result { - Ok(RunLoop { - file_database: SimpleFiles::new(), - jitter: Backend::jit(None)?, - variable_binding_sites: HashMap::new(), - gensym_index: 1, - writer, - config, - }) - } - - fn emit_diagnostic( - &mut self, - diagnostic: Diagnostic, - ) -> Result<(), codespan_reporting::files::Error> { - term::emit(self.writer, &self.config, &self.file_database, &diagnostic) - } - - fn process_input(&mut self, line_no: usize, command: String) { - if let Err(err) = self.process(line_no, command) { - if let Err(e) = self.emit_diagnostic(Diagnostic::from(err)) { - eprintln!( - "WOAH! System having trouble printing error messages. This is very bad. 
({})", - e - ); - } - } - } - - fn process(&mut self, line_no: usize, command: String) -> Result<(), REPLError> { - let entry = self.file_database.add("entry", command); - let source = self - .file_database - .get(entry) - .expect("entry exists") - .source(); - let syntax = Statement::parse(entry, source)?; - - // if this is a variable binding, and we've never defined this variable before, - // we should tell cranelift about it. this is optimistic; if we fail to compile, - // then we won't use this definition until someone tries again. - if let Statement::Binding(_, ref name, _) = syntax { - if !self.variable_binding_sites.contains_key(name.as_str()) { - self.jitter.define_string(name)?; - self.jitter.define_variable(name.clone())?; - } - }; - - let (mut errors, mut warnings) = syntax.validate(&mut self.variable_binding_sites); - let stop = !errors.is_empty(); - let messages = errors - .drain(..) - .map(Into::into) - .chain(warnings.drain(..).map(Into::into)); - - for message in messages { - self.emit_diagnostic(message)?; - } - - if stop { - return Ok(()); - } - - let ir = IR::from(syntax.simplify(&mut self.gensym_index)); - let name = format!("line{}", line_no); - let function_id = self.jitter.compile_function(&name, ir)?; - self.jitter.module.finalize_definitions()?; - let compiled_bytes = self.jitter.bytes(function_id); - let compiled_function = unsafe { std::mem::transmute::<_, fn() -> ()>(compiled_bytes) }; - compiled_function(); - Ok(()) - } -} fn main() -> Result<(), BackendError> { let mut editor = DefaultEditor::new().expect("rustyline works"); let mut line_no = 0; - let mut writer = StandardStream::stdout(ColorChoice::Auto); - let config = codespan_reporting::term::Config::default(); - let mut state = RunLoop::new(&mut writer, config)?; + let mut state = ngr::REPL::default(); println!("No Good Reason, the Interpreter!"); loop { @@ -135,18 +16,30 @@ fn main() -> Result<(), BackendError> { ":quit" => break, _ => state.process_input(line_no, command), }, + 
+ // it's not clear to me what this could be, but OK Err(ReadlineError::Io(e)) => { eprintln!("IO error: {}", e); break; } + + // Control-D and Control-C Err(ReadlineError::Eof) => break, Err(ReadlineError::Interrupted) => break, + + // For some reason this doesn't exist on Windows. I also don't quite know + // what would cause this, but ... #[cfg(not(windows))] Err(ReadlineError::Errno(e)) => { eprintln!("Unknown syscall error: {}", e); break; } + + // We don't actually do any reflow-ing if we change the terminal size, + // so we can just ignore this. Err(ReadlineError::WindowResized) => continue, + + // Why on earth are there so many error types? Err(e) => { eprintln!("Unknown internal error: {}", e); break; diff --git a/src/compiler.rs b/src/compiler.rs new file mode 100644 index 0000000..41cc037 --- /dev/null +++ b/src/compiler.rs @@ -0,0 +1,157 @@ +use crate::backend::Backend; +use crate::ir::Program as IR; +use crate::syntax::Program as Syntax; +use codespan_reporting::{ + diagnostic::Diagnostic, + files::SimpleFiles, + term::{self, Config}, +}; +use pretty::termcolor::{ColorChoice, StandardStream}; +use target_lexicon::Triple; + +/// A high-level compiler for NGR programs. +/// +/// This object can be built once, and then re-used many times to build multiple +/// files. For most users, the [`Default`] implementation should be sufficient; +/// it will use `stderr` for warnings and errors, with default colors based on +/// what we discover from the terminal. For those who want to provide alternate +/// outputs, though, the `Compiler::new` constructor is available. +pub struct Compiler { + file_database: SimpleFiles, + console: StandardStream, + console_config: Config, +} + +impl Default for Compiler { + fn default() -> Self { + let console = StandardStream::stderr(ColorChoice::Auto); + Compiler::new(console, Config::default()) + } +} + +impl Compiler { + /// Create a new compiler object. 
+ /// + /// This object can be re-used to compile as many files as you like. + /// Use this function if you want to configure your output console and/or + /// its configuration in some custom way. Alternatively, you can use the + /// `Default` implementation, which will emit information to `stderr` with + /// a reasonable default configuration. + pub fn new(console: StandardStream, console_config: Config) -> Self { + Compiler { + file_database: SimpleFiles::new(), + console, + console_config, + } + } + + /// Compile the given file, returning the object file as a vector of bytes. + /// + /// This function may create output, via the console configured with this + /// `Compiler` object. If the compilation fails for any reason, will return + /// `None`. + pub fn compile>(&mut self, input_file: P) -> Option> { + match self.compile_internal(input_file.as_ref()) { + Ok(x) => x, + Err(e) => { + self.emit(e.into()); + None + } + } + } + + /// This is the actual meat of the compilation chain; we hide it from the user + /// because the type is kind of unpleasant. + /// + /// The weird error type comes from the fact that we can run into three types + /// of result: + /// + /// * Fundamental errors, like an incorrectly formatted file or some + /// oddity with IO. These return `Err`. + /// * Validation errors, where we reject the program due to something + /// semantically wrong with them. These return `Ok(None)`. + /// * Success! In this case, we return `Ok(Some(...))`, where the bytes + /// returned is the contents of the compiled object file. + /// + fn compile_internal(&mut self, input_file: &str) -> Result>, CompilerError> { + // Try to parse the file into our syntax AST. If we fail, emit the error + // and then immediately return `None`. + let syntax = Syntax::parse_file(&mut self.file_database, input_file)?; + + // Now validate the user's syntax AST. This can possibly find errors and/or + // create warnings. 
We can continue if we only get warnings, but need to stop + // if we get any errors. + let (mut errors, mut warnings) = syntax.validate(); + let stop = !errors.is_empty(); + let messages = errors + .drain(..) + .map(Into::into) + .chain(warnings.drain(..).map(Into::into)); + + // emit all the messages we receive; warnings *and* errors + for message in messages { + self.emit(message); + } + + // we got errors, so just stop right now. perhaps oddly, this is Ok(None); + // we've already said all we're going to say in the messages above, so there's + // no need to provide another `Err` result. + if stop { + return Ok(None); + } + + // Now that we've validated it, turn it into IR. + let ir = IR::from(syntax); + + // Finally, send all this to Cranelift for conversion into an object file. + let mut backend = Backend::object_file(Triple::host())?; + backend.compile_function("gogogo", ir)?; + Ok(Some(backend.bytes()?)) + } + + /// Emit a diagnostic. + /// + /// This is just a really handy shorthand we use elsewhere in the object, because + /// there's a lot of boilerplate we'd like to skip. + fn emit(&mut self, diagnostic: Diagnostic) { + term::emit( + &mut self.console.lock(), + &self.console_config, + &self.file_database, + &diagnostic, + ) + .expect("codespan reporting term::emit works"); + } +} + +// This is just a handy type that we can convert things into; it's not +// exposed outside this module, and doesn't actually do much of interest. 
+#[derive(Debug, thiserror::Error)] +enum CompilerError { + #[error(transparent)] + Backend(#[from] crate::backend::BackendError), + #[error(transparent)] + ParserError(#[from] crate::syntax::ParserError), + #[error(transparent)] + IoError(#[from] std::io::Error), + #[error(transparent)] + WriteError(#[from] cranelift_object::object::write::Error), +} + +// Since we're going to use codespan to report pretty much all errors, +// this just passes through most of the errors, or makes simple versions +// of `Diagnostic` for those that we don't have existing `From`s. +impl From for Diagnostic { + fn from(value: CompilerError) -> Self { + match value { + CompilerError::Backend(be) => be.into(), + CompilerError::ParserError(pe) => (&pe).into(), + CompilerError::IoError(e) => { + Diagnostic::error().with_message(format!("IO error: {}", e)) + } + CompilerError::WriteError(e) => { + Diagnostic::error().with_message(format!("Module write error: {}", e)) + } + } + } +} diff --git a/src/eval.rs b/src/eval.rs index b764eb5..cf77f74 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -1,3 +1,38 @@ +//! Helpful functions for evaluating NGR programs. +//! +//! Look, this is a compiler, and so you might be asking why it has a bunch of +//! stuff in it to help with writing interpreters. Well, the answer is simple: +//! testing. It's really nice to know that if you start with a program that +//! does a thing, and then you muck with it, you end up with a program that does +//! the exact same thing. If you talk to people who think about language +//! semantics, they'll call this "observational equivalence": maybe the two +//! programs don't do 100% the same things in the same order, but you shouldn't +//! be able to observe the difference ... at least, not without a stopwatch, +//! memory profilers, etc. +//! +//! The actual evaluators for our various syntaxes are hidden in `eval` functions +//! of the various ASTs. It's nice to have them "next to" the syntax that way, so +//! 
that we just edit stuff in one part of the source tree at a time. This module, +//! then, just contains some things that are generally helpful across all the +//! interpreters we've written. +//! +//! In particular, this module helps with: +//! +//! * Defining a common error type -- [`EvalError`] -- that we can reasonably +//! compare. It's nice to compare errors, here, because we want to know that +//! if a program used to fail, it will still fail after we change it, and +//! fail in the exact same way. +//! * Defining a notion of a binding environment: [`EvalEnvironment`]. This +//! will help us keep track of variables bound in our program, as we run it. +//! * Defining a notion of a runtime value: [`Value`]. Yes, this is the +//! umpteenth time that we're re-defining basically the same enumeration +//! with exactly the same name, but it's nice to have it separated so that +//! we don't confuse them. +//! * Finally, this module implements all of our primitive functions, as the +//! [`Value::calculate`] function. This is just a nice abstraction boundary, +//! because the implementation of some parts of these primitives is really +//! awful to look at. +//! mod env; mod primop; mod value; @@ -9,6 +44,13 @@ pub use value::Value; use crate::backend::BackendError; +/// All of the errors that can happen trying to evaluate an NGR program. +/// +/// This is yet another standard [`thiserror::Error`] type, but with the +/// caveat that it implements [`PartialEq`] even though some of its +/// constituent members don't. It does so through the very sketchy mechanism +/// of converting those errors to strings and then seeing if they're the +/// same. 
#[derive(Debug, thiserror::Error)] pub enum EvalError { #[error(transparent)] @@ -18,15 +60,15 @@ pub enum EvalError { #[error(transparent)] Backend(#[from] BackendError), #[error("IO error: {0}")] - IO(String), + IO(#[from] std::io::Error), #[error(transparent)] Module(#[from] ModuleError), -} - -impl From for EvalError { - fn from(value: std::io::Error) -> Self { - EvalError::IO(value.to_string()) - } + #[error("Linker error: {0}")] + Linker(String), + #[error("Program exitted with status {0}")] + ExitCode(std::process::ExitStatus), + #[error("Unexpected output at runtime: {0}")] + RuntimeOutput(String), } impl PartialEq for EvalError { @@ -48,7 +90,7 @@ impl PartialEq for EvalError { }, EvalError::IO(a) => match other { - EvalError::IO(b) => a == b, + EvalError::IO(b) => a.to_string() == b.to_string(), _ => false, }, @@ -56,6 +98,21 @@ impl PartialEq for EvalError { EvalError::Module(b) => a.to_string() == b.to_string(), _ => false, }, + + EvalError::Linker(a) => match other { + EvalError::Linker(b) => a == b, + _ => false, + }, + + EvalError::ExitCode(a) => match other { + EvalError::ExitCode(b) => a == b, + _ => false, + }, + + EvalError::RuntimeOutput(a) => match other { + EvalError::RuntimeOutput(b) => a == b, + _ => false, + }, } } } diff --git a/src/eval/env.rs b/src/eval/env.rs index ff24834..a1a0320 100644 --- a/src/eval/env.rs +++ b/src/eval/env.rs @@ -2,15 +2,28 @@ use crate::eval::Value; use internment::ArcIntern; use std::sync::Arc; +/// An evaluation environment, which maps variable names to their +/// current values. +/// +/// One key difference between `EvalEnvironment` and `HashMap` is that +/// `EvalEnvironment` uses an `extend` mechanism to add keys, rather +/// than an `insert`. This difference allows you to add mappings for +/// a subcomputation while still retaining the old version without those +/// keys, which is really handy for implementing variable scoping. 
pub struct EvalEnvironment { inner: Arc, } -pub enum EvalEnvInternal { +enum EvalEnvInternal { Empty, Value(ArcIntern, Value, Arc), } +/// Errors that can happen when looking up a variable. +/// +/// This enumeration may be extended in the future, depending on if we +/// get more subtle with our keys. But for now, this is just a handy +/// way to make lookup failures be `thiserror::Error`s. #[derive(Clone, Debug, PartialEq, thiserror::Error)] pub enum LookupError { #[error("Could not find variable '{0}' in environment")] @@ -24,28 +37,38 @@ impl Default for EvalEnvironment { } impl EvalEnvironment { + /// Create a new, empty environment. pub fn empty() -> Self { EvalEnvironment { inner: Arc::new(EvalEnvInternal::Empty), } } + /// Extend the environment with a new mapping. + /// + /// Note the types: the result of this method is a new `EvalEnvironment`, + /// with its own lifetime, and the original environment is left unmodified. pub fn extend(&self, name: ArcIntern, value: Value) -> Self { EvalEnvironment { inner: Arc::new(EvalEnvInternal::Value(name, value, self.inner.clone())), } } + /// Look up a variable in the environment, returning an error if it isn't there. pub fn lookup(&self, n: ArcIntern) -> Result { self.inner.lookup(n) } } impl EvalEnvInternal { + /// Look up a variable in the environment, returning an error if it isn't there. fn lookup(&self, n: ArcIntern) -> Result { match self { + // if this is an empty dictionary, never mind, couldn't find it EvalEnvInternal::Empty => Err(LookupError::CouldNotFind(n)), + // is this the key we have right here? if yes, return our value EvalEnvInternal::Value(name, value, _) if *name == n => Ok(value.clone()), + // otherwise, recurse up our chain of environments EvalEnvInternal::Value(_, _, rest) => rest.lookup(n), } } @@ -70,6 +93,9 @@ mod tests { assert!(tester.lookup(arced("baz")).is_err()); } + // added this test to make sure that our nesting property works properly. 
+ // it's not a big deal now, but it'll be really handy later when we add any + // kind of variable scoping. #[test] fn nested() { let tester = EvalEnvironment::default(); diff --git a/src/eval/primop.rs b/src/eval/primop.rs index aef9681..49c014c 100644 --- a/src/eval/primop.rs +++ b/src/eval/primop.rs @@ -1,19 +1,39 @@ use crate::eval::value::Value; +/// Errors that can occur running primitive operations in the evaluators. #[derive(Clone, Debug, PartialEq, thiserror::Error)] pub enum PrimOpError { #[error("Math error (underflow or overflow) computing {0} operator")] MathFailure(&'static str), + /// This particular variant covers the case in which a primitive + /// operator takes two arguments that are supposed to be the same, + /// but they differ. (So, like, all the math operators.) #[error("Type mismatch ({1} vs {2}) computing {0} operator")] TypeMismatch(String, Value, Value), + /// This variant covers when an operator must take a particular + /// type, but the user has provided a different one. #[error("Bad type for operator {0}: {1}")] BadTypeFor(&'static str, Value), + /// Probably obvious from the name, but just to be very clear: this + /// happens when you pass three arguments to a two argument operator, + /// etc. Technically that's a type error of some sort, but we split + /// it out. #[error("Illegal number of arguments for {0}: {1} arguments found")] BadArgCount(String, usize), #[error("Unknown primitive operation {0}")] UnknownPrimOp(String), } +// Implementing primitives in an interpreter like this is *super* tedious, +// and the only way to make it even somewhat manageable is to use macros. +// This particular macro works for binary operations, and assumes that +// you've already worked out that the `calculate` call provided two arguments. +// +// In those cases, it will run the operations we know about, and error if +// it doesn't. +// +// This macro then needs to be instantiated for every type, which is super +// fun. macro_rules! 
run_op { ($op: ident, $left: expr, $right: expr) => { match $op { @@ -23,15 +43,15 @@ macro_rules! run_op { .map(Into::into), "-" => $left .checked_sub($right) - .ok_or(PrimOpError::MathFailure("+")) + .ok_or(PrimOpError::MathFailure("-")) .map(Into::into), "*" => $left .checked_mul($right) - .ok_or(PrimOpError::MathFailure("+")) + .ok_or(PrimOpError::MathFailure("*")) .map(Into::into), "/" => $left .checked_div($right) - .ok_or(PrimOpError::MathFailure("+")) + .ok_or(PrimOpError::MathFailure("/")) .map(Into::into), _ => Err(PrimOpError::UnknownPrimOp($op.to_string())), } @@ -41,6 +61,8 @@ macro_rules! run_op { impl Value { fn binary_op(operation: &str, left: &Value, right: &Value) -> Result { match left { + // for now we only have one type, but in the future this is + // going to be very irritating. Value::I64(x) => match right { Value::I64(y) => run_op!(operation, x, *y), // _ => Err(PrimOpError::TypeMismatch( @@ -52,6 +74,14 @@ impl Value { } } + /// Calculate the result of running the given primitive on the given arguments. + /// + /// This can cause errors in a whole mess of ways, so be careful about your + /// inputs. For example, addition only works when the two values have the exact + /// same type, so expect an error if you try to do so. In addition, this + /// implementation catches and raises an error on overflow or underflow, so + /// its worth being careful to make sure that your inputs won't cause either + /// condition. pub fn calculate(operation: &str, values: Vec) -> Result { if values.len() == 2 { Value::binary_op(operation, &values[0], &values[1]) diff --git a/src/eval/value.rs b/src/eval/value.rs index a158dc9..ba0b0bd 100644 --- a/src/eval/value.rs +++ b/src/eval/value.rs @@ -1,5 +1,10 @@ use std::fmt::Display; +/// Values in the interpreter. +/// +/// Yes, this is yet another definition of a structure called `Value`, which +/// are almost entirely identical. However, it's nice to have them separated +/// by type so that we don't mix them up. 
#[derive(Clone, Debug, PartialEq)] pub enum Value { I64(i64), } diff --git a/src/ir.rs b/src/ir.rs index b7cd9cf..88454e4 100644 --- a/src/ir.rs +++ b/src/ir.rs @@ -1,3 +1,17 @@ +//! The middle of the compiler: analysis, simplification, optimization. +//! +//! For the moment, this module doesn't do much besides define an intermediate +//! representation for NGR programs that is a little easier to work with than +//! the structures we've built from the actual user syntax. For example, in the +//! IR syntax, function calls are simplified so that all their arguments are +//! either variables or constants, which can make reasoning about programs +//! (and implicit temporary variables) quite a bit easier. +//! +//! For the foreseeable future, this module will likely remain mostly empty +//! besides definitions, as we'll likely want to focus on just processing / +//! validating syntax, and then figuring out how to turn it into Cranelift +//! and object code. After that point, however, this will be the module to +//! come to for analysis and optimization work. mod ast; mod eval; mod from_syntax; diff --git a/src/ir/ast.rs b/src/ir/ast.rs index ad96e95..3d8446d 100644 --- a/src/ir/ast.rs +++ b/src/ir/ast.rs @@ -1,3 +1,4 @@ +use crate::syntax::Location; use internment::ArcIntern; use pretty::{DocAllocator, Pretty}; use proptest::{ @@ -5,13 +6,28 @@ use proptest::{ strategy::{BoxedStrategy, Strategy}, }; -use crate::syntax::Location; - +/// We're going to represent variables as interned strings. +/// +/// These should be fast enough for comparison that it's OK, since it's going to end up +/// being pretty much the pointer to the string. type Variable = ArcIntern; +/// The representation of a program within our IR. For now, this is exactly one file. +/// +/// In addition, for the moment there's not really much of interest to hold here besides +/// the list of statements read from the file. Order is important. 
In the future, you +/// could imagine caching analysis information in this structure. +/// +/// `Program` implements both [`Pretty`] and [`Arbitrary`]. The former should be used +/// to print the structure whenever possible, especially if you value your or your +/// user's time. The latter is useful for testing that conversions of `Program` retain +/// their meaning. All `Program`s generated through [`Arbitrary`] are guaranteed to be +/// syntactically valid, although they may contain runtime issue like over- or underflow. #[derive(Debug)] pub struct Program { - pub statements: Vec, + // For now, a program is just a vector of statements. In the future, we'll probably + // extend this to include a bunch of other information, but for now: just a list. + pub(crate) statements: Vec, } impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Program @@ -23,6 +39,8 @@ where let mut result = allocator.nil(); for stmt in self.statements.iter() { + // there's probably a better way to do this, rather than constantly + // adding to the end, but this works. result = result .append(stmt.pretty(allocator)) .append(allocator.text(";")) @@ -39,11 +57,21 @@ impl Arbitrary for Program { fn arbitrary_with(args: Self::Parameters) -> Self::Strategy { crate::syntax::Program::arbitrary_with(args) - .prop_map(|x| Program::from(x.simplify())) + .prop_map(Program::from) .boxed() } } +/// The representation of a statement in the language. +/// +/// For now, this is either a binding site (`x = 4`) or a print statement +/// (`print x`). Someday, though, more! +/// +/// As with `Program`, this type implements [`Pretty`], which should +/// be used to display the structure whenever possible. It does not +/// implement [`Arbitrary`], though, mostly because it's slightly +/// complicated to do so. +/// #[derive(Debug)] pub enum Statement { Binding(Location, Variable, Expression), @@ -71,6 +99,18 @@ where } } +/// The representation of an expression. 
+/// +/// Note that expressions, like everything else in this syntax tree, +/// supports [`Pretty`], and it's strongly encouraged that you use +/// that trait/module when printing these structures. +/// +/// Also, Expressions at this point in the compiler are explicitly +/// defined so that they are *not* recursive. By this point, if an +/// expression requires some other data (like, for example, invoking +/// a primitive), any subexpressions have been bound to variables so +/// that the referenced data will always either be a constant or a +/// variable reference. #[derive(Debug)] pub enum Expression { Value(Location, Value), @@ -107,6 +147,12 @@ where } } +/// A type representing the primitives allowed in the language. +/// +/// Having this as an enumeration avoids a lot of "this should not happen" +/// cases, but might prove to be cumbersome in the future. If that happens, +/// this may either become a more hierarchical enumeration, or we'll just +/// deal with the "this should not happen" cases. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum Primitive { Plus, @@ -144,6 +190,11 @@ where } } +/// An expression that is always either a value or a reference. +/// +/// This is the type used to guarantee that we don't nest expressions +/// at this level. Instead, expressions that take arguments take one +/// of these, which can only be a constant or a reference. #[derive(Debug)] pub enum ValueOrRef { Value(Location, Value), @@ -163,8 +214,23 @@ where } } +impl From for Expression { + fn from(value: ValueOrRef) -> Self { + match value { + ValueOrRef::Value(loc, val) => Expression::Value(loc, val), + ValueOrRef::Ref(loc, var) => Expression::Reference(loc, var), + } + } +} + +/// A constant in the IR. #[derive(Debug)] pub enum Value { + /// A numerical constant. + /// + /// The optional argument is the base that was used by the user to input + /// the number. 
By retaining it, we can ensure that if we need to print the + /// number back out, we can do so in the form that the user entered it. Number(Option, i64), } diff --git a/src/ir/eval.rs b/src/ir/eval.rs index d7c4135..78b8b0b 100644 --- a/src/ir/eval.rs +++ b/src/ir/eval.rs @@ -4,6 +4,10 @@ use crate::ir::{Expression, Program, Statement}; use super::{Primitive, ValueOrRef}; impl Program { + /// Evaluate the program, returning either an error or a string containing everything + /// the program printed out. + /// + /// The print outs will be newline separated, with one print out per line. pub fn eval(&self) -> Result { let mut env = EvalEnvironment::empty(); let mut stdout = String::new(); @@ -39,6 +43,9 @@ impl Expression { Expression::Primitive(_, op, args) => { let mut arg_values = Vec::with_capacity(args.len()); + // we implement primitive operations by first evaluating each of the + // arguments to the function, and then gathering up all the values + // produced. for arg in args.iter() { match arg { ValueOrRef::Ref(_, n) => arg_values.push(env.lookup(n.clone())?), @@ -48,6 +55,8 @@ impl Expression { } } + // and then finally we call `calculate` to run them. trust me, it's nice + // to not have to deal with all the nonsense hidden under `calculate`. 
match op { Primitive::Plus => Ok(Value::calculate("+", arg_values)?), Primitive::Minus => Ok(Value::calculate("-", arg_values)?), @@ -62,7 +71,7 @@ impl Expression { #[test] fn two_plus_three() { let input = crate::syntax::Program::parse(0, "x = 2 + 3; print x;").expect("parse works"); - let ir = Program::from(input.simplify()); + let ir = Program::from(input); let output = ir.eval().expect("runs successfully"); assert_eq!("x = 5i64\n", &output); } @@ -71,7 +80,7 @@ fn two_plus_three() { fn lotsa_math() { let input = crate::syntax::Program::parse(0, "x = 2 + 3 * 10 / 5 - 1; print x;").expect("parse works"); - let ir = Program::from(input.simplify()); + let ir = Program::from(input); let output = ir.eval().expect("runs successfully"); assert_eq!("x = 7i64\n", &output); } diff --git a/src/ir/from_syntax.rs b/src/ir/from_syntax.rs index e5eea0e..46c7c69 100644 --- a/src/ir/from_syntax.rs +++ b/src/ir/from_syntax.rs @@ -1,82 +1,185 @@ use internment::ArcIntern; +use std::sync::atomic::AtomicUsize; use crate::ir::ast as ir; -use crate::syntax::ast as syntax; +use crate::syntax; + +use super::ValueOrRef; impl From for ir::Program { + /// We implement the top-level conversion of a syntax::Program into an + /// ir::Program using just the standard `From::from`, because we don't + /// need to return any arguments and we shouldn't produce any errors. + /// Technically there's an `unwrap` deep under the hood that we could + /// float out, but the validator really should've made sure that never + /// happens, so we're just going to assume. fn from(mut value: syntax::Program) -> Self { - ir::Program { - statements: value.statements.drain(..).map(Into::into).collect(), + let mut statements = Vec::new(); + + for stmt in value.statements.drain(..) 
{ + statements.append(&mut stmt.simplify()); } + + ir::Program { statements } } } -impl From> for ir::Program { - fn from(mut value: Vec) -> Self { - ir::Program { - statements: value.drain(..).map(Into::into).collect(), - } - } -} - -impl From for ir::Statement { +impl From for ir::Program { + /// One interesting thing about this conversion is that there isn't + /// a natural translation from syntax::Statement to ir::Statement, + /// because the syntax version can have nested expressions and the + /// IR version can't. + /// + /// As a result, we can naturally convert a syntax::Statement into + /// an ir::Program, because we can allow the additional binding + /// sites to be generated, instead. And, bonus, it turns out that + /// this is what we wanted anyways. fn from(value: syntax::Statement) -> Self { - match value { - syntax::Statement::Binding(loc, name, expr) => { - ir::Statement::Binding(loc, ArcIntern::from(name), ir::Expression::from(expr)) - } - syntax::Statement::Print(loc, name) => ir::Statement::Print(loc, ArcIntern::from(name)), + ir::Program { + statements: value.simplify(), } } } -impl From for ir::Expression { - fn from(value: syntax::Expression) -> Self { - match value { - syntax::Expression::Primitive(loc, name, mut exprs) => ir::Expression::Primitive( - loc, - ir::Primitive::try_from(name.as_str()).unwrap(), - exprs.drain(..).map(Into::into).collect(), - ), +impl syntax::Statement { + /// Simplify a syntax::Statement into a series of ir::Statements. + /// + /// The reason this function is one-to-many is because we may have to + /// introduce new binding sites in order to avoid having nested + /// expressions. Nested expressions, like `(1 + 2) * 3`, are allowed + /// in syntax::Expression but are expressly *not* allowed in + /// ir::Expression. 
So this pass converts them into bindings, like + /// this: + /// + /// x = (1 + 2) * 3; + /// + /// ==> + /// + /// x:1 = 1 + 2; + /// x:2 = x:1 * 3; + /// x = x:2 + /// + /// Thus ensuring that things are nice and simple. Note that the + /// binding of `x:2` is not, strictly speaking, necessary, but it + /// makes the code below much easier to read. + fn simplify(self) -> Vec { + let mut new_statements = vec![]; + + match self { + // Print statements we don't have to do much with + syntax::Statement::Print(loc, name) => { + new_statements.push(ir::Statement::Print(loc, ArcIntern::new(name))) + } + + // Bindings, however, may involve a single expression turning into + // a series of statements and then an expression. + syntax::Statement::Binding(loc, name, value) => { + let (mut prereqs, new_value) = value.rebind(&name); + new_statements.append(&mut prereqs); + new_statements.push(ir::Statement::Binding( + loc, + ArcIntern::new(name), + new_value.into(), + )) + } + } + + new_statements + } +} + +impl syntax::Expression { + /// This actually does the meat of the simplification work, here, by rebinding + /// any nested expressions into their own variables. We have this return + /// `ValueOrRef` in all cases because it makes for slightly less code; in the + /// case when we actually want an `Expression`, we can just use `into()`.
+ fn rebind(self, base_name: &str) -> (Vec, ir::ValueOrRef) { + match self { + // Values just convert in the obvious way, and require no prereqs + syntax::Expression::Value(loc, val) => (vec![], ValueOrRef::Value(loc, val.into())), + + // Similarly, references just convert in the obvious way, and require + // no prereqs syntax::Expression::Reference(loc, name) => { - ir::Expression::Reference(loc, ArcIntern::from(name)) - } - syntax::Expression::Value(loc, value) => { - ir::Expression::Value(loc, ir::Value::from(value)) - } - } - } -} - -impl From for ir::ValueOrRef { - fn from(value: syntax::Expression) -> Self { - match value { - syntax::Expression::Primitive(loc, _, _) => { - panic!("{:?}: couldn't convert to valueorref", loc) + (vec![], ValueOrRef::Ref(loc, ArcIntern::new(name))) } - syntax::Expression::Reference(loc, var) => { - ir::ValueOrRef::Ref(loc, ArcIntern::new(var)) - } + // Primitive expressions are where we do the real work. + syntax::Expression::Primitive(loc, prim, mut expressions) => { + // generate a fresh new name for the binding site we're going to + // introduce, basing the name on wherever we came from; so if this + // expression was bound to `x` originally, it might become `x:23`. + // + // gensym is guaranteed to give us a name that is unused anywhere + // else in the program. + let new_name = gensym(base_name); + let mut prereqs = Vec::new(); + let mut new_exprs = Vec::new(); - syntax::Expression::Value(loc, val) => ir::ValueOrRef::Value(loc, val.into()), + // here we loop through every argument, and recurse on the expressions + // we find. that will give us any new binding sites that *they* introduce, + // and a simple value or reference that we can use in our result. + for expr in expressions.drain(..) 
{ + let (mut cur_prereqs, arg) = expr.rebind(new_name.as_str()); + prereqs.append(&mut cur_prereqs); + new_exprs.push(arg); + } + + // now we're going to use those new arguments to run the primitive, binding + // the results to the new variable we introduced. + let prim = + ir::Primitive::try_from(prim.as_str()).expect("is valid primitive function"); + prereqs.push(ir::Statement::Binding( + loc.clone(), + new_name.clone(), + ir::Expression::Primitive(loc.clone(), prim, new_exprs), + )); + + // and finally, we can return all the new bindings, and a reference to + // the variable we just introduced to hold the value of the primitive + // invocation. + (prereqs, ValueOrRef::Ref(loc, new_name)) + } } } } impl From for ir::Value { - fn from(x: syntax::Value) -> Self { - match x { - syntax::Value::Number(base, value) => ir::Value::Number(base, value), + fn from(value: syntax::Value) -> Self { + match value { + syntax::Value::Number(base, val) => ir::Value::Number(base, val), } } } +impl From for ir::Primitive { + fn from(value: String) -> Self { + value.try_into().unwrap() + } +} + +/// Generate a fresh new name based on the given name. +/// +/// The new name is guaranteed to be unique across the entirety of the +/// execution. This is achieved by using characters in the variable name +/// that would not be valid input, and by including a counter that is +/// incremented on every invocation. +fn gensym(name: &str) -> ArcIntern { + static COUNTER: AtomicUsize = AtomicUsize::new(0); + + let new_name = format!( + "<{}:{}>", + name, + COUNTER.fetch_add(1, std::sync::atomic::Ordering::SeqCst) + ); + ArcIntern::new(new_name) +} + proptest::proptest! 
{ #[test] fn translation_maintains_semantics(input: syntax::Program) { let syntax_result = input.eval(); - let ir = ir::Program::from(input.simplify()); + let ir = ir::Program::from(input); let ir_result = ir.eval(); assert_eq!(syntax_result, ir_result); } diff --git a/src/ir/strings.rs b/src/ir/strings.rs index d0e57a2..f7b291e 100644 --- a/src/ir/strings.rs +++ b/src/ir/strings.rs @@ -3,6 +3,10 @@ use internment::ArcIntern; use std::collections::HashSet; impl Program { + /// Get the complete list of strings used within the program. + /// + /// For the purposes of this function, strings are the variables used in + /// `print` statements. pub fn strings(&self) -> HashSet> { let mut result = HashSet::new(); diff --git a/src/lib.rs b/src/lib.rs index 71d55e5..18ade12 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,75 @@ +//! # NGR (No Good Reason) Compiler +//! +//! This is the top-level module for the NGR compiler; a compiler written +//! in Rust for no good reason. I may eventually try to turn this into a +//! basic guide for writing compilers, but for now it's a fairly silly +//! (although complete) language and implementation, featuring: +//! +//! * Variable binding with basic arithmetic operators. +//! * The ability to print variable values. +//! +//! I'll be extending this list into the future, with the eventual goal of +//! being able to implement basic programming tasks with it. For example, +//! I have a goal of eventually writing reasonably-clear +//! [Advent of Code](https://adventofcode.com/) implementations with it. +//! +//! Users of this as a library will want to choose their adventure based +//! on how much they want to customize their experience; I've defaulted +//! to providing the ability to see internals, rather than masking them, +//! so folks can play with things as they see fit. +//! +//! ## Easy Mode - Just Running a REPL or Compiler +//! +//! For easiest use, you will want to use either the [`Compiler`] object +//! or the [`REPL`] object. 
+//! +//! As you might expect, the [`Compiler`] object builds a compiler, which +//! can be re-used to compile as many files as you'd like. Right now, +//! that's all it does. (TODO: Add a linker function to it.) +//! +//! The [`REPL`] object implements the core of what you'll need to +//! implement a just-in-time compiled read-eval-print loop. It will +//! maintain variable state and make sure that variables are linked +//! appropriately as the loop progresses. +//! +//! ## Hard Mode - Looking at the individual passes +//! +//! This compiler is broken into three core parts: +//! +//! 1. The front-end / syntax engine. This portion of the compiler is +//! responsible for turning basic strings (or files) into a machine- +//! friendly abstract syntax tree. See the [`syntax`] module for +//! more information. +//! 2. The IR. This portion of the compiler will be responsible for +//! high-level code analysis and transformation ... although for +//! now, it doesn't do much at all. See the [`ir`] module for more +//! information. +//! 3. The Backend implementation. This portion of the compiler turns +//! the IR from the previous section into Cranelift structures, and +//! helps with either compiling them via JIT or statically compiling +//! them into a file. The [`backend`] module also contains information +//! about the runtime functions made available to the user. +//! +//! ## Testing +//! +//! Testing is a key focus of this effort. To that end, both the syntax +//! tree used in the syntax module and the IR used in the middle of the +//! compiler both implement `Arbitrary`, and are subject to property-based +//! testing to make sure that various passes work properly. +//! +//! In addition, to support basic equivalence testing, we include support +//! for evaluating all expressions. The [`eval`] module provides some +//! utility support for this work. +//! pub mod backend; pub mod eval; pub mod ir; pub mod syntax; + +/// Implementation module for the high-level compiler. 
+mod compiler; +/// Implementation module for the high-level REPL. +mod repl; + +pub use crate::compiler::Compiler; +pub use crate::repl::REPL; diff --git a/src/repl.rs b/src/repl.rs new file mode 100644 index 0000000..5d511b5 --- /dev/null +++ b/src/repl.rs @@ -0,0 +1,166 @@ +use crate::backend::{Backend, BackendError}; +use crate::ir::Program as IR; +use crate::syntax::{Location, ParserError, Statement}; +use codespan_reporting::diagnostic::Diagnostic; +use codespan_reporting::files::SimpleFiles; +use codespan_reporting::term::{self, Config}; +use cranelift_jit::JITModule; +use cranelift_module::ModuleError; +use pretty::termcolor::{ColorChoice, StandardStream}; +use std::collections::HashMap; + +/// A high-level REPL helper for NGR. +/// +/// This object holds most of the state required to implement some +/// form of interactive compiler for NGR; all you need to do is provide +/// the actual user IO. +/// +/// For most console-based use cases, the [`Default`] implementation +/// should be sufficient; it prints any warnings or errors to `stdout`, +/// using a default color scheme that should work based on the terminal +/// type. For more complex interactions, though, you may want to use +/// the `REPL::new` function to provide your own print substrate.
+pub struct REPL { + file_database: SimpleFiles, + jitter: Backend, + variable_binding_sites: HashMap, + console: StandardStream, + console_config: Config, +} + +impl Default for REPL { + fn default() -> Self { + let console = StandardStream::stdout(ColorChoice::Auto); + REPL::new(console, Config::default()).unwrap() + } +} + +#[allow(clippy::upper_case_acronyms)] +#[derive(Debug, thiserror::Error)] +enum REPLError { + #[error("Error parsing statement: {0}")] + Parser(#[from] ParserError), + #[error("JIT error: {0}")] + JIT(#[from] BackendError), + #[error("Internal cranelift error: {0}")] + Cranelift(#[from] ModuleError), + #[error(transparent)] + Reporting(#[from] codespan_reporting::files::Error), +} + +impl From for Diagnostic { + fn from(value: REPLError) -> Self { + match value { + REPLError::Parser(err) => Diagnostic::from(&err), + REPLError::JIT(err) => Diagnostic::from(err), + REPLError::Cranelift(err) => Diagnostic::bug().with_message(format!("{}", err)), + REPLError::Reporting(err) => Diagnostic::bug().with_message(format!("{}", err)), + } + } +} + +impl REPL { + /// Construct a new REPL helper, using the given stream implementation and console configuration. + /// + /// For most users, the [`Default::default`] implementation will be sufficient; + /// it will use `stdout` and a default console configuration. But if you need to + /// be more specific, this will help you provide more guidance to the REPL as it + /// evaluates things. + pub fn new(console: StandardStream, console_config: Config) -> Result { + Ok(REPL { + file_database: SimpleFiles::new(), + jitter: Backend::jit(None)?, + variable_binding_sites: HashMap::new(), + console, + console_config, + }) + } + + /// Emit a diagnostic to the configured console. + /// + /// This is just a convenience function; there's a lot of boilerplate in printing + /// diagnostics, and it was nice to pull it out into its own function. 
+ fn emit_diagnostic( + &mut self, + diagnostic: Diagnostic, + ) -> Result<(), codespan_reporting::files::Error> { + term::emit( + &mut self.console, + &self.console_config, + &self.file_database, + &diagnostic, + ) + } + + /// Process a line of input, printing any problems or the results. + /// + /// The line number argument is just for a modicum of source information, to + /// provide to the user if some parsing or validation step fails. It can be + /// changed to be any value you like that provides some insight into what + /// failed, although it is probably a good idea for it to be different for + /// every invocation of this function. (Not critical, but a good idea.) + /// + /// Any warnings or errors generated in processing this command will be + /// printed to the configured console. If there are no problems, the + /// command will be compiled and then executed. + pub fn process_input(&mut self, line_no: usize, command: String) { + if let Err(err) = self.process(line_no, command) { + if let Err(e) = self.emit_diagnostic(Diagnostic::from(err)) { + eprintln!( + "WOAH! System having trouble printing error messages. This is very bad. ({})", + e + ); + } + } + } + + /// The internal implementation, with a handy `Result` type. + /// + /// All information from the documentation of `REPL::process_input` applies here, + /// as well; this is the internal implementation of that function, which is + /// differentiated by returning a `Result` type that is hidden from the user + /// in the case of `REPL::process_input`. + fn process(&mut self, line_no: usize, command: String) -> Result<(), REPLError> { + let entry = self.file_database.add("entry".to_string(), command); + let source = self + .file_database + .get(entry) + .expect("entry exists") + .source(); + let syntax = Statement::parse(entry, source)?; + + // if this is a variable binding, and we've never defined this variable before, + // we should tell cranelift about it. 
this is optimistic; if we fail to compile, + // then we won't use this definition until someone tries again. + if let Statement::Binding(_, ref name, _) = syntax { + if !self.variable_binding_sites.contains_key(name.as_str()) { + self.jitter.define_string(name)?; + self.jitter.define_variable(name.clone())?; + } + }; + + let (mut errors, mut warnings) = syntax.validate(&mut self.variable_binding_sites); + let stop = !errors.is_empty(); + let messages = errors + .drain(..) + .map(Into::into) + .chain(warnings.drain(..).map(Into::into)); + + for message in messages { + self.emit_diagnostic(message)?; + } + + if stop { + return Ok(()); + } + + let ir = IR::from(syntax); + let name = format!("line{}", line_no); + let function_id = self.jitter.compile_function(&name, ir)?; + self.jitter.module.finalize_definitions()?; + let compiled_bytes = self.jitter.bytes(function_id); + let compiled_function = unsafe { std::mem::transmute::<_, fn() -> ()>(compiled_bytes) }; + compiled_function(); + Ok(()) + } +} diff --git a/src/syntax.rs b/src/syntax.rs index cbadf0c..0ed88ee 100644 --- a/src/syntax.rs +++ b/src/syntax.rs @@ -1,12 +1,36 @@ +//! NGR Parsing: Reading input, turning it into sense (or errors). +//! +//! This module implement the front end of the compiler, which is responsible for +//! reading in NGR syntax as a string, turning it into a series of reasonable Rust +//! structures for us to manipulate, and doing some validation while it's at it. +//! +//! The core flow for this work is: +//! +//! * Turning the string into a series of language-specific [`Token`]s. +//! * Taking those tokens, and computing a basic syntax tree from them, +//! using our parser ([`ProgramParser`] or [`StatementParser`], generated +//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)). +//! * Validating the tree we have parsed, using [`Program::validate`], +//! returning any warnings or errors we have found. +//! +//! 
In addition to all of this, we make sure that the structures defined in this +//! module are all: +//! +//! * Instances of [`Pretty`](::pretty::Pretty), so that you can print stuff back +//! out that can be read by a human. +//! * Instances of [`Arbitrary`](proptest::prelude::Arbitrary), so they can be +//! used in `proptest`-based property testing. There are built-in tests in +//! the library, for example, to make sure that the pretty-printing round-trips. +//! * Can be evaluated using an `eval` function, for comparison with later +//! versions of the function downstream. use codespan_reporting::{diagnostic::Diagnostic, files::SimpleFiles}; use lalrpop_util::lalrpop_mod; use logos::Logos; mod arbitrary; -pub mod ast; +mod ast; mod eval; mod location; -mod simplify; mod tokens; lalrpop_mod!( #[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)] @@ -18,7 +42,7 @@ mod validate; pub use crate::syntax::ast::*; pub use crate::syntax::location::Location; -use crate::syntax::parser::ProgramParser; +pub use crate::syntax::parser::{ProgramParser, StatementParser}; pub use crate::syntax::tokens::{LexerError, Token}; #[cfg(test)] use ::pretty::{Arena, Pretty}; @@ -29,33 +53,62 @@ use proptest::{prop_assert, prop_assert_eq}; use std::str::FromStr; use thiserror::Error; -use self::parser::StatementParser; - +/// One of the many errors that can occur when processing text input. +/// +/// If you get one of these and want to display it to the user, we strongly +/// suggest using the [`From`] implementation to turn this into a [`Diagnostic`], +/// and then printing it via [`codespan_reporting`]. #[derive(Debug, Error)] pub enum ParserError { + /// Raised by the lexer when we see some text that doesn't make + /// any sense in the language. #[error("Invalid token")] InvalidToken(Location), + + /// Raised when we're parsing the file and run into an EOF in a + /// place we really weren't expecting. 
#[error("Unrecognized EOF")] UnrecognizedEOF(Location, Vec), + + /// Raised when we're parsing the file, and run into a token in a + /// place we weren't expecting it. #[error("Unrecognized token")] UnrecognizedToken(Location, Location, Token, Vec), + + /// Raised when we were expecting the end of the file, but instead + /// got another token. #[error("Extra token")] ExtraToken(Location, Token, Location), + + /// Raised when the lexer just had some sort of internal problem + /// and just gave up. #[error("Lexing failure")] LexFailure(Location), + + /// Raised when we tried to reference a file, or add a file, to our + /// file database, and the database ran into a problem. #[error("File database error")] FileDatabaseError(#[from] codespan_reporting::files::Error), + + /// Raised when the OS is having problems giving us data. #[error("Read error")] ReadError(#[from] std::io::Error), } impl ParserError { + /// Convert one of lalrpop's parser errors into one of our own, which we can more + /// easily implement translation into [`Diagnostic`]. + /// + /// This function is relatively straightforward, because we match the errors pretty + /// closely. The major thing we do here is convert [`lalrpop`]'s notion of a location, + /// which is just an offset that it got from the lexer, into an actual location that + /// we can use in our [`Diagnostic`]s. fn convert(file_idx: usize, err: ParseError) -> Self { match err { ParseError::InvalidToken { location } => { ParserError::InvalidToken(Location::new(file_idx, location)) } - ParseError::UnrecognizedEOF { location, expected } => { + ParseError::UnrecognizedEof { location, expected } => { ParserError::UnrecognizedEOF(Location::new(file_idx, location), expected) } ParseError::UnrecognizedToken { @@ -83,6 +136,10 @@ impl ParserError { } } +/// This is just a nice little function to print out what we expected, if +/// we had some expectations. 
Because English is a little wonky, there's +/// some odd stuff with whether we get 0, 1, 2, or more, and it's nice to +/// just split that bit of logic out. fn display_expected(expected: &[String]) -> String { match expected.len() { 0 => "".to_string(), @@ -96,6 +153,8 @@ fn display_expected(expected: &[String]) -> String { } } +/// Given a list of strings, comma separate (with a space) them, as in an +/// English list. fn comma_separate(strings: &[String]) -> String { let mut result = String::new(); @@ -125,12 +184,12 @@ impl<'a> From<&'a ParserError> for Diagnostic { let expected_str = format!("unexpected token {}{}", token, display_expected(expected)); let unexpected_str = format!("unexpected token {}", token); - let mut labels = start.range_label(end); + let labels = start.range_label(end); Diagnostic::error() .with_labels( labels - .drain(..) + .into_iter() .map(|l| l.with_message(unexpected_str.clone())) .collect(), ) @@ -142,12 +201,12 @@ impl<'a> From<&'a ParserError> for Diagnostic { let expected_str = format!("unexpected token {} after the expected end of file", token); let unexpected_str = format!("unexpected token {}", token); - let mut labels = start.range_label(end); + let labels = start.range_label(end); Diagnostic::error() .with_labels( labels - .drain(..) + .into_iter() .map(|l| l.with_message(unexpected_str.clone())) .collect(), ) @@ -167,6 +226,14 @@ impl<'a> From<&'a ParserError> for Diagnostic { } impl Program { + /// Parse the given file, adding it to the database as part of the process. + /// + /// This operation reads the file from disk and adds it to the database for future + /// reference. If you get an error, we strongly suggest conversion to [`Diagnostic`] + /// and then reporting it to the user via [`codespan_reporting`]. You should use + /// this function if you're pretty sure that you've never seen this file before, + /// and [`Program::parse`] if you have and know its index and already have it in + /// memory. 
pub fn parse_file( file_database: &mut SimpleFiles, file_name: &str, @@ -177,6 +244,11 @@ impl Program { Program::parse(file_handle, file_db_info.source()) } + /// Parse a block of text you have in memory, using the given index for [`Location`]s. + /// + /// If you use a nonsensical file index, everything will work fine until you try to + /// report an error, at which point [`codespan_reporting`] may have some nasty things + /// to say to you. pub fn parse(file_idx: usize, buffer: &str) -> Result { let lexer = Token::lexer(buffer) .spanned() @@ -188,6 +260,12 @@ impl Program { } impl Statement { + /// Parse a statement that you have in memory, using the given index for [`Location`]s. + /// + /// As with [`Program::parse`], if you use a bad file index, you'll get weird behaviors + /// when you try to print errors, but things should otherwise work fine. This function + /// will only parse a single statement, which is useful in the REPL, but probably shouldn't + /// be used when reading in whole files. pub fn parse(file_idx: usize, buffer: &str) -> Result { let lexer = Token::lexer(buffer) .spanned() diff --git a/src/syntax/ast.rs b/src/syntax/ast.rs index ad28025..d71e872 100644 --- a/src/syntax/ast.rs +++ b/src/syntax/ast.rs @@ -1,12 +1,32 @@ use crate::syntax::Location; +/// The set of valid binary operators. pub static BINARY_OPERATORS: &[&str] = &["+", "-", "*", "/"]; +/// A structure represented a parsed program. +/// +/// One `Program` is associated with exactly one input file, and the +/// vector is arranged in exactly the same order as the parsed file. +/// Because this is the syntax layer, the program is guaranteed to be +/// syntactically valid, but may be nonsense. There could be attempts +/// to use unbound variables, for example, until after someone runs +/// `validate` and it comes back without errors. #[derive(Clone, Debug, PartialEq)] pub struct Program { pub statements: Vec, } +/// A parsed statement. 
+/// +/// Statements are guaranteed to be syntactically valid, but may be +/// complete nonsense at the semantic level. Which is to say, all the +/// print statements were correctly formatted, and all the variables +/// referenced are definitely valid symbols, but they may not have +/// been defined or anything. +/// +/// Note that equivalence testing on statements is independent of +/// source location; it is testing if the two statements say the same +/// thing, not if they are the exact same statement. #[derive(Clone, Debug)] pub enum Statement { Binding(Location, String, Expression), @@ -28,6 +48,12 @@ impl PartialEq for Statement { } } +/// An expression in the underlying syntax. +/// +/// Like statements, these expressions are guaranteed to have been +/// formatted correctly, but may not actually make any sense. Also +/// like Statements, the [`PartialEq`] implementation does not take +/// source positions into account. #[derive(Clone, Debug)] pub enum Expression { Value(Location, Value), @@ -54,7 +80,9 @@ impl PartialEq for Expression { } } +/// A value from the source syntax #[derive(Clone, Debug, PartialEq, Eq)] pub enum Value { + /// The value of the number, and an optional base that it was written in Number(Option, i64), } diff --git a/src/syntax/eval.rs b/src/syntax/eval.rs index 15e7b85..6504e26 100644 --- a/src/syntax/eval.rs +++ b/src/syntax/eval.rs @@ -4,11 +4,23 @@ use crate::eval::{EvalEnvironment, EvalError, Value}; use crate::syntax::{Expression, Program, Statement}; impl Program { + /// Evaluate the program, returning either an error or what it prints out when run. + /// + /// Doing this evaluation is particularly useful for testing, to ensure that if we + /// modify a program in some way it does the same thing on both sides of the + /// transformation. It's also sometimes just nice to know what a program will be + /// doing. + /// + /// Note that the errors here are slightly more strict that we enforce at runtime. 
+ /// For example, we check for overflow and underflow errors during evaluation, and + /// we don't check for those in the compiled code. pub fn eval(&self) -> Result { let mut env = EvalEnvironment::empty(); let mut stdout = String::new(); for stmt in self.statements.iter() { + // at this point, evaluation is pretty simple. just walk through each + // statement, in order, and record printouts as we come to them. match stmt { Statement::Binding(_, name, value) => { let actual_value = value.eval(&env)?; @@ -40,6 +52,7 @@ impl Expression { let mut arg_values = Vec::with_capacity(args.len()); for arg in args.iter() { + // yay, recursion! makes this pretty straightforward arg_values.push(arg.eval(env)?); } diff --git a/src/syntax/location.rs b/src/syntax/location.rs index 65e1402..3c97d3d 100644 --- a/src/syntax/location.rs +++ b/src/syntax/location.rs @@ -1,5 +1,9 @@ use codespan_reporting::diagnostic::{Diagnostic, Label}; +/// A source location, for use in pointing users towards warnings and errors. +/// +/// Internally, locations are very tied to the `codespan_reporting` library, +/// and the primary use of them is to serve as anchors within that library. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Location { file_idx: usize, @@ -7,10 +11,22 @@ pub struct Location { } impl Location { + /// Generate a new `Location` from a file index and an offset from the + /// start of the file. + /// + /// The file index is based on the file database being used. See the + /// `codespan_reporting::files::SimpleFiles::add` function, which is + /// normally where we get this index. pub fn new(file_idx: usize, offset: usize) -> Self { Location { file_idx, offset } } + /// Generate a `Location` for a completely manufactured bit of code. + /// + /// Ideally, this is used only in testing, as any code we generate as + /// part of the compiler should, theoretically, be tied to some actual + /// location in the source code. That being said, this can be used in + /// a pinch ... 
just maybe try to avoid it if you can. pub fn manufactured() -> Self { Location { file_idx: 0, @@ -18,27 +34,73 @@ } } + /// Generate a primary label for a [`Diagnostic`], based on this source + /// location. + /// + /// Note, this is just the [`Label`], you'll want to fill in the [`Diagnostic`] + /// with a lot more information. + /// + /// Primary labels are the things that are the key cause of the message. + /// If, for example, it was an error to bind a variable named "x", and + /// then have another binding of a variable named "x", the second one + /// would likely be the primary label (because that's where the error + /// actually happened), but you'd probably want to make the first location + /// the secondary label to help users find it. pub fn primary_label(&self) -> Label { Label::primary(self.file_idx, self.offset..self.offset) } + /// Generate a secondary label for a [`Diagnostic`], based on this source + /// location. + /// + /// Note, this is just the [`Label`], you'll want to fill in the [`Diagnostic`] + /// with a lot more information. + /// + /// Secondary labels are the things that are involved in the message, but + /// aren't necessarily a problem in and of themselves. If, for example, it + /// was an error to bind a variable named "x", and then have another binding + /// of a variable named "x", the second one would likely be the primary + /// label (because that's where the error actually happened), but you'd + /// probably want to make the first location the secondary label to help + /// users find it. pub fn secondary_label(&self) -> Label { Label::secondary(self.file_idx, self.offset..self.offset) } - pub fn range_label(&self, end: &Location) -> Vec> { - if self.file_idx == end.file_idx { - vec![Label::primary(self.file_idx, self.offset..end.offset)] - } else if self.file_idx == 0 { - // if this is a manufactured item, then ...
just try the other one - vec![Label::primary(end.file_idx, end.offset..end.offset)] + /// Given this location and another, generate a primary label that + /// specifies the area between those two locations. + /// + /// See [`Self::primary_label`] for some discussion of primary versus + /// secondary labels. If the two locations are the same, this method does + /// the exact same thing as [`Self::primary_label`]. If this item was + /// generated by [`Self::manufactured`], it will act as if you'd called + /// `primary_label` on the argument. Otherwise, it will generate the obvious + /// span. + /// + /// This function will return `None` only in the case that you provide + /// labels from two different files, which it cannot sensibly handle. + pub fn range_label(&self, end: &Location) -> Option> { + if self.file_idx == 0 { + return Some(end.primary_label()); + } + + if self.file_idx != end.file_idx { + return None; + } + + if self.offset > end.offset { + Some(Label::primary(self.file_idx, end.offset..self.offset)) } else { - // we'll just pick the first location if this is in two different - // files - vec![Label::primary(self.file_idx, self.offset..self.offset)] + Some(Label::primary(self.file_idx, self.offset..end.offset)) } } + /// Return an error diagnostic centered at this location. + /// + /// Note that this [`Diagnostic`] will have no information associated with + /// it other than that (a) there is an error, and (b) that the error is at + /// this particular location. You'll need to extend it with actually useful + /// information, like what kind of error it is. pub fn error(&self) -> Diagnostic { Diagnostic::error().with_labels(vec![Label::primary( self.file_idx, @@ -46,6 +108,12 @@ impl Location { )]) } + /// Return an error diagnostic centered at this location, with the given message. + /// + /// This is much more useful than [`Self::error`], because it actually provides + /// the user with some guidance. 
That being said, you still might want to add + /// even more information to it, using [`Diagnostic::with_labels`], + /// [`Diagnostic::with_notes`], or [`Diagnostic::with_code`]. pub fn labelled_error(&self, msg: &str) -> Diagnostic { Diagnostic::error().with_labels(vec![Label::primary( self.file_idx, diff --git a/src/syntax/parser.lalrpop b/src/syntax/parser.lalrpop index 85b21b3..3d8de29 100644 --- a/src/syntax/parser.lalrpop +++ b/src/syntax/parser.lalrpop @@ -1,14 +1,32 @@ +//! The parser for NGR! +//! +//! This file contains the grammar for the NGR language; a grammar is a nice, +//! machine-readable way to describe how your language's syntax works. For +//! example, here we describe a program as a series of statements, statements +//! as either variable binding or print statements, etc. As the grammar gets +//! more complicated, using tools like [`lalrpop`] becomes even more important. +//! (Although, at some point, things can become so complicated that you might +//! eventually want to leave lalrpop behind.) +//! use crate::syntax::{LexerError, Location}; use crate::syntax::ast::{Program,Statement,Expression,Value}; use crate::syntax::tokens::Token; use internment::ArcIntern; +// one cool thing about lalrpop: we can pass arguments. in this case, the +// file index of the file we're parsing. we combine this with the file offset +// that Logos gives us to make a [`crate::syntax::Location`]. grammar(file_idx: usize); +// this is a slightly odd way to describe this, but: consider this section +// as describing the stuff that is external to the lalrpop grammar that it +// needs to know to do its job. extern { - type Location = usize; + type Location = usize; // Logos, our lexer, implements locations as + // offsets from the start of the file. type Error = LexerError; + // here we redeclare all of the tokens.
enum Token { "=" => Token::Equals, ";" => Token::Semi, @@ -22,57 +40,123 @@ extern { "*" => Token::Operator('*'), "/" => Token::Operator('/'), + // the previous items just match their tokens, and if you try + // to name and use "their value", you get their source location. + // For these, we want "their value" to be their actual contents, + // which is why we put their types in angle brackets. "" => Token::Number((>,)), "" => Token::Variable(>), } } pub Program: Program = { + // a program is just a set of statements => Program { statements: stmts } } Statements: Vec = { + // a statement is either a set of statements followed by another + // statement (note, here, that you can name the result of a sub-parse + // using ) ... => { stmts.push(stmt); stmts }, + + // ... or it's nothing. This may feel like an awkward way to define + // lists of things -- and it is a bit awkward -- but there are actual + // technical reasons that you want to (a) use recursion to define + // these, and (b) use *left* recursion, specifically. That's why, in + // this file, all of the recursive cases are to the left, like they + // are above. + // + // the details of why left recursion is better is actually pretty + // fiddly and in the weeds, and if you're interested you should look + // up LALR parsers versus LL parsers; both their differences and how + // they're constructed, as they're kind of neat. + // + // but if you're just writing grammars with lalrpop, then you should + // just remember that you should always use left recursion, and be + // done with it. => { Vec::new() } } pub Statement: Statement = { + // A statement can be a variable binding. Note, here, that we use this + // funny @L thing to get the source location before the variable, so that + // we can say that this statement spans across everything. "> "=" ";" => Statement::Binding(Location::new(file_idx, l), v.to_string(), e), + + // Alternatively, a statement can just be a print statement.
"print" "> ";" => Statement::Print(Location::new(file_idx, l), v.to_string()), } +// Expressions! Expressions are a little fiddly, because we're going to +// use a little bit of a trick to make sure that we get operator precedence +// right. The trick works by creating a top-level `Expression` grammar entry +// that just points to the thing with the *weakest* precedence. In this case, +// we have addition, subtraction, multiplication, and division, so addition +// and subtraction have the weakest precedence. +// +// Then, as we go down the precedence tree, each item will recurse (left!) +// to other items at the same precedence level. The right hand operator, for +// binary operators (which is all of ours, at the moment) will then be one +// level stronger precendence. In addition, we'll let people just fall through +// to the next level; so if there isn't an addition or subtraction, we'll just +// fall through to the multiplication/division case. +// +// Finally, at the bottom, we'll have the core expressions (like constants, +// variables, etc.) as well as a parenthesized version of `Expression`, which +// gets us right up top again. +// +// Understanding why this works to solve all your operator precedence problems +// is a little hard to give an easy intuition for, but for myself it helped +// to run through a few examples. Consider thinking about how you want to +// parse something like "1 + 2 * 3", for example, versus "1 + 2 + 3" or +// "1 * 2 + 3", and hopefully that'll help. 
Expression: Expression = { AdditiveExpression, } +// we group addition and subtraction under the heading "additive" AdditiveExpression: Expression = { "+" => Expression::Primitive(Location::new(file_idx, l), "+".to_string(), vec![e1, e2]), "-" => Expression::Primitive(Location::new(file_idx, l), "-".to_string(), vec![e1, e2]), MultiplicativeExpression, } +// similarly, we group multiplication and division under "multiplicative" MultiplicativeExpression: Expression = { "*" => Expression::Primitive(Location::new(file_idx, l), "*".to_string(), vec![e1, e2]), "/" => Expression::Primitive(Location::new(file_idx, l), "/".to_string(), vec![e1, e2]), AtomicExpression, } +// finally, we describe our lowest-level expressions as "atomic", because +// they cannot be further divided into parts AtomicExpression: Expression = { + // just a variable reference "> => Expression::Reference(Location::new(file_idx, l), v.to_string()), + // just a number "> => { let val = Value::Number(n.0, n.1); Expression::Value(Location::new(file_idx, l), val) }, + // a tricky case: also just a number, but using a negative sign. an + // alternative way to do this -- and we may do this eventually -- is + // to implement a unary negation expression. this has the odd effect + // that the user never actually writes down a negative number; they just + // write positive numbers which are immediately sent to a negation + // primitive! 
"-" "> => { let val = Value::Number(n.0, -n.1); Expression::Value(Location::new(file_idx, l), val) }, + // finally, let people parenthesize expressions and get back to a + // lower precedence "(" ")" => e, } \ No newline at end of file diff --git a/src/syntax/simplify.rs b/src/syntax/simplify.rs deleted file mode 100644 index 28ad377..0000000 --- a/src/syntax/simplify.rs +++ /dev/null @@ -1,63 +0,0 @@ -use crate::syntax::ast::{Expression, Program, Statement}; - -impl Program { - pub fn simplify(mut self) -> Self { - let mut new_statements = Vec::new(); - let mut gensym_index = 1; - - for stmt in self.statements.drain(..) { - new_statements.append(&mut stmt.simplify(&mut gensym_index)); - } - - self.statements = new_statements; - self - } -} - -impl Statement { - pub fn simplify(self, gensym_index: &mut usize) -> Vec { - let mut new_statements = vec![]; - - match self { - Statement::Print(_, _) => new_statements.push(self), - Statement::Binding(_, _, Expression::Reference(_, _)) => new_statements.push(self), - Statement::Binding(_, _, Expression::Value(_, _)) => new_statements.push(self), - Statement::Binding(loc, name, value) => { - let (mut prereqs, new_value) = value.rebind(&name, gensym_index); - new_statements.append(&mut prereqs); - new_statements.push(Statement::Binding(loc, name, new_value)) - } - } - - new_statements - } -} - -impl Expression { - fn rebind(self, base_name: &str, gensym_index: &mut usize) -> (Vec, Expression) { - match self { - Expression::Value(_, _) => (vec![], self), - Expression::Reference(_, _) => (vec![], self), - Expression::Primitive(loc, prim, mut expressions) => { - let mut prereqs = Vec::new(); - let mut new_exprs = Vec::new(); - - for expr in expressions.drain(..) 
{ - let (mut cur_prereqs, arg) = expr.rebind(base_name, gensym_index); - prereqs.append(&mut cur_prereqs); - new_exprs.push(arg); - } - - let new_name = format!("<{}:{}>", base_name, *gensym_index); - *gensym_index += 1; - prereqs.push(Statement::Binding( - loc.clone(), - new_name.clone(), - Expression::Primitive(loc.clone(), prim, new_exprs), - )); - - (prereqs, Expression::Reference(loc, new_name)) - } - } -} diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index 78d0c8a..e20757d 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -4,8 +4,30 @@ use std::fmt; use std::num::ParseIntError; use thiserror::Error; +/// A single token of the input stream; used to help the parsing go down +/// more easily. +/// +/// The key way to generate this structure is via the [`Logos`] trait. +/// See the [`logos`] documentation for more information; we use the +/// [`Token::lexer`] function internally. +/// +/// The first step in the compilation process is turning the raw string +/// data (in UTF-8, which is its own joy) into a sequence of more sensible +/// tokens. Here, for example, we turn "x=5" into three tokens: a +/// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and +/// then a [`Token::Number`] for the "5". Later on, we'll worry about +/// making sense of those three tokens. +/// +/// For now, our list of tokens is relatively straightforward. We'll +/// need/want to extend these later. +/// +/// The [`std::fmt::Display`] implementation for [`Token`] should +/// round-trip; if you lex a string generated with the [`std::fmt::Display`] +/// trait, you should get back the exact same token. #[derive(Logos, Clone, Debug, PartialEq, Eq)] pub enum Token { + // Our first set of tokens are simple characters that we're + // going to use to structure NGR programs.
#[token("=")] Equals, @@ -18,12 +40,20 @@ pub enum Token { #[token(")")] RightParen, + // Next we take of any reserved words; I always like to put + // these before we start recognizing more complicated regular + // expressions. I don't think it matters, but it works for me. #[token("print")] Print, + // Next are the operators for NGR. We only have 4, now, but + // we might extend these later, or even make them user-definable! #[regex(r"[+\-*/]", |v| v.slice().chars().next())] Operator(char), + /// Numbers capture both the value we read from the input, + /// converted to an `i64`, as well as the base the user used + /// to write the number, if they did so. #[regex(r"0b[01]+", |v| parse_number(Some(2), v))] #[regex(r"0o[0-7]+", |v| parse_number(Some(8), v))] #[regex(r"0d[0-9]+", |v| parse_number(Some(10), v))] @@ -31,12 +61,23 @@ pub enum Token { #[regex(r"[0-9]+", |v| parse_number(None, v))] Number((Option, i64)), + // Variables; this is a very standard, simple set of characters + // for variables, but feel free to experiment with more complicated + // things. I chose to force variables to start with a lower case + // letter, too. #[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))] Variable(ArcIntern), + // the next token will be an error token #[error] + // we're actually just going to skip whitespace, though #[regex(r"[ \t\r\n\f]+", logos::skip)] + // this is an extremely simple version of comments, just line + // comments. More complicated /* */ comments can be harder to + // implement, and didn't seem worth it at the time. #[regex(r"//.*", logos::skip)] + /// This token represents that some core error happened in lexing; + /// possibly that something didn't match anything at all. Error, } @@ -63,19 +104,28 @@ impl fmt::Display for Token { } } +/// A sudden and unexpected error in the lexer. 
#[derive(Debug, Error, PartialEq, Eq)] pub enum LexerError { + /// The `usize` here is the offset that we ran into the problem, given + /// from the start of the file. #[error("Failed lexing at {0}")] LexFailure(usize), } #[cfg(test)] impl Token { + /// Create a variable token with the given name. Very handy for + /// testing. pub(crate) fn var(s: &str) -> Token { Token::Variable(ArcIntern::new(s.to_string())) } } +/// Parse a number in the given base, return a pair of the base and the +/// parsed number. This is just a helper used for all of the number +/// regular expression cases, which kicks off to the obvious Rust +/// standard library function. fn parse_number( base: Option, value: &Lexer, diff --git a/src/syntax/validate.rs b/src/syntax/validate.rs index da2410c..c318e93 100644 --- a/src/syntax/validate.rs +++ b/src/syntax/validate.rs @@ -2,6 +2,13 @@ use crate::syntax::{Expression, Location, Program, Statement}; use codespan_reporting::diagnostic::Diagnostic; use std::collections::HashMap; +/// An error we found while validating the input program. +/// +/// These errors indicate that we should stop trying to compile +/// the program, because it's just fundamentally broken in a way +/// that we're not going to be able to work through. As with most +/// of these errors, we recommend converting this to a [`Diagnostic`] +/// and using [`codespan_reporting`] to present them to the user. pub enum Error { UnboundVariable(Location, String), } @@ -16,6 +23,13 @@ impl From for Diagnostic { } } +/// A problem we found validating the input that isn't critical. +/// +/// These are things that the user might want to do something about, +/// but we can keep going without it being a problem. As with most of +/// these things, if you want to present this information to the user, +/// the best way to do so is via [`From`] and [`Diagnostic`], and then +/// interactions via [`codespan_reporting`]. 
#[derive(Debug, PartialEq, Eq)] pub enum Warning { ShadowedVariable(Location, Location, String), @@ -37,6 +51,11 @@ impl From for Diagnostic { } impl Program { + /// Validate that the program makes semantic sense, not just syntactic sense. + /// + /// This checks for things like references to variables that don't exist, for + /// example, and generates warnings for things that are inadvisable but not + /// actually a problem. pub fn validate(&self) -> (Vec, Vec) { let mut errors = vec![]; let mut warnings = vec![]; @@ -53,6 +72,15 @@ impl Program { } impl Statement { + /// Validate that the statement makes semantic sense, not just syntactic sense. + /// + /// This checks for things like references to variables that don't exist, for + /// example, and generates warnings for things that are inadvisable but not + /// actually a problem. Since statements appear in a broader context, you'll + /// need to provide the set of variables that are bound where this statement + /// occurs. We use a `HashMap` to map these bound variables to the locations + /// where they're bound, because these locations are handy when generating errors + /// and warnings. pub fn validate( &self, bound_variables: &mut HashMap,