📜 Add better documentation across the compiler. #3

Merged
acw merged 19 commits from acw/better-docs into develop 2023-05-13 12:34:48 -07:00
28 changed files with 1550 additions and 432 deletions

View File

@@ -19,7 +19,7 @@ cranelift-module = "0.94.0"
cranelift-native = "0.94.0"
cranelift-object = "0.94.0"
internment = { version = "0.7.0", default-features = false, features = ["arc"] }
lalrpop-util = "^0.19.7"
lalrpop-util = "^0.20.0"
lazy_static = "^1.4.0"
logos = "^0.12.0"
pretty = { version = "^0.11.2", features = ["termcolor"] }
@@ -30,4 +30,4 @@ tempfile = "^3.5.0"
thiserror = "^1.0.30"
[build-dependencies]
lalrpop = "^0.19.7"
lalrpop = "^0.20.0"

View File

@@ -1,3 +1,31 @@
//! # The compiler backend: generation of machine code, both static and JIT.
//!
//! This module is responsible for taking our intermediate representation from
//! [`crate::ir`] and turning it into Cranelift and then into object code that
//! can either be saved to disk or run in memory. Because the runtime functions
//! for NGR are very closely tied to the compiler implementation, we also include
//! information about these functions as part of the module.
//!
//! ## Using the `Backend`
//!
//! The backend of this compiler can be used in two modes: a static compilation
//! mode, where the goal is to write the compiled object to disk and then link
//! it later, and a JIT mode, where the goal is to write the compiled object to
//! memory and then run it. Both modes use the same `Backend` object, because
//! they share a lot of behaviors. However, you'll want to use different variants
//! based on your goals:
//!
//! * Use `Backend<ObjectModule>`, constructed via [`Backend::object_file`],
//! if you want to compile to an object file on disk, which you're then going
//! to link to later.
//! * Use `Backend<JITModule>`, constructed via [`Backend::jit`], if you want
//! to do just-in-time compilation and are just going to run things immediately.
//!
//! ## Working with Runtime Functions
//!
//! For now, runtime functions are pretty easy to describe, because there's
//! only one. In the future, though, the [`RuntimeFunctions`] object is there to
//! help provide a clean interface to them all.
mod error;
mod eval;
mod into_crane;
@@ -16,6 +44,15 @@ use target_lexicon::Triple;
const EMPTY_DATUM: [u8; 8] = [0; 8];
/// An object representing an active backend.
///
/// Internally, this object holds a bunch of state useful for compiling one
/// or more functions into an object file or memory. It can be passed around,
/// but cannot currently be duplicated because some of that state is not
/// easily duplicated. You should be able to share this across threads, assuming
/// normal Rust safety, but you should be thoughtful about transferring it across
/// processes in a JIT context due to some special cases in the runtime function
/// implementations.
pub struct Backend<M: Module> {
pub module: M,
data_ctx: DataContext,
@@ -26,6 +63,12 @@ pub struct Backend<M: Module> {
}
impl Backend<JITModule> {
/// Create a new JIT backend for compiling NGR into memory.
///
/// The provided output buffer is not for the compiled code, but for the output
/// of any `print` expressions that are evaluated. If set to `None`, the output
/// will be written to `stdout` as per normal, but if a String buffer is provided,
/// it will be extended by any `print` statements that happen during code execution.
pub fn jit(output_buffer: Option<String>) -> Result<Self, BackendError> {
let platform = Triple::host();
let isa_builder = isa::lookup(platform.clone())?;
@@ -50,12 +93,24 @@ impl Backend<JITModule> {
})
}
/// Given a compiled function ID, get a pointer to where that function was written
/// in memory.
///
/// The data at this pointer should not be mutated unless you really, really,
/// really know what you're doing. It can be run by casting it into a Rust
/// `fn() -> ()`, and then calling it from normal Rust.
pub fn bytes(&self, function_id: FuncId) -> *const u8 {
self.module.get_finalized_function(function_id)
}
}
impl Backend<ObjectModule> {
/// Generate a backend for compiling into an object file for the given target.
///
/// This backend will generate a single output file per `Backend` object, although
/// that file may have multiple functions defined within it. Data between those
/// functions (in particular, strings) will be defined once and shared between
/// the different functions.
pub fn object_file(platform: Triple) -> Result<Self, BackendError> {
let isa_builder = isa::lookup(platform.clone())?;
let mut settings_builder = settings::builder();
@@ -76,12 +131,22 @@ impl Backend<ObjectModule> {
})
}
/// Given all the functions defined, return the bytes the object file should contain.
pub fn bytes(self) -> Result<Vec<u8>, BackendError> {
self.module.finish().emit().map_err(Into::into)
}
}
impl<M: Module> Backend<M> {
/// Define a string within the current backend.
///
/// Note that this is a Cranelift [`DataId`], which then must be redeclared inside the
/// context of any functions or data items that want to use it. That being said, the
/// string value will be defined once in the file and then shared by all referencers.
///
/// This function will automatically add a null character (`'\0'`) to the end of the
//! string, to ensure that strings are null-terminated for interactions with other
/// languages.
pub fn define_string(&mut self, s: &str) -> Result<DataId, BackendError> {
let name = format!("<string_constant>{}", s);
let s0 = format!("{}\0", s);
@@ -97,6 +162,11 @@ impl<M: Module> Backend<M> {
Ok(global_id)
}
/// Define a global variable within the current backend.
///
/// These variables can be shared between functions, and will be exported from the
/// module itself as public data in the case of static compilation. Their initial
/// value will be null.
pub fn define_variable(&mut self, name: String) -> Result<DataId, BackendError> {
self.data_ctx.define(Box::new(EMPTY_DATUM));
let id = self
@@ -108,6 +178,11 @@ impl<M: Module> Backend<M> {
Ok(id)
}
/// Get a pointer to the output buffer for `print`ing, or `null`.
///
/// As suggested, returns `null` in the case where the user has not provided an
/// output buffer; it is your responsibility to check for this case and do
/// something sensible.
pub fn output_buffer_ptr(&mut self) -> *mut String {
if let Some(str) = self.output_buffer.as_mut() {
str as *mut String
@@ -116,6 +191,10 @@ impl<M: Module> Backend<M> {
}
}
/// Get any captured output `print`ed by the program during execution.
///
/// If an output buffer was not provided, or if the program has not done any
/// printing, then this function will return an empty string.
pub fn output(self) -> String {
if let Some(s) = self.output_buffer {
s

View File

@@ -2,8 +2,27 @@ use crate::backend::runtime::RuntimeFunctionError;
use codespan_reporting::diagnostic::Diagnostic;
use cranelift_codegen::{isa::LookupError, settings::SetError, CodegenError};
use cranelift_module::ModuleError;
use internment::ArcIntern;
use thiserror::Error;
/// An error in the translation to a backend (either the JIT or the static compiler).
///
/// In general, this is just a nice summary error type for a bunch of downstream
/// errors; the exception are internal errors from builtin functions or variable
/// lookups.
///
/// Unlike some other errors in the system, the translation to a `Diagnostic` does
/// not necessarily provide a whole lot of value, because we have lost most of the
/// source information by the time we're generating these errors. That being said,
/// people who want to provide nicer error messages might consider using the
/// translation through `Diagnostic` anyways, just in case we add more information
/// in the future.
///
/// Finally, the `PartialEq` for this type is a bit fuzzy. In some cases, it
/// ensures that the errors match exactly. In other cases, though, it just checks to
/// see if the two errors are of the same class; e.g., it will return true if both
/// errors are `BackendError::CodegenError`, regardless of what the specific
/// `CodegenError` is.
#[derive(Debug, Error)]
pub enum BackendError {
#[error("Cranelift module error: {0}")]
@@ -11,7 +30,7 @@ pub enum BackendError {
#[error("Builtin function error: {0}")]
BuiltinError(#[from] RuntimeFunctionError),
#[error("Internal variable lookup error")]
VariableLookupFailure,
VariableLookupFailure(ArcIntern<String>),
#[error(transparent)]
CodegenError(#[from] CodegenError),
#[error(transparent)]
@@ -31,9 +50,8 @@ impl From<BackendError> for Diagnostic<usize> {
BackendError::BuiltinError(me) => {
Diagnostic::error().with_message(format!("Internal runtime function error: {}", me))
}
BackendError::VariableLookupFailure => {
Diagnostic::error().with_message("Internal variable lookup error!")
}
BackendError::VariableLookupFailure(x) => Diagnostic::error()
.with_message(format!("Internal variable lookup error for {}", x)),
BackendError::CodegenError(me) => {
Diagnostic::error().with_message(format!("Internal codegen error: {}", me))
}
@@ -58,8 +76,12 @@ impl PartialEq for BackendError {
_ => false,
},
// because the underlying `CodegenError` doesn't implement `PartialEq`,
// we just check that they're both `CodegenError`s.
BackendError::CodegenError(_) => matches!(other, BackendError::CodegenError(_)),
// because the underlying `ModuleError` doesn't implement `PartialEq`,
// we just check that they're both `Cranelift`s.
BackendError::Cranelift(_) => matches!(other, BackendError::Cranelift(_)),
BackendError::LookupError(a) => match other {
@@ -72,7 +94,10 @@ impl PartialEq for BackendError {
_ => false,
},
BackendError::VariableLookupFailure => other == &BackendError::VariableLookupFailure,
BackendError::VariableLookupFailure(a) => match other {
BackendError::VariableLookupFailure(b) => a == b,
_ => false,
},
BackendError::Write(a) => match other {
BackendError::Write(b) => a == b,

View File

@@ -8,6 +8,19 @@ use cranelift_object::ObjectModule;
use target_lexicon::Triple;
impl Backend<JITModule> {
/// Evaluate the given IR, returning the output it prints.
///
/// This builds and executes the program using the JIT backend, using a fresh JIT runtime
/// that should be independent of any other runtimes being executed. As such, it should be
/// impossible for a program being executed by this function to interact with another, parallel
/// execution of the function. If you actually want them to interact, you'll need to combine
/// them into the same `Program` before execution.
///
/// One important note: The runtime used by this function does not currently implement
/// overflow/underflow erroring the same way that other evaluation functions within this
/// library do. So, if you're validating equivalence between them, you'll want to weed
/// out examples that overflow/underflow before checking equivalence. (This is the behavior
/// of the built-in test systems.)
pub fn eval(program: Program) -> Result<String, EvalError> {
let mut jitter = Backend::jit(Some(String::new()))?;
let function_id = jitter.compile_function("test", program)?;
@@ -20,6 +33,20 @@ impl Backend<JITModule> {
}
impl Backend<ObjectModule> {
/// Evaluate the given IR, returning the output it prints.
///
/// This builds the program as a standalone object in a temporary directory, and then links
/// and runs it using the provided runtime system (see `CARGO_MANIFEST_DIR/runtime/`). To
/// do so, it assumes that there is a version of `clang` available in the current PATH.
///
/// This routine is regularly tested under Windows, Mac, and Linux, and should work across
/// other platforms that support `clang`.
///
/// One important note: The runtime used by this function does not currently implement
/// overflow/underflow erroring the same way that other evaluation functions within this
/// library do. So, if you're validating equivalence between them, you'll want to weed
/// out examples that overflow/underflow before checking equivalence. (This is the behavior
/// of the built-in test systems.)
pub fn eval(program: Program) -> Result<String, EvalError> {
//use pretty::{Arena, Pretty};
//let allocator = Arena::<()>::new();
@@ -40,18 +67,26 @@ impl Backend<ObjectModule> {
if output.status.success() {
Ok(std::string::String::from_utf8_lossy(&output.stdout).to_string())
} else {
Err(EvalError::IO(format!(
"Exitted with error code {}",
output.status
)))
Err(EvalError::ExitCode(output.status))
}
} else {
Err(EvalError::IO(
Err(EvalError::RuntimeOutput(
std::string::String::from_utf8_lossy(&output.stderr).to_string(),
))
}
}
/// Link the generated object into an executable.
///
/// Currently, our runtime system is a single file, and ends up being the file
/// that includes `main`. (It then calls the `gogogo` function which serves as the
/// entry point for our compiled code.) This function thus just uses `clang` to
/// compile the C file with the generated object file to produce the executable.
/// Conveniently, `clang` also sets execute permissions under unix-like file systems.
///
/// This function assumes that this compilation and linking should run without any
/// output, so changes to the RTS should make 100% sure that they do not generate
/// any compiler warnings.
fn link(object_file: &Path, executable_path: &Path) -> Result<(), EvalError> {
use std::path::PathBuf;
@@ -67,7 +102,7 @@ impl Backend<ObjectModule> {
.output()?;
if !output.stderr.is_empty() {
return Err(EvalError::IO(
return Err(EvalError::Linker(
std::string::String::from_utf8_lossy(&output.stderr).to_string(),
));
}
@@ -77,12 +112,17 @@ impl Backend<ObjectModule> {
}
proptest::proptest! {
// This is the obvious test to make sure that our static compilation path works
// without error, assuming any possible input ... well, any possible input that
// doesn't involve overflow or underflow.
#[test]
fn file_backend_works(program: Program) {
fn static_backend(program: Program) {
use crate::eval::PrimOpError;
let basic_result = program.eval();
// windows `printf` is going to terminate lines with "\r\n", so we need to adjust
// our test result here.
#[cfg(target_family="windows")]
let basic_result = basic_result.map(|x| x.replace('\n', "\r\n"));
@@ -92,8 +132,11 @@ proptest::proptest! {
}
}
// This is the obvious test to make sure that our JIT compilation path works
// without error, assuming any possible input ... well, any possible input that
// doesn't involve overflow or underflow.
#[test]
fn jit_backend_works(program: Program) {
fn jit_backend(program: Program) {
use crate::eval::PrimOpError;
let basic_result = program.eval();

View File

@@ -8,15 +8,31 @@ use cranelift_codegen::ir::{
use cranelift_codegen::isa::CallConv;
use cranelift_codegen::Context;
use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable};
use cranelift_module::{FuncId, Linkage, Module, ModuleError};
use cranelift_module::{FuncId, Linkage, Module};
use internment::ArcIntern;
use crate::backend::error::BackendError;
use crate::backend::Backend;
/// When we're compiling, we might need to reference some of the strings built into
/// the source code; to do so, we need a `GlobalValue`. Perhaps unexpectedly, given
/// the name, `GlobalValue`s are specific to a single function we're compiling, so
/// we end up computing this table for every function.
///
/// This is just a handy type alias to avoid a lot of confusion in the functions.
type StringTable = HashMap<ArcIntern<String>, GlobalValue>;
impl<M: Module> Backend<M> {
/// Compile the given `Program` into a function with the given name.
///
/// At some point, the use of `Program` is going to change; however, for the
/// moment, we have no notion of a function in our language so the whole input
/// is converted into a single output function. The type of the generated
/// function is, essentially, `fn() -> ()`: it takes no arguments and returns
/// no value.
///
/// The function provided can then be either written to a file (if using a
/// static Cranelift backend) or executed directly (if using the Cranelift JIT).
pub fn compile_function(
&mut self,
function_name: &str,
@@ -28,21 +44,47 @@ impl<M: Module> Backend<M> {
call_conv: CallConv::SystemV,
};
// this generates the handle for the function that we'll eventually want to
// return to the user. For now, we declare all functions defined by this
// function as public/global/exported, although we may want to reconsider
// this decision later.
let func_id =
self.module
.declare_function(function_name, Linkage::Export, &basic_signature)?;
let mut ctx = Context::new();
ctx.func =
Function::with_name_signature(UserFuncName::user(0, func_id.as_u32()), basic_signature);
// Next we have to generate the compilation context for the rest of this
// function. Currently, we generate a fresh context for every function.
// Since we're only generating one function per `Program`, this makes
// complete sense. However, in the future, we may want to revisit this
// decision.
let mut ctx = Context::new();
let user_func_name = UserFuncName::user(0, func_id.as_u32());
ctx.func = Function::with_name_signature(user_func_name, basic_signature);
// We generate a table of every string that we use in the program, here.
// Cranelift is going to require us to have this in a particular structure
// (`GlobalValue`) so that we can reference them later, and it's going to
// be tricky to generate those on the fly. So we just generate the set we
// need here, and then have it around in the table for later.
let string_table = self.build_string_table(&mut ctx.func, &program)?;
let mut variable_table = HashMap::new();
let mut next_var_num = 1;
// In the future, we might want to see what runtime functions the function
// we were given uses, and then only include those functions that we care
// about. Presumably, we'd use some sort of lookup table like we do for
// strings. But for now, we only have one runtime function, and we're pretty
// sure we're always going to use it, so we just declare it (and reference
// it) directly.
let print_func_ref = self.runtime_functions.include_runtime_function(
"print",
&mut self.module,
&mut ctx.func,
)?;
// In the case of the JIT, there may be symbols we've already defined outside
// the context of this particular `Program`, which we might want to reference.
// Just like with strings, generating the `GlobalValue`s we need can potentially
// be a little tricky to do on the fly, so we generate the complete list right
// here and then use it later.
let pre_defined_symbols: HashMap<String, GlobalValue> = self
.defined_symbols
.iter()
@@ -52,67 +94,88 @@ impl<M: Module> Backend<M> {
})
.collect();
// The last table we're going to need is our local variable table, to store
// variables used in this `Program` but not used outside of it. For whatever
// reason, Cranelift requires us to generate unique indexes for each of our
// variables; we just use a simple incrementing counter for that.
let mut variable_table = HashMap::new();
let mut next_var_num = 1;
// Finally (!), we generate the function builder that we're going to use to
// make this function!
let mut fctx = FunctionBuilderContext::new();
let mut builder = FunctionBuilder::new(&mut ctx.func, &mut fctx);
// Make the initial block to put instructions in. Later, when we have control
// flow, we might add more blocks after this one. But, for now, we only have
// the one block.
let main_block = builder.create_block();
builder.switch_to_block(main_block);
// Compiling a function is just compiling each of the statements in order.
// At the moment, we do the pattern match for statements here, and then
// directly compile the statements. If/when we add more statement forms,
// this is likely to become more cumbersome, and we'll want to separate
// these off. But for now, given the amount of tables we keep around to track
// state, it's easier to just include them.
for stmt in program.statements.drain(..) {
match stmt {
// Print statements are fairly easy to compile: we just lookup the
// output buffer, the address of the string to print, and the value
// of whatever variable we're printing. Then we just call print.
Statement::Print(ann, var) => {
// Get the output buffer (or null) from our general compilation context.
let buffer_ptr = self.output_buffer_ptr();
let buffer_ptr = builder.ins().iconst(types::I64, buffer_ptr as i64);
// Get a reference to the string we want to print.
let local_name_ref = string_table.get(&var).unwrap();
let name_ptr = builder.ins().symbol_value(types::I64, *local_name_ref);
let val = ValueOrRef::Ref(ann, var).into_cranelift(
// Look up the value for the variable. Because this might be a
// global variable (and that requires special logic), we just turn
// this into an `Expression` and re-use the logic in that implementation.
let val = Expression::Reference(ann, var).into_crane(
&mut builder,
&variable_table,
&pre_defined_symbols,
)?;
// Finally, we can generate the call to print.
builder
.ins()
.call(print_func_ref, &[buffer_ptr, name_ptr, val]);
}
// Variable binding is a little more complex.
Statement::Binding(_, var_name, value) => {
let val = match value {
Expression::Value(_, Value::Number(_, v)) => {
builder.ins().iconst(types::I64, v)
}
Expression::Reference(_, name) => {
let value_var_num = variable_table.get(&name).unwrap();
builder.use_var(Variable::new(*value_var_num))
}
Expression::Primitive(_, prim, mut vals) => {
let right = vals.pop().unwrap().into_cranelift(
&mut builder,
&variable_table,
&pre_defined_symbols,
)?;
let left = vals.pop().unwrap().into_cranelift(
&mut builder,
&variable_table,
&pre_defined_symbols,
)?;
match prim {
Primitive::Plus => builder.ins().iadd(left, right),
Primitive::Minus => builder.ins().isub(left, right),
Primitive::Times => builder.ins().imul(left, right),
Primitive::Divide => builder.ins().sdiv(left, right),
}
}
};
// Kick off to the `Expression` implementation to see what value we're going
// to bind to this variable.
let val =
value.into_crane(&mut builder, &variable_table, &pre_defined_symbols)?;
// Now the question is: is this a local variable, or a global one?
if let Some(global_id) = pre_defined_symbols.get(var_name.as_str()) {
// It's a global variable! In this case, we assume that someone has already
// dedicated some space in memory to store this value. We look this location
// up, and then tell Cranelift to store the value there.
let val_ptr = builder.ins().symbol_value(types::I64, *global_id);
builder.ins().store(MemFlags::new(), val, val_ptr, 0);
} else {
// It's a local variable! In this case, we need to allocate a new Cranelift
// `Variable` for this variable, which we do using our `next_var_num` counter.
// (While we're doing this, we also increment `next_var_num`, so that we get
// a fresh `Variable` next time. This is one of those very narrow cases in which
// I wish Rust had an increment expression.)
let var = Variable::new(next_var_num);
variable_table.insert(var_name, next_var_num);
next_var_num += 1;
// We can add the variable directly to our local variable map; it's `Copy`.
variable_table.insert(var_name, var);
// Now we tell Cranelift about our new variable, which has type I64 because
// everything we have at this point is of type I64. Once it's declared, we
// define it as having the value we computed above.
builder.declare_var(var, types::I64);
builder.def_var(var, val);
}
@@ -120,15 +183,30 @@ impl<M: Module> Backend<M> {
}
}
// Now that we're done, inject a return instruction (one with no actual value; basically
// the equivalent of Rust's `return;`). We then seal the block (which lets Cranelift
// know that the block is done), and then finalize the function (which lets Cranelift
// know we're done with the function).
builder.ins().return_(&[]);
builder.seal_block(main_block);
builder.finalize();
// This is a little odd. We want to tell the rest of Cranelift about this function,
// so we register it using the function ID and our builder context. However, the
// result of this function isn't actually super helpful. So we ignore it, unless
// it's an error.
let _ = self.module.define_function(func_id, &mut ctx)?;
// done!
Ok(func_id)
}
// Build the string table for use in referencing strings later.
//
// This function is slightly smart, in that it only puts strings in the table that
// are used by the `Program`. (Thanks to `Program::strings()`!) If the strings have
// been declared globally, via `Backend::define_string()`, we will re-use that data.
// Otherwise, this will define the string for you.
fn build_string_table(
&mut self,
func: &mut Function,
@@ -149,30 +227,73 @@ impl<M: Module> Backend<M> {
}
}
impl ValueOrRef {
fn into_cranelift(
impl Expression {
fn into_crane(
self,
builder: &mut FunctionBuilder,
local_variables: &HashMap<ArcIntern<String>, usize>,
local_variables: &HashMap<ArcIntern<String>, Variable>,
global_variables: &HashMap<String, GlobalValue>,
) -> Result<entities::Value, ModuleError> {
) -> Result<entities::Value, BackendError> {
match self {
ValueOrRef::Value(_, value) => match value {
Value::Number(_base, numval) => Ok(builder.ins().iconst(types::I64, numval)),
},
// Values are pretty straightforward to compile, mostly because we only
// have one type of variable, and it's an integer type.
Expression::Value(_, Value::Number(_, v)) => Ok(builder.ins().iconst(types::I64, v)),
ValueOrRef::Ref(_, name) => {
if let Some(local_num) = local_variables.get(&name) {
return Ok(builder.use_var(Variable::new(*local_num)));
Expression::Reference(_, name) => {
// first we see if this is a local variable (which is nicer, from an
// optimization point of view.)
if let Some(local_var) = local_variables.get(&name) {
return Ok(builder.use_var(*local_var));
}
if let Some(global_id) = global_variables.get(name.as_str()) {
let val_ptr = builder.ins().symbol_value(types::I64, *global_id);
// then we check to see if this is a global reference, which requires us to
// first lookup where the value is stored, and then load it.
if let Some(global_var) = global_variables.get(name.as_ref()) {
let val_ptr = builder.ins().symbol_value(types::I64, *global_var);
return Ok(builder.ins().load(types::I64, MemFlags::new(), val_ptr, 0));
}
Err(ModuleError::Undeclared(name.to_string()))
// this should never happen, because we should have made sure that there are
// no unbound variables a long time before this. but still ...
Err(BackendError::VariableLookupFailure(name))
}
Expression::Primitive(_, prim, mut vals) => {
// we're going to use `pop`, so we're going to pull and compile the right value ...
let right =
vals.pop()
.unwrap()
.into_crane(builder, local_variables, global_variables)?;
// ... and then the left.
let left =
vals.pop()
.unwrap()
.into_crane(builder, local_variables, global_variables)?;
// then we just need to tell Cranelift how to do each of our primitives! Much
// like Statements, above, we probably want to eventually shuffle this off into
// a separate function (maybe something off `Primitive`), but for now it's simple
// enough that we just do the `match` here.
match prim {
Primitive::Plus => Ok(builder.ins().iadd(left, right)),
Primitive::Minus => Ok(builder.ins().isub(left, right)),
Primitive::Times => Ok(builder.ins().imul(left, right)),
Primitive::Divide => Ok(builder.ins().sdiv(left, right)),
}
}
}
}
}
// Just to avoid duplication, this leverages the `From<ValueOrRef>` implementation
// on `Expression` to compile this via the `Expression` logic, above.
impl ValueOrRef {
/// Compile this value-or-reference into a Cranelift SSA value.
///
/// This is a thin wrapper: it converts `self` into an `Expression` (via the
/// `From<ValueOrRef>` conversion) and delegates to `Expression::into_crane`,
/// so value and variable-reference handling is identical to full expressions.
fn into_crane(
self,
builder: &mut FunctionBuilder,
local_variables: &HashMap<ArcIntern<String>, Variable>,
global_variables: &HashMap<String, GlobalValue>,
) -> Result<entities::Value, BackendError> {
Expression::from(self).into_crane(builder, local_variables, global_variables)
}
}

View File

@@ -8,9 +8,14 @@ use std::fmt::Write;
use target_lexicon::Triple;
use thiserror::Error;
/// An object for querying / using functions built into the runtime.
///
/// Right now, this is quite a bit of boilerplate for very nebulous
/// value. However, as the number of built-in functions gets large, it's
/// nice to have a single point to register and query them, so here we
/// go.
pub struct RuntimeFunctions {
builtin_functions: HashMap<String, FuncId>,
_referenced_functions: Vec<String>,
}
#[derive(Debug, Error, PartialEq)]
@@ -19,25 +24,27 @@ pub enum RuntimeFunctionError {
CannotFindRuntimeFunction(String),
}
extern "C" fn runtime_print(output_buffer: *mut String, name: *const i8, value: i64) {
let cstr = unsafe { CStr::from_ptr(name) };
let reconstituted = cstr.to_string_lossy();
if let Some(output_buffer) = unsafe { output_buffer.as_mut() } {
writeln!(output_buffer, "{} = {}i64", reconstituted, value).unwrap();
} else {
println!("{} = {}", reconstituted, value);
}
}
impl RuntimeFunctions {
/// Generate a new runtime function table for the given platform, and
/// declare them within the provided Cranelift module.
///
/// Note that this is very conservative: it assumes that your module
/// will want to use every runtime function. Unless the Cranelift object
/// builder is smart, this might inject a bunch of references (and thus
/// linker requirements) that aren't actually needed by your program.
///
/// Then again, right now there's exactly one runtime function, so ...
/// not a big deal.
pub fn new<M: Module>(platform: &Triple, module: &mut M) -> ModuleResult<RuntimeFunctions> {
let mut builtin_functions = HashMap::new();
let _referenced_functions = Vec::new();
let string_param = AbiParam::new(types::I64);
let int64_param = AbiParam::new(types::I64);
// declare print for Cranelift; it's something we're going to import
// into the current module (it's compiled separately), and takes two
// strings and an integer. (Which ... turn out to all be the same
// underlying type, which is weird but the way it is.)
let print_id = module.declare_function(
"print",
Linkage::Import,
@@ -47,14 +54,19 @@ impl RuntimeFunctions {
call_conv: CallConv::triple_default(platform),
},
)?;
// Toss this function in our internal dictionary, as well.
builtin_functions.insert("print".to_string(), print_id);
Ok(RuntimeFunctions {
builtin_functions,
_referenced_functions,
})
Ok(RuntimeFunctions { builtin_functions })
}
/// Include the named runtime function into the current Function context.
///
/// This is necessary for every runtime function reference within each
/// function. The returned `FuncRef` can be used in `call` invocations.
/// The only reason for this function to error is if you pass a name that
/// the runtime isn't familiar with.
pub fn include_runtime_function<M: Module>(
&self,
name: &str,
@@ -69,7 +81,30 @@ impl RuntimeFunctions {
}
}
/// Register live, local versions of the runtime functions into the JIT.
///
/// Note that these implementations are *not* the same as the ones defined
/// in `CARGO_MANIFEST_DIR/runtime/`, for ... reasons. It might be a good
/// change, in the future, to find a way to unify these implementations into
/// one; both to reduce the chance that they deviate, and to reduce overall
/// maintenance burden.
pub fn register_jit_implementations(builder: &mut JITBuilder) {
builder.symbol("print", runtime_print as *const u8);
}
}
// Print! This implementation is used in the JIT compiler, to actually print data. We
// use the `output_buffer` argument as an aid for testing; if it's non-NULL, it's a string
// we extend with the output, so that multiple JIT'd `Program`s can run concurrently
// without stomping over each other's output. If `output_buffer` is NULL, we just print
// to stdout.
extern "C" fn runtime_print(output_buffer: *mut String, name: *const i8, value: i64) {
// SAFETY: `name` is assumed to be a valid, NUL-terminated C string; the strings the
// compiled code passes here come from `Backend::define_string`, which appends a '\0'.
let cstr = unsafe { CStr::from_ptr(name) };
// Lossy conversion avoids panicking if the name somehow contains non-UTF-8 bytes.
let reconstituted = cstr.to_string_lossy();
// SAFETY: `output_buffer` is either null or the pointer handed out by
// `Backend::output_buffer_ptr`, which points at a `String` owned by the live
// `Backend`; `as_mut` maps the null case to `None`.
if let Some(output_buffer) = unsafe { output_buffer.as_mut() } {
// NOTE(review): the buffered form appends an `i64` suffix to the value while the
// stdout form below does not -- confirm whether this divergence is intentional.
writeln!(output_buffer, "{} = {}i64", reconstituted, value).unwrap();
} else {
println!("{} = {}", reconstituted, value);
}
}

View File

@@ -1,17 +1,7 @@
use clap::Parser;
use codespan_reporting::diagnostic::Diagnostic;
use codespan_reporting::files::SimpleFiles;
use codespan_reporting::term;
use codespan_reporting::term::termcolor::{ColorChoice, StandardStream};
use cranelift_object::object;
use ngr::backend::Backend;
use ngr::backend::BackendError;
use ngr::ir::Program as IR;
use ngr::syntax::{ParserError, Program as Syntax};
use target_lexicon::Triple;
use thiserror::Error;
/// Clap is great! Even though we don't have many command line arguments
/// yet, this is just really neat.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct CommandLineArguments {
@@ -23,76 +13,14 @@ struct CommandLineArguments {
file: String,
}
/// The top-level error type for the compiler driver.
///
/// Each variant wraps the failure from one stage of the pipeline so that
/// `main` can report any of them uniformly as a diagnostic.
#[derive(Debug, Error)]
enum MainError {
    /// Code generation in the Cranelift backend failed.
    #[error(transparent)]
    Backend(#[from] BackendError),
    /// The source file could not be parsed.
    #[error("Parser error")]
    ParserError(#[from] ParserError),
    /// Reading the input or writing the output failed.
    #[error("IO error")]
    IoError(#[from] std::io::Error),
    /// Cranelift's object writer rejected the module.
    #[error("write error")]
    WriteError(#[from] object::write::Error),
}
impl From<MainError> for Diagnostic<usize> {
fn from(value: MainError) -> Self {
match value {
MainError::Backend(be) => be.into(),
MainError::ParserError(pe) => (&pe).into(),
MainError::IoError(e) => Diagnostic::error().with_message(format!("IO error: {}", e)),
MainError::WriteError(e) => {
Diagnostic::error().with_message(format!("Module write error: {}", e))
}
}
}
}
/// Run the whole pipeline for the file named on the command line: parse,
/// validate, lower to IR, compile, and write the object file to disk.
///
/// Validation failures are reported to stderr and yield `Ok(())` — they've
/// already been shown to the user. Only infrastructure failures (parse, IO,
/// backend) surface as `Err`.
fn compile(file_database: &mut SimpleFiles<String, String>) -> Result<(), MainError> {
    let args = CommandLineArguments::parse();
    let syntax = Syntax::parse_file(file_database, &args.file)?;
    // Validation gives us errors (fatal) and warnings (advisory); we show
    // both, but only errors stop the build.
    let (errors, warnings) = syntax.validate();
    let had_errors = !errors.is_empty();
    let writer = StandardStream::stderr(ColorChoice::Auto);
    let config = codespan_reporting::term::Config::default();
    let messages = errors
        .into_iter()
        .map(Into::into)
        .chain(warnings.into_iter().map(Into::into));
    for message in messages {
        term::emit(&mut writer.lock(), &config, file_database, &message).unwrap();
    }
    if had_errors {
        return Ok(());
    }
    // Lower to IR and hand everything to Cranelift for the host target.
    let ir = IR::from(syntax.simplify());
    let mut backend = Backend::object_file(Triple::host())?;
    backend.compile_function("gogogo", ir)?;
    let bytes = backend.bytes()?;
    let output_path = args.output.unwrap_or_else(|| "output.o".to_string());
    std::fs::write(output_path, bytes)?;
    Ok(())
}
fn main() {
let mut file_database = SimpleFiles::new();
let args = CommandLineArguments::parse();
let mut compiler = ngr::Compiler::default();
match compile(&mut file_database) {
Ok(()) => {}
Err(e) => {
let writer = StandardStream::stderr(ColorChoice::Auto);
let config = codespan_reporting::term::Config::default();
let output_file = args.output.unwrap_or("output.o".to_string());
term::emit(
&mut writer.lock(),
&config,
&file_database,
&Diagnostic::from(e),
)
.unwrap();
}
if let Some(bytes) = compiler.compile(&args.file) {
std::fs::write(&output_file, bytes)
.unwrap_or_else(|x| eprintln!("Could not write to file {}: {}", output_file, x));
}
}

View File

@@ -1,130 +1,11 @@
use codespan_reporting::diagnostic::Diagnostic;
use codespan_reporting::files::SimpleFiles;
use codespan_reporting::term::{self, Config};
use cranelift_jit::JITModule;
use cranelift_module::ModuleError;
use ngr::backend::{Backend, BackendError};
use ngr::ir::Program as IR;
use ngr::syntax::{Location, ParserError, Statement};
use pretty::termcolor::{ColorChoice, StandardStream, WriteColor};
use ngr::backend::BackendError;
use rustyline::error::ReadlineError;
use rustyline::DefaultEditor;
use std::collections::HashMap;
/// All of the state for one interactive REPL session.
pub struct RunLoop<'a> {
    // Every line the user enters is registered here as its own "file", so
    // diagnostics can point back at the exact input span.
    file_database: SimpleFiles<&'a str, String>,
    // The JIT backend that compiles and runs each entered statement.
    jitter: Backend<JITModule>,
    // Where each variable was bound; handed to `validate` on every statement.
    variable_binding_sites: HashMap<String, Location>,
    // Monotonic counter threaded through `simplify`; presumably used to
    // generate fresh temporary names — confirm in the simplifier.
    gensym_index: usize,
    // Sink for diagnostics (usually a standard stream).
    writer: &'a mut dyn WriteColor,
    // codespan-reporting rendering configuration.
    config: Config,
}
/// Everything that can go wrong while processing a single REPL line.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, thiserror::Error)]
enum REPLError {
    /// The entered statement failed to parse.
    #[error("Error parsing statement: {0}")]
    Parser(#[from] ParserError),
    /// The backend failed while JIT-compiling the statement.
    #[error("JIT error: {0}")]
    JIT(#[from] BackendError),
    /// A low-level Cranelift module error (e.g. during finalization).
    #[error("Internal cranelift error: {0}")]
    Cranelift(#[from] ModuleError),
    /// codespan-reporting itself failed while emitting a diagnostic.
    #[error(transparent)]
    Reporting(#[from] codespan_reporting::files::Error),
}
impl From<REPLError> for Diagnostic<usize> {
    /// Turn a REPL error into something codespan-reporting can display.
    ///
    /// Parser and JIT errors carry their own diagnostic conversions; the
    /// remaining cases are internal problems, so they render as bugs.
    fn from(value: REPLError) -> Self {
        match value {
            REPLError::Parser(parser_error) => Diagnostic::from(&parser_error),
            REPLError::JIT(jit_error) => Diagnostic::from(jit_error),
            REPLError::Cranelift(cranelift_error) => {
                Diagnostic::bug().with_message(format!("{}", cranelift_error))
            }
            REPLError::Reporting(reporting_error) => {
                Diagnostic::bug().with_message(format!("{}", reporting_error))
            }
        }
    }
}
impl<'a> RunLoop<'a> {
    /// Build a fresh REPL session that writes diagnostics to `writer`.
    ///
    /// The only failure mode is the JIT backend failing to construct.
    pub fn new(writer: &'a mut dyn WriteColor, config: Config) -> Result<Self, BackendError> {
        Ok(RunLoop {
            file_database: SimpleFiles::new(),
            jitter: Backend::jit(None)?,
            variable_binding_sites: HashMap::new(),
            gensym_index: 1,
            writer,
            config,
        })
    }

    /// Render one diagnostic through this session's writer and config.
    fn emit_diagnostic(
        &mut self,
        diagnostic: Diagnostic<usize>,
    ) -> Result<(), codespan_reporting::files::Error> {
        term::emit(self.writer, &self.config, &self.file_database, &diagnostic)
    }

    /// Process one line of user input, reporting (rather than returning)
    /// any error as a diagnostic.
    fn process_input(&mut self, line_no: usize, command: String) {
        if let Err(err) = self.process(line_no, command) {
            if let Err(e) = self.emit_diagnostic(Diagnostic::from(err)) {
                eprintln!(
                    "WOAH! System having trouble printing error messages. This is very bad. ({})",
                    e
                );
            }
        }
    }

    /// Parse, validate, JIT-compile, and run a single statement.
    fn process(&mut self, line_no: usize, command: String) -> Result<(), REPLError> {
        // Each input line becomes its own "file" so diagnostics can cite
        // spans within it.
        let entry = self.file_database.add("entry", command);
        let source = self
            .file_database
            .get(entry)
            .expect("entry exists")
            .source();
        let syntax = Statement::parse(entry, source)?;
        // if this is a variable binding, and we've never defined this variable before,
        // we should tell cranelift about it. this is optimistic; if we fail to compile,
        // then we won't use this definition until someone tries again.
        if let Statement::Binding(_, ref name, _) = syntax {
            if !self.variable_binding_sites.contains_key(name.as_str()) {
                self.jitter.define_string(name)?;
                self.jitter.define_variable(name.clone())?;
            }
        };
        // Validation may produce warnings (advisory) and errors (fatal);
        // we emit both, but only errors stop us from running the line.
        let (mut errors, mut warnings) = syntax.validate(&mut self.variable_binding_sites);
        let stop = !errors.is_empty();
        let messages = errors
            .drain(..)
            .map(Into::into)
            .chain(warnings.drain(..).map(Into::into));
        for message in messages {
            self.emit_diagnostic(message)?;
        }
        if stop {
            return Ok(());
        }
        // Lower to IR and compile under a unique per-line function name.
        let ir = IR::from(syntax.simplify(&mut self.gensym_index));
        let name = format!("line{}", line_no);
        let function_id = self.jitter.compile_function(&name, ir)?;
        self.jitter.module.finalize_definitions()?;
        let compiled_bytes = self.jitter.bytes(function_id);
        // NOTE(review): this transmute assumes the JIT'd entry point takes no
        // arguments and returns nothing — confirm against the backend's ABI.
        let compiled_function = unsafe { std::mem::transmute::<_, fn() -> ()>(compiled_bytes) };
        compiled_function();
        Ok(())
    }
}
fn main() -> Result<(), BackendError> {
let mut editor = DefaultEditor::new().expect("rustyline works");
let mut line_no = 0;
let mut writer = StandardStream::stdout(ColorChoice::Auto);
let config = codespan_reporting::term::Config::default();
let mut state = RunLoop::new(&mut writer, config)?;
let mut state = ngr::REPL::default();
println!("No Good Reason, the Interpreter!");
loop {
@@ -135,18 +16,30 @@ fn main() -> Result<(), BackendError> {
":quit" => break,
_ => state.process_input(line_no, command),
},
// it's not clear to me what this could be, but OK
Err(ReadlineError::Io(e)) => {
eprintln!("IO error: {}", e);
break;
}
// Control-D and Control-C
Err(ReadlineError::Eof) => break,
Err(ReadlineError::Interrupted) => break,
// For some reason this doesn't exist on Windows. I also don't quite know
// what would cause this, but ...
#[cfg(not(windows))]
Err(ReadlineError::Errno(e)) => {
eprintln!("Unknown syscall error: {}", e);
break;
}
// We don't actually do any reflow-ing if we change the terminal size,
// so we can just ignore this.
Err(ReadlineError::WindowResized) => continue,
// Why on earth are there so many error types?
Err(e) => {
eprintln!("Unknown internal error: {}", e);
break;

157
src/compiler.rs Normal file
View File

@@ -0,0 +1,157 @@
use crate::backend::Backend;
use crate::ir::Program as IR;
use crate::syntax::Program as Syntax;
use codespan_reporting::{
diagnostic::Diagnostic,
files::SimpleFiles,
term::{self, Config},
};
use pretty::termcolor::{ColorChoice, StandardStream};
use target_lexicon::Triple;
/// A high-level compiler for NGR programs.
///
/// This object can be built once, and then re-used many times to build multiple
/// files. For most users, the [`Default`] implementation should be sufficient;
/// it will use `stderr` for warnings and errors, with default colors based on
/// what we discover from the terminal. For those who want to provide alternate
/// outputs, though, the `Compiler::new` constructor is available.
pub struct Compiler {
    // Every file this compiler has seen, kept so diagnostics can cite spans.
    file_database: SimpleFiles<String, String>,
    // Where warnings and errors are written.
    console: StandardStream,
    // Rendering options (colors, styles) for diagnostics.
    console_config: Config,
}
impl Default for Compiler {
    /// Build a compiler that reports to `stderr`, auto-detecting whether the
    /// terminal supports color, with default diagnostic rendering.
    fn default() -> Self {
        Self::new(StandardStream::stderr(ColorChoice::Auto), Config::default())
    }
}
impl Compiler {
    /// Create a new compiler object.
    ///
    /// This object can be re-used to compile as many files as you like.
    /// Use this function if you want to control the output console and its
    /// configuration yourself; otherwise the `Default` implementation gives
    /// you `stderr` with reasonable settings.
    pub fn new(console: StandardStream, console_config: Config) -> Self {
        Compiler {
            file_database: SimpleFiles::new(),
            console,
            console_config,
        }
    }

    /// Compile the given file, returning the object file as a vector of bytes.
    ///
    /// This function may create output, via the console configured with this
    /// `Compiler` object. If the compilation fails for any reason, returns
    /// `None`.
    pub fn compile<P: AsRef<str>>(&mut self, input_file: P) -> Option<Vec<u8>> {
        match self.compile_internal(input_file.as_ref()) {
            Ok(result) => result,
            Err(error) => {
                self.emit(error.into());
                None
            }
        }
    }

    /// The actual meat of the compilation chain, hidden from the user because
    /// the type is kind of unpleasant.
    ///
    /// There are three possible outcomes:
    ///
    /// * Fundamental errors (bad file, IO oddities) return `Err`.
    /// * Validation errors — the program is semantically wrong — return
    ///   `Ok(None)`, because the diagnostics have already been emitted.
    /// * Success returns `Ok(Some(bytes))`, the compiled object file.
    fn compile_internal(&mut self, input_file: &str) -> Result<Option<Vec<u8>>, CompilerError> {
        // Parse the file into our syntax AST; a parse failure is fatal and
        // propagates as `Err` via `?`.
        let syntax = Syntax::parse_file(&mut self.file_database, input_file)?;
        // Validate the AST. Warnings are advisory; errors mean we must stop
        // after reporting everything.
        let (errors, warnings) = syntax.validate();
        let had_errors = !errors.is_empty();
        let messages = errors
            .into_iter()
            .map(Into::into)
            .chain(warnings.into_iter().map(Into::into));
        // Show the user every message we collected — warnings *and* errors.
        for message in messages {
            self.emit(message);
        }
        // Errors already went to the console, so there's nothing further to
        // say: `Ok(None)` rather than another `Err`.
        if had_errors {
            return Ok(None);
        }
        // Lower the validated syntax into IR, then hand it to Cranelift to
        // produce an object file for the host machine.
        let ir = IR::from(syntax);
        let mut backend = Backend::object_file(Triple::host())?;
        backend.compile_function("gogogo", ir)?;
        Ok(Some(backend.bytes()?))
    }

    /// Emit a diagnostic to the configured console.
    ///
    /// Just a shorthand that saves a pile of boilerplate elsewhere in the
    /// object.
    fn emit(&mut self, diagnostic: Diagnostic<usize>) {
        term::emit(
            &mut self.console.lock(),
            &self.console_config,
            &self.file_database,
            &diagnostic,
        )
        .expect("codespan reporting term::emit works");
    }
}
// This is just a handy type that we can convert things into; it's not
// exposed outside this module, and doesn't actually do much of interest.
// Each variant wraps the error from one stage of the pipeline.
#[derive(Debug, thiserror::Error)]
enum CompilerError {
    /// Code generation failed in the Cranelift backend.
    #[error(transparent)]
    Backend(#[from] crate::backend::BackendError),
    /// The input file failed to parse.
    #[error(transparent)]
    ParserError(#[from] crate::syntax::ParserError),
    /// Reading the source file (or other IO) failed.
    #[error(transparent)]
    IoError(#[from] std::io::Error),
    /// Cranelift could not serialize the compiled module.
    #[error(transparent)]
    WriteError(#[from] cranelift_object::object::write::Error),
}
// Since we're going to use codespan to report pretty much all errors,
// this just passes through most of the errors, or makes simple versions
// of `Diagnostic` for those that we don't have existing `From`s.
impl From<CompilerError> for Diagnostic<usize> {
fn from(value: CompilerError) -> Self {
match value {
CompilerError::Backend(be) => be.into(),
CompilerError::ParserError(pe) => (&pe).into(),
CompilerError::IoError(e) => {
Diagnostic::error().with_message(format!("IO error: {}", e))
}
CompilerError::WriteError(e) => {
Diagnostic::error().with_message(format!("Module write error: {}", e))
}
}
}
}

View File

@@ -1,3 +1,38 @@
//! Helpful functions for evaluating NGR programs.
//!
//! Look, this is a compiler, and so you might be asking why it has a bunch of
//! stuff in it to help with writing interpreters. Well, the answer is simple:
//! testing. It's really nice to know that if you start with a program that
//! does a thing, and then you muck with it, you end up with a program that does
//! the exact same thing. If you talk to people who think about language
//! semantics, they'll call this "observational equivalence": maybe the two
//! programs don't do 100% the same things in the same order, but you shouldn't
//! be able to observe the difference ... at least, not without a stopwatch,
//! memory profilers, etc.
//!
//! The actual evaluators for our various syntaxes are hidden in `eval` functions
//! of the various ASTs. It's nice to have them "next to" the syntax that way, so
//! that we just edit stuff in one part of the source tree at a time. This module,
//! then, just contains some things that are generally helpful across all the
//! interpreters we've written.
//!
//! In particular, this module helps with:
//!
//! * Defining a common error type -- [`EvalError`] -- that we can reasonably
//! compare. It's nice to compare errors, here, because we want to know that
//! if a program used to fail, it will still fail after we change it, and
//! fail in the exact same way.
//! * Defining a notion of a binding environment: [`EvalEnvironment`]. This
//! will help us keep track of variables bound in our program, as we run it.
//! * Defining a notion of a runtime value: [`Value`]. Yes, this is the
//! umpteenth time that we're re-defining basically the same enumeration
//! with exactly the same name, but it's nice to have it separated so that
//! we don't confuse them.
//! * Finally, this module implements all of our primitive functions, as the
//! [`Value::calculate`] function. This is just a nice abstraction boundary,
//! because the implementation of some parts of these primitives is really
//! awful to look at.
//!
mod env;
mod primop;
mod value;
@@ -9,6 +44,13 @@ pub use value::Value;
use crate::backend::BackendError;
/// All of the errors that can happen trying to evaluate an NGR program.
///
/// This is yet another standard [`thiserror::Error`] type, but with the
/// caveat that it implements [`PartialEq`] even though some of its
/// constituent members don't. It does so through the very sketchy mechanism
/// of converting those errors to strings and then seeing if they're the
/// same.
#[derive(Debug, thiserror::Error)]
pub enum EvalError {
#[error(transparent)]
@@ -18,15 +60,15 @@ pub enum EvalError {
#[error(transparent)]
Backend(#[from] BackendError),
#[error("IO error: {0}")]
IO(String),
IO(#[from] std::io::Error),
#[error(transparent)]
Module(#[from] ModuleError),
}
impl From<std::io::Error> for EvalError {
fn from(value: std::io::Error) -> Self {
EvalError::IO(value.to_string())
}
#[error("Linker error: {0}")]
Linker(String),
#[error("Program exitted with status {0}")]
ExitCode(std::process::ExitStatus),
#[error("Unexpected output at runtime: {0}")]
RuntimeOutput(String),
}
impl PartialEq for EvalError {
@@ -48,7 +90,7 @@ impl PartialEq for EvalError {
},
EvalError::IO(a) => match other {
EvalError::IO(b) => a == b,
EvalError::IO(b) => a.to_string() == b.to_string(),
_ => false,
},
@@ -56,6 +98,21 @@ impl PartialEq for EvalError {
EvalError::Module(b) => a.to_string() == b.to_string(),
_ => false,
},
EvalError::Linker(a) => match other {
EvalError::Linker(b) => a == b,
_ => false,
},
EvalError::ExitCode(a) => match other {
EvalError::ExitCode(b) => a == b,
_ => false,
},
EvalError::RuntimeOutput(a) => match other {
EvalError::RuntimeOutput(b) => a == b,
_ => false,
},
}
}
}

View File

@@ -2,15 +2,28 @@ use crate::eval::Value;
use internment::ArcIntern;
use std::sync::Arc;
/// An evaluation environment, which maps variable names to their
/// current values.
///
/// One key difference between `EvalEnvironment` and `HashMap` is that
/// `EvalEnvironment` uses an `extend` mechanism to add keys, rather
/// than an `insert`. This difference allows you to add mappings for
/// a subcomputation while still retaining the old version without those
/// keys, which is really handy for implementing variable scoping.
pub struct EvalEnvironment {
    // The mappings themselves, stored as an immutable chain of
    // (name, value) links; sharing the tail via `Arc` is what makes
    // `extend` cheap and non-destructive.
    inner: Arc<EvalEnvInternal>,
}
pub enum EvalEnvInternal {
enum EvalEnvInternal {
Empty,
Value(ArcIntern<String>, Value, Arc<EvalEnvInternal>),
}
/// Errors that can happen when looking up a variable.
///
/// This enumeration may be extended in the future, depending on if we
/// get more subtle with our keys. But for now, this is just a handy
/// way to make lookup failures be `thiserror::Error`s.
#[derive(Clone, Debug, PartialEq, thiserror::Error)]
pub enum LookupError {
#[error("Could not find variable '{0}' in environment")]
@@ -24,28 +37,38 @@ impl Default for EvalEnvironment {
}
impl EvalEnvironment {
    /// Create a new environment containing no bindings at all.
    pub fn empty() -> Self {
        Self {
            inner: Arc::new(EvalEnvInternal::Empty),
        }
    }

    /// Extend the environment with a new mapping.
    ///
    /// Note the types: this returns a *new* `EvalEnvironment` containing the
    /// extra binding, while `self` is left untouched — which is exactly what
    /// variable scoping wants.
    pub fn extend(&self, name: ArcIntern<String>, value: Value) -> Self {
        let chained = EvalEnvInternal::Value(name, value, Arc::clone(&self.inner));
        Self {
            inner: Arc::new(chained),
        }
    }

    /// Look up a variable in the environment, returning an error if it isn't there.
    pub fn lookup(&self, n: ArcIntern<String>) -> Result<Value, LookupError> {
        self.inner.lookup(n)
    }
}
impl EvalEnvInternal {
/// Look up a variable in the environment, returning an error if it isn't there.
fn lookup(&self, n: ArcIntern<String>) -> Result<Value, LookupError> {
match self {
// if this is an empty dictionary, never mind, couldn't find it
EvalEnvInternal::Empty => Err(LookupError::CouldNotFind(n)),
// is this the key we have right here? if yes, return our value
EvalEnvInternal::Value(name, value, _) if *name == n => Ok(value.clone()),
// otherwise, recurse up our chain of environments
EvalEnvInternal::Value(_, _, rest) => rest.lookup(n),
}
}
@@ -70,6 +93,9 @@ mod tests {
assert!(tester.lookup(arced("baz")).is_err());
}
// added this test to make sure that our nesting property works propertly.
// it's not a big deal now, but it'll be really handy later when we add any
// kind of variable scoping.
#[test]
fn nested() {
let tester = EvalEnvironment::default();

View File

@@ -1,19 +1,39 @@
use crate::eval::value::Value;
/// Errors that can occur running primitive operations in the evaluators.
#[derive(Clone, Debug, PartialEq, thiserror::Error)]
pub enum PrimOpError {
    /// Checked arithmetic failed: the operation would overflow or underflow
    /// the underlying machine type. The payload names the operator that
    /// failed.
    #[error("Math error (underflow or overflow) computing {0} operator")]
    MathFailure(&'static str),
    /// This particular variant covers the case in which a primitive
    /// operator takes two arguments that are supposed to be the same,
    /// but they differ. (So, like, all the math operators.)
    #[error("Type mismatch ({1} vs {2}) computing {0} operator")]
    TypeMismatch(String, Value, Value),
    /// This variant covers when an operator must take a particular
    /// type, but the user has provided a different one.
    #[error("Bad type for operator {0}: {1}")]
    BadTypeFor(&'static str, Value),
    /// Probably obvious from the name, but just to be very clear: this
    /// happens when you pass three arguments to a two argument operator,
    /// etc. Technically that's a type error of some sort, but we split
    /// it out.
    #[error("Illegal number of arguments for {0}: {1} arguments found")]
    BadArgCount(String, usize),
    /// The named operator isn't one the evaluator knows about at all.
    #[error("Unknown primitive operation {0}")]
    UnknownPrimOp(String),
}
// Implementing primitives in an interpreter like this is *super* tedious,
// and the only way to make it even somewhat manageable is to use macros.
// This particular macro works for binary operations, and assumes that
// you've already worked out that the `calculate` call provided two arguments.
//
// In those cases, it will rul the operations we know about, and error if
// it doesn't.
//
// This macro then needs to be instantiated for every type, which is super
// fun.
macro_rules! run_op {
($op: ident, $left: expr, $right: expr) => {
match $op {
@@ -23,15 +43,15 @@ macro_rules! run_op {
.map(Into::into),
"-" => $left
.checked_sub($right)
.ok_or(PrimOpError::MathFailure("+"))
.ok_or(PrimOpError::MathFailure("-"))
.map(Into::into),
"*" => $left
.checked_mul($right)
.ok_or(PrimOpError::MathFailure("+"))
.ok_or(PrimOpError::MathFailure("*"))
.map(Into::into),
"/" => $left
.checked_div($right)
.ok_or(PrimOpError::MathFailure("+"))
.ok_or(PrimOpError::MathFailure("/"))
.map(Into::into),
_ => Err(PrimOpError::UnknownPrimOp($op.to_string())),
}
@@ -41,6 +61,8 @@ macro_rules! run_op {
impl Value {
fn binary_op(operation: &str, left: &Value, right: &Value) -> Result<Value, PrimOpError> {
match left {
// for now we only have one type, but in the future this is
// going to be very irritating.
Value::I64(x) => match right {
Value::I64(y) => run_op!(operation, x, *y),
// _ => Err(PrimOpError::TypeMismatch(
@@ -52,6 +74,14 @@ impl Value {
}
}
/// Calculate the result of running the given primitive on the given arguments.
///
/// This can cause errors in a whole mess of ways, so be careful about your
/// inputs. For example, addition only works when the two values have the exact
/// same type, so expect an error if you try to do so. In addition, this
/// implementation catches and raises an error on overflow or underflow, so
/// its worth being careful to make sure that your inputs won't cause either
/// condition.
pub fn calculate(operation: &str, values: Vec<Value>) -> Result<Value, PrimOpError> {
if values.len() == 2 {
Value::binary_op(operation, &values[0], &values[1])

View File

@@ -1,5 +1,10 @@
use std::fmt::Display;
/// Values in the interpreter.
///
/// Yes, this is yet another definition of a structure called `Value`, which
/// are almost entirely identical. However, it's nice to have them separated
/// by type so that we don't mix them up.
#[derive(Clone, Debug, PartialEq)]
pub enum Value {
I64(i64),

View File

@@ -1,3 +1,17 @@
//! The middle of the compiler: analysis, simplification, optimization.
//!
//! For the moment, this module doesn't do much besides define an intermediate
//! representation for NGR programs that is a little easier to work with then
//! the structures we've built from the actual user syntax. For example, in the
//! IR syntax, function calls are simplified so that all their arguments are
//! either variables or constants, which can make reasoning about programs
//! (and implicit temporary variables) quite a bit easier.
//!
//! For the foreseeable future, this module will likely remain mostly empty
//! besides definitions, as we'll likely want to focus on just processing /
//! validating syntax, and then figuring out how to turn it into Cranelift
//! and object code. After that point, however, this will be the module to
//! come to for analysis and optimization work.
mod ast;
mod eval;
mod from_syntax;

View File

@@ -1,3 +1,4 @@
use crate::syntax::Location;
use internment::ArcIntern;
use pretty::{DocAllocator, Pretty};
use proptest::{
@@ -5,13 +6,28 @@ use proptest::{
strategy::{BoxedStrategy, Strategy},
};
use crate::syntax::Location;
/// We're going to represent variables as interned strings.
///
/// These should be fast enough for comparison that it's OK, since it's going to end up
/// being pretty much the pointer to the string.
type Variable = ArcIntern<String>;
/// The representation of a program within our IR. For now, this is exactly one file.
///
/// In addition, for the moment there's not really much of interest to hold here besides
/// the list of statements read from the file. Order is important. In the future, you
/// could imagine caching analysis information in this structure.
///
/// `Program` implements both [`Pretty`] and [`Arbitrary`]. The former should be used
/// to print the structure whenever possible, especially if you value your or your
/// user's time. The latter is useful for testing that conversions of `Program` retain
/// their meaning. All `Program`s generated through [`Arbitrary`] are guaranteed to be
/// syntactically valid, although they may contain runtime issue like over- or underflow.
#[derive(Debug)]
pub struct Program {
pub statements: Vec<Statement>,
// For now, a program is just a vector of statements. In the future, we'll probably
// extend this to include a bunch of other information, but for now: just a list.
pub(crate) statements: Vec<Statement>,
}
impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Program
@@ -23,6 +39,8 @@ where
let mut result = allocator.nil();
for stmt in self.statements.iter() {
// there's probably a better way to do this, rather than constantly
// adding to the end, but this works.
result = result
.append(stmt.pretty(allocator))
.append(allocator.text(";"))
@@ -39,11 +57,21 @@ impl Arbitrary for Program {
fn arbitrary_with(args: Self::Parameters) -> Self::Strategy {
crate::syntax::Program::arbitrary_with(args)
.prop_map(|x| Program::from(x.simplify()))
.prop_map(Program::from)
.boxed()
}
}
/// The representation of a statement in the language.
///
/// For now, this is either a binding site (`x = 4`) or a print statement
/// (`print x`). Someday, though, more!
///
/// As with `Program`, this type implements [`Pretty`], which should
/// be used to display the structure whenever possible. It does not
/// implement [`Arbitrary`], though, mostly because it's slightly
/// complicated to do so.
///
#[derive(Debug)]
pub enum Statement {
Binding(Location, Variable, Expression),
@@ -71,6 +99,18 @@ where
}
}
/// The representation of an expression.
///
/// Note that expressions, like everything else in this syntax tree,
/// supports [`Pretty`], and it's strongly encouraged that you use
/// that trait/module when printing these structures.
///
/// Also, Expressions at this point in the compiler are explicitly
/// defined so that they are *not* recursive. By this point, if an
/// expression requires some other data (like, for example, invoking
/// a primitive), any subexpressions have been bound to variables so
/// that the referenced data will always either be a constant or a
/// variable reference.
#[derive(Debug)]
pub enum Expression {
Value(Location, Value),
@@ -107,6 +147,12 @@ where
}
}
/// A type representing the primitives allowed in the language.
///
/// Having this as an enumeration avoids a lot of "this should not happen"
/// cases, but might prove to be cumbersome in the future. If that happens,
/// this may either become a more hierarchical enumeration, or we'll just
/// deal with the "this should not happen" cases.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Primitive {
Plus,
@@ -144,6 +190,11 @@ where
}
}
/// An expression that is always either a value or a reference.
///
/// This is the type used to guarantee that we don't nest expressions
/// at this level. Instead, expressions that take arguments take one
/// of these, which can only be a constant or a reference.
#[derive(Debug)]
pub enum ValueOrRef {
Value(Location, Value),
@@ -163,8 +214,23 @@ where
}
}
impl From<ValueOrRef> for Expression {
    /// Lift a restricted `ValueOrRef` into the general `Expression` type.
    ///
    /// This is always lossless: the two `ValueOrRef` cases map one-to-one
    /// onto the corresponding `Expression` constructors.
    fn from(value: ValueOrRef) -> Self {
        match value {
            ValueOrRef::Value(location, constant) => Expression::Value(location, constant),
            ValueOrRef::Ref(location, variable) => Expression::Reference(location, variable),
        }
    }
}
/// A constant in the IR.
#[derive(Debug)]
pub enum Value {
    /// A numerical constant.
    ///
    /// The optional argument is the base that was used by the user to input
    /// the number. By retaining it, we can ensure that if we need to print the
    /// number back out, we can do so in the form that the user entered it.
    // NOTE(review): `None` presumably means "no explicit base recorded"
    // (i.e. plain decimal) — confirm against the parser that produces these.
    Number(Option<u8>, i64),
}

View File

@@ -4,6 +4,10 @@ use crate::ir::{Expression, Program, Statement};
use super::{Primitive, ValueOrRef};
impl Program {
/// Evaluate the program, returning either an error or a string containing everything
/// the program printed out.
///
/// The print outs will be newline separated, with one print out per line.
pub fn eval(&self) -> Result<String, EvalError> {
let mut env = EvalEnvironment::empty();
let mut stdout = String::new();
@@ -39,6 +43,9 @@ impl Expression {
Expression::Primitive(_, op, args) => {
let mut arg_values = Vec::with_capacity(args.len());
// we implement primitive operations by first evaluating each of the
// arguments to the function, and then gathering up all the values
// produced.
for arg in args.iter() {
match arg {
ValueOrRef::Ref(_, n) => arg_values.push(env.lookup(n.clone())?),
@@ -48,6 +55,8 @@ impl Expression {
}
}
// and then finally we call `calculate` to run them. trust me, it's nice
// to not have to deal with all the nonsense hidden under `calculate`.
match op {
Primitive::Plus => Ok(Value::calculate("+", arg_values)?),
Primitive::Minus => Ok(Value::calculate("-", arg_values)?),
@@ -62,7 +71,7 @@ impl Expression {
#[test]
fn two_plus_three() {
let input = crate::syntax::Program::parse(0, "x = 2 + 3; print x;").expect("parse works");
let ir = Program::from(input.simplify());
let ir = Program::from(input);
let output = ir.eval().expect("runs successfully");
assert_eq!("x = 5i64\n", &output);
}
@@ -71,7 +80,7 @@ fn two_plus_three() {
fn lotsa_math() {
let input =
crate::syntax::Program::parse(0, "x = 2 + 3 * 10 / 5 - 1; print x;").expect("parse works");
let ir = Program::from(input.simplify());
let ir = Program::from(input);
let output = ir.eval().expect("runs successfully");
assert_eq!("x = 7i64\n", &output);
}

View File

@@ -1,82 +1,185 @@
use internment::ArcIntern;
use std::sync::atomic::AtomicUsize;
use crate::ir::ast as ir;
use crate::syntax::ast as syntax;
use crate::syntax;
use super::ValueOrRef;
impl From<syntax::Program> for ir::Program {
/// We implement the top-level conversion of a syntax::Program into an
/// ir::Program using just the standard `From::from`, because we don't
/// need to return any arguments and we shouldn't produce any errors.
/// Technically there's an `unwrap` deep under the hood that we could
/// float out, but the validator really should've made sure that never
/// happens, so we're just going to assume.
fn from(mut value: syntax::Program) -> Self {
ir::Program {
statements: value.statements.drain(..).map(Into::into).collect(),
let mut statements = Vec::new();
for stmt in value.statements.drain(..) {
statements.append(&mut stmt.simplify());
}
ir::Program { statements }
}
}
impl From<Vec<syntax::Statement>> for ir::Program {
    /// Build an IR program directly from a list of syntax statements,
    /// converting each statement in order.
    fn from(statements: Vec<syntax::Statement>) -> Self {
        let converted = statements.into_iter().map(Into::into).collect();
        ir::Program {
            statements: converted,
        }
    }
}
impl From<syntax::Statement> for ir::Statement {
impl From<syntax::Statement> for ir::Program {
/// One interesting thing about this conversion is that there isn't
/// a natural translation from syntax::Statement to ir::Statement,
/// because the syntax version can have nested expressions and the
/// IR version can't.
///
/// As a result, we can naturally convert a syntax::Statement into
/// an ir::Program, because we can allow the additional binding
/// sites to be generated, instead. And, bonus, it turns out that
/// this is what we wanted anyways.
fn from(value: syntax::Statement) -> Self {
match value {
syntax::Statement::Binding(loc, name, expr) => {
ir::Statement::Binding(loc, ArcIntern::from(name), ir::Expression::from(expr))
}
syntax::Statement::Print(loc, name) => ir::Statement::Print(loc, ArcIntern::from(name)),
ir::Program {
statements: value.simplify(),
}
}
}
impl From<syntax::Expression> for ir::Expression {
fn from(value: syntax::Expression) -> Self {
match value {
syntax::Expression::Primitive(loc, name, mut exprs) => ir::Expression::Primitive(
impl syntax::Statement {
/// Simplify a syntax::Statement into a series of ir::Statements.
///
/// The reason this function is one-to-many is because we may have to
/// introduce new binding sites in order to avoid having nested
/// expressions. Nested expressions, like `(1 + 2) * 3`, are allowed
/// in syntax::Expression but are expressly *not* allowed in
/// ir::Expression. So this pass converts them into bindings, like
/// this:
///
/// x = (1 + 2) * 3;
///
/// ==>
///
/// x:1 = 1 + 2;
/// x:2 = x:1 * 3;
/// x = x:2
///
/// Thus ensuring that things are nice and simple. Note that the
/// binding of `x:2` is not, strictly speaking, necessary, but it
/// makes the code below much easier to read.
fn simplify(self) -> Vec<ir::Statement> {
let mut new_statements = vec![];
match self {
// Print statements we don't have to do much with
syntax::Statement::Print(loc, name) => {
new_statements.push(ir::Statement::Print(loc, ArcIntern::new(name)))
}
// Bindings, however, may involve a single expression turning into
// a series of statements and then an expression.
syntax::Statement::Binding(loc, name, value) => {
let (mut prereqs, new_value) = value.rebind(&name);
new_statements.append(&mut prereqs);
new_statements.push(ir::Statement::Binding(
loc,
ir::Primitive::try_from(name.as_str()).unwrap(),
exprs.drain(..).map(Into::into).collect(),
),
syntax::Expression::Reference(loc, name) => {
ir::Expression::Reference(loc, ArcIntern::from(name))
}
syntax::Expression::Value(loc, value) => {
ir::Expression::Value(loc, ir::Value::from(value))
ArcIntern::new(name),
new_value.into(),
))
}
}
new_statements
}
}
impl From<syntax::Expression> for ir::ValueOrRef {
fn from(value: syntax::Expression) -> Self {
match value {
syntax::Expression::Primitive(loc, _, _) => {
panic!("{:?}: couldn't convert to valueorref", loc)
impl syntax::Expression {
/// This actually does the meat of the simplification work, here, by rebinding
/// any nested expressions into their own variables. We have this return
/// `ValueOrRef` in all cases because it makes for slightly less code; in the
/// case when we actually want an `Expression`, we can just use `into()`.
fn rebind(self, base_name: &str) -> (Vec<ir::Statement>, ir::ValueOrRef) {
match self {
// Values just convert in the obvious way, and require no prereqs
syntax::Expression::Value(loc, val) => (vec![], ValueOrRef::Value(loc, val.into())),
// Similarly, references just convert in the obvious way, and require
// no prereqs
syntax::Expression::Reference(loc, name) => {
(vec![], ValueOrRef::Ref(loc, ArcIntern::new(name)))
}
syntax::Expression::Reference(loc, var) => {
ir::ValueOrRef::Ref(loc, ArcIntern::new(var))
// Primitive expressions are where we do the real work.
syntax::Expression::Primitive(loc, prim, mut expressions) => {
// generate a fresh new name for the binding site we're going to
// introduce, basing the name on wherever we came from; so if this
// expression was bound to `x` originally, it might become `x:23`.
//
// gensym is guaranteed to give us a name that is unused anywhere
// else in the program.
let new_name = gensym(base_name);
let mut prereqs = Vec::new();
let mut new_exprs = Vec::new();
// here we loop through every argument, and recurse on the expressions
// we find. that will give us any new binding sites that *they* introduce,
// and a simple value or reference that we can use in our result.
for expr in expressions.drain(..) {
let (mut cur_prereqs, arg) = expr.rebind(new_name.as_str());
prereqs.append(&mut cur_prereqs);
new_exprs.push(arg);
}
syntax::Expression::Value(loc, val) => ir::ValueOrRef::Value(loc, val.into()),
// now we're going to use those new arguments to run the primitive, binding
// the results to the new variable we introduced.
let prim =
ir::Primitive::try_from(prim.as_str()).expect("is valid primitive function");
prereqs.push(ir::Statement::Binding(
loc.clone(),
new_name.clone(),
ir::Expression::Primitive(loc.clone(), prim, new_exprs),
));
// and finally, we can return all the new bindings, and a reference to
// the variable we just introduced to hold the value of the primitive
// invocation.
(prereqs, ValueOrRef::Ref(loc, new_name))
}
}
}
}
impl From<syntax::Value> for ir::Value {
fn from(x: syntax::Value) -> Self {
match x {
syntax::Value::Number(base, value) => ir::Value::Number(base, value),
fn from(value: syntax::Value) -> Self {
match value {
syntax::Value::Number(base, val) => ir::Value::Number(base, val),
}
}
}
impl From<String> for ir::Primitive {
    /// Convert a primitive's name (e.g. an operator string) into the IR
    /// primitive it denotes.
    ///
    /// # Panics
    ///
    /// Panics if the string is not a valid primitive name. The validator
    /// is expected to have rejected any program containing an unknown
    /// primitive before we get here, so a panic indicates a compiler bug.
    fn from(value: String) -> Self {
        value
            .try_into()
            .expect("validator guarantees primitive names are valid")
    }
}
/// Generate a fresh new name based on the given name.
///
/// The new name is guaranteed to be unique across the entirety of the
/// execution. This is achieved by using characters in the variable name
/// that would not be valid input (the angle brackets and colon), and by
/// including a counter that is incremented on every invocation.
fn gensym(name: &str) -> ArcIntern<String> {
    // A process-wide counter. We only need each fetch_add to yield a
    // distinct value; there is no other memory this needs to synchronize
    // with, so `Relaxed` ordering is sufficient (the increment itself is
    // still atomic and therefore unique).
    static COUNTER: AtomicUsize = AtomicUsize::new(0);
    let id = COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    ArcIntern::new(format!("<{}:{}>", name, id))
}
proptest::proptest! {
#[test]
fn translation_maintains_semantics(input: syntax::Program) {
let syntax_result = input.eval();
let ir = ir::Program::from(input.simplify());
let ir = ir::Program::from(input);
let ir_result = ir.eval();
assert_eq!(syntax_result, ir_result);
}

View File

@@ -3,6 +3,10 @@ use internment::ArcIntern;
use std::collections::HashSet;
impl Program {
/// Get the complete list of strings used within the program.
///
/// For the purposes of this function, strings are the variables used in
/// `print` statements.
pub fn strings(&self) -> HashSet<ArcIntern<String>> {
let mut result = HashSet::new();

View File

@@ -1,4 +1,75 @@
//! # NGR (No Good Reason) Compiler
//!
//! This is the top-level module for the NGR compiler; a compiler written
//! in Rust for no good reason. I may eventually try to turn this into a
//! basic guide for writing compilers, but for now it's a fairly silly
//! (although complete) language and implementation, featuring:
//!
//! * Variable binding with basic arithmetic operators.
//! * The ability to print variable values.
//!
//! I'll be extending this list into the future, with the eventual goal of
//! being able to implement basic programming tasks with it. For example,
//! I have a goal of eventually writing reasonably-clear
//! [Advent of Code](https://adventofcode.com/) implementations with it.
//!
//! Users of this as a library will want to choose their adventure based
//! on how much they want to customize their experience; I've defaulted
//! to providing the ability to see internals, rather than masking them,
//! so folks can play with things as they see fit.
//!
//! ## Easy Mode - Just Running a REPL or Compiler
//!
//! For easiest use, you will want to use either the [`Compiler`] object
//! or the [`REPL`] object.
//!
//! As you might expect, the [`Compiler`] object builds a compiler, which
//! can be re-used to compile as many files as you'd like. Right now,
//! that's all it does. (TODO: Add a linker function to it.)
//!
//! The [`REPL`] object implements the core of what you'll need to
//! implement a just-in-time compiled read-eval-print loop. It will
//! maintain variable state and make sure that variables are linked
//! appropriately as the loop progresses.
//!
//! ## Hard Mode - Looking at the individual passes
//!
//! This compiler is broken into three core parts:
//!
//! 1. The front-end / syntax engine. This portion of the compiler is
//! responsible for turning basic strings (or files) into a machine-
//! friendly abstract syntax tree. See the [`syntax`] module for
//! more information.
//! 2. The IR. This portion of the compiler will be responsible for
//! high-level code analysis and transformation ... although for
//! now, it doesn't do much at all. See the [`ir`] module for more
//! information.
//! 3. The Backend implementation. This portion of the compiler turns
//! the IR from the previous section into Cranelift structures, and
//! helps with either compiling them via JIT or statically compiling
//! them into a file. The [`backend`] module also contains information
//! about the runtime functions made available to the user.
//!
//! ## Testing
//!
//! Testing is a key focus of this effort. To that end, both the syntax
//! tree used in the syntax module and the IR used in the middle of the
//! compiler implement `Arbitrary`, and are subject to property-based
//! testing to make sure that various passes work properly.
//!
//! In addition, to support basic equivalence testing, we include support
//! for evaluating all expressions. The [`eval`] module provides some
//! utility support for this work.
//!
pub mod backend;
pub mod eval;
pub mod ir;
pub mod syntax;
/// Implementation module for the high-level compiler.
mod compiler;
/// Implementation module for the high-level REPL.
mod repl;
pub use crate::compiler::Compiler;
pub use crate::repl::REPL;

166
src/repl.rs Normal file
View File

@@ -0,0 +1,166 @@
use crate::backend::{Backend, BackendError};
use crate::ir::Program as IR;
use crate::syntax::{Location, ParserError, Statement};
use codespan_reporting::diagnostic::Diagnostic;
use codespan_reporting::files::SimpleFiles;
use codespan_reporting::term::{self, Config};
use cranelift_jit::JITModule;
use cranelift_module::ModuleError;
use pretty::termcolor::{ColorChoice, StandardStream};
use std::collections::HashMap;
/// A high-level REPL helper for NGR.
///
/// This object holds most of the state required to implement some
/// form of interactive compiler for NGR; all you need to do is provide
/// the actual user IO.
///
/// For most console-based use cases, the [`Default`] implementation
/// should be sufficient; it prints any warnings or errors to `stdout`,
/// using a default color scheme that should work based on the terminal
/// type. For more complex interactions, though, you may want to use
/// the `REPL::new` function to provide your own print substrate.
pub struct REPL {
    // Stores every input line so diagnostics can reference source spans.
    file_database: SimpleFiles<String, String>,
    // The JIT backend used to compile and run each statement in memory.
    jitter: Backend<JITModule>,
    // Tracks where each variable was first bound, for validation and
    // so we only define each variable with the backend once.
    variable_binding_sites: HashMap<String, Location>,
    // Where diagnostics are emitted, and how they're formatted.
    console: StandardStream,
    console_config: Config,
}
impl Default for REPL {
    /// Build a REPL that reports to `stdout`, auto-detecting color support.
    ///
    /// # Panics
    ///
    /// Panics if the JIT backend cannot be initialized. If you need to
    /// handle that failure gracefully, call [`REPL::new`] directly.
    fn default() -> Self {
        let console = StandardStream::stdout(ColorChoice::Auto);
        // `expect` rather than `unwrap` so a failed JIT setup produces an
        // actionable message instead of a bare panic.
        REPL::new(console, Config::default())
            .expect("failed to initialize the JIT backend for the default REPL")
    }
}
/// The set of errors that can occur while processing a single REPL input.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, thiserror::Error)]
enum REPLError {
    /// The input line failed to lex or parse.
    #[error("Error parsing statement: {0}")]
    Parser(#[from] ParserError),
    /// The backend failed while defining symbols or compiling the statement.
    #[error("JIT error: {0}")]
    JIT(#[from] BackendError),
    /// An error surfaced directly from cranelift's module machinery.
    #[error("Internal cranelift error: {0}")]
    Cranelift(#[from] ModuleError),
    /// The diagnostic-reporting machinery itself failed while emitting.
    #[error(transparent)]
    Reporting(#[from] codespan_reporting::files::Error),
}
impl From<REPLError> for Diagnostic<usize> {
    /// Turn a REPL error into a user-presentable diagnostic.
    ///
    /// Parser and JIT errors carry their own diagnostic conversions; the
    /// cranelift and reporting cases are internal failures, so we surface
    /// them as `bug` diagnostics with the underlying message.
    fn from(value: REPLError) -> Self {
        match value {
            REPLError::Parser(err) => Diagnostic::from(&err),
            REPLError::JIT(err) => Diagnostic::from(err),
            // `to_string()` over `format!("{}", ..)` — same output, idiomatic.
            REPLError::Cranelift(err) => Diagnostic::bug().with_message(err.to_string()),
            REPLError::Reporting(err) => Diagnostic::bug().with_message(err.to_string()),
        }
    }
}
impl REPL {
    /// Construct a new REPL helper, using the given stream implementation and console configuration.
    ///
    /// For most users, the [`Default::default`] implementation will be sufficient;
    /// it will use `stdout` and a default console configuration. But if you need to
    /// be more specific, this will help you provide more guidance to the REPL as it
    /// evaluates things.
    ///
    /// Returns an error if the JIT backend cannot be initialized.
    pub fn new(console: StandardStream, console_config: Config) -> Result<Self, BackendError> {
        Ok(REPL {
            file_database: SimpleFiles::new(),
            jitter: Backend::jit(None)?,
            variable_binding_sites: HashMap::new(),
            console,
            console_config,
        })
    }

    /// Emit a diagnostic to the configured console.
    ///
    /// This is just a convenience function; there's a lot of boilerplate in printing
    /// diagnostics, and it was nice to pull it out into its own function.
    fn emit_diagnostic(
        &mut self,
        diagnostic: Diagnostic<usize>,
    ) -> Result<(), codespan_reporting::files::Error> {
        term::emit(
            &mut self.console,
            &self.console_config,
            &self.file_database,
            &diagnostic,
        )
    }

    /// Process a line of input, printing any problems or the results.
    ///
    /// The line number argument is just for a modicum of source information, to
    /// provide to the user if some parsing or validation step fails. It can be
    /// changed to be any value you like that provides some insight into what
    /// failed, although it is probably a good idea for it to be different for
    /// every invocation of this function. (Not critical, but a good idea.)
    ///
    /// Any warnings or errors generated in processing this command will be
    /// printed to the configured console. If there are no problems, the
    /// command will be compiled and then executed.
    pub fn process_input(&mut self, line_no: usize, command: String) {
        if let Err(err) = self.process(line_no, command) {
            // The error itself is reported as a diagnostic; if even *that*
            // fails, all we can do is complain on stderr.
            if let Err(e) = self.emit_diagnostic(Diagnostic::from(err)) {
                eprintln!(
                    "WOAH! System having trouble printing error messages. This is very bad. ({})",
                    e
                );
            }
        }
    }

    /// The internal implementation, with a handy `Result` type.
    ///
    /// All information from the documentation of `REPL::process_input` applies here,
    /// as well; this is the internal implementation of that function, which is
    /// differentiated by returning a `Result` type that is hidden from the user
    /// in the case of `REPL::process_input`.
    fn process(&mut self, line_no: usize, command: String) -> Result<(), REPLError> {
        // Every input line is stored in the file database (always under the
        // name "entry") so diagnostics can point back into the user's input.
        // NOTE(review): entries accumulate for the lifetime of the REPL.
        let entry = self.file_database.add("entry".to_string(), command);
        let source = self
            .file_database
            .get(entry)
            .expect("entry exists")
            .source();
        let syntax = Statement::parse(entry, source)?;
        // if this is a variable binding, and we've never defined this variable before,
        // we should tell cranelift about it. this is optimistic; if we fail to compile,
        // then we won't use this definition until someone tries again.
        if let Statement::Binding(_, ref name, _) = syntax {
            if !self.variable_binding_sites.contains_key(name.as_str()) {
                self.jitter.define_string(name)?;
                self.jitter.define_variable(name.clone())?;
            }
        };
        // Report every validation error and warning to the console; errors
        // (but not warnings) stop us from compiling and running the line.
        let (mut errors, mut warnings) = syntax.validate(&mut self.variable_binding_sites);
        let stop = !errors.is_empty();
        let messages = errors
            .drain(..)
            .map(Into::into)
            .chain(warnings.drain(..).map(Into::into));
        for message in messages {
            self.emit_diagnostic(message)?;
        }
        if stop {
            return Ok(());
        }
        // Lower to IR, JIT-compile under a per-line function name, and run it.
        let ir = IR::from(syntax);
        let name = format!("line{}", line_no);
        let function_id = self.jitter.compile_function(&name, ir)?;
        self.jitter.module.finalize_definitions()?;
        let compiled_bytes = self.jitter.bytes(function_id);
        // SAFETY: this assumes `bytes` returns a pointer to finalized,
        // executable JIT code whose signature is `fn() -> ()`.
        // NOTE(review): confirm `Backend::bytes` guarantees this — a
        // signature mismatch here would be undefined behavior.
        let compiled_function = unsafe { std::mem::transmute::<_, fn() -> ()>(compiled_bytes) };
        compiled_function();
        Ok(())
    }
}

View File

@@ -1,12 +1,36 @@
//! NGR Parsing: Reading input, turning it into sense (or errors).
//!
//! This module implements the front end of the compiler, which is responsible for
//! reading in NGR syntax as a string, turning it into a series of reasonable Rust
//! structures for us to manipulate, and doing some validation while it's at it.
//!
//! The core flow for this work is:
//!
//! * Turning the string into a series of language-specific [`Token`]s.
//! * Taking those tokens, and computing a basic syntax tree from them,
//! using our parser ([`ProgramParser`] or [`StatementParser`], generated
//! by [`lalrpop`](https://lalrpop.github.io/lalrpop/)).
//! * Validating the tree we have parsed, using [`Program::validate`],
//! returning any warnings or errors we have found.
//!
//! In addition to all of this, we make sure that the structures defined in this
//! module are all:
//!
//! * Instances of [`Pretty`](::pretty::Pretty), so that you can print stuff back
//! out that can be read by a human.
//! * Instances of [`Arbitrary`](proptest::prelude::Arbitrary), so they can be
//! used in `proptest`-based property testing. There are built-in tests in
//! the library, for example, to make sure that the pretty-printing round-trips.
//! * Can be evaluated using an `eval` function, for comparison with later
//! versions of the function downstream.
use codespan_reporting::{diagnostic::Diagnostic, files::SimpleFiles};
use lalrpop_util::lalrpop_mod;
use logos::Logos;
mod arbitrary;
pub mod ast;
mod ast;
mod eval;
mod location;
mod simplify;
mod tokens;
lalrpop_mod!(
#[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)]
@@ -18,7 +42,7 @@ mod validate;
pub use crate::syntax::ast::*;
pub use crate::syntax::location::Location;
use crate::syntax::parser::ProgramParser;
pub use crate::syntax::parser::{ProgramParser, StatementParser};
pub use crate::syntax::tokens::{LexerError, Token};
#[cfg(test)]
use ::pretty::{Arena, Pretty};
@@ -29,33 +53,62 @@ use proptest::{prop_assert, prop_assert_eq};
use std::str::FromStr;
use thiserror::Error;
use self::parser::StatementParser;
/// One of the many errors that can occur when processing text input.
///
/// If you get one of these and want to display it to the user, we strongly
/// suggest using the [`From`] implementation to turn this into a [`Diagnostic`],
/// and then printing it via [`codespan_reporting`].
#[derive(Debug, Error)]
pub enum ParserError {
    /// Raised by the lexer when we see some text that doesn't make
    /// any sense in the language.
    #[error("Invalid token")]
    InvalidToken(Location),
    /// Raised when we're parsing the file and run into an EOF in a
    /// place we really weren't expecting. The `Vec<String>` lists the
    /// token names the parser would have accepted instead.
    #[error("Unrecognized EOF")]
    UnrecognizedEOF(Location, Vec<String>),
    /// Raised when we're parsing the file, and run into a token in a
    /// place we weren't expecting it. Carries the token's start and end
    /// locations, the token itself, and what was expected instead.
    #[error("Unrecognized token")]
    UnrecognizedToken(Location, Location, Token, Vec<String>),
    /// Raised when we were expecting the end of the file, but instead
    /// got another token.
    #[error("Extra token")]
    ExtraToken(Location, Token, Location),
    /// Raised when the lexer just had some sort of internal problem
    /// and just gave up.
    #[error("Lexing failure")]
    LexFailure(Location),
    /// Raised when we tried to reference a file, or add a file, to our
    /// file database, and the database ran into a problem.
    #[error("File database error")]
    FileDatabaseError(#[from] codespan_reporting::files::Error),
    /// Raised when the OS is having problems giving us data.
    #[error("Read error")]
    ReadError(#[from] std::io::Error),
}
impl ParserError {
/// Convert one of lalrpop's parser errors into one of our own, which we can more
/// easily implement translation into [`Diagnostic`].
///
/// This function is relatively straightforward, because we match the errors pretty
/// closely. The major thing we do here is convert [`lalrpop`]'s notion of a location,
/// which is just an offset that it got from the lexer, into an actual location that
/// we can use in our [`Diagnostic`]s.
fn convert(file_idx: usize, err: ParseError<usize, Token, LexerError>) -> Self {
match err {
ParseError::InvalidToken { location } => {
ParserError::InvalidToken(Location::new(file_idx, location))
}
ParseError::UnrecognizedEOF { location, expected } => {
ParseError::UnrecognizedEof { location, expected } => {
ParserError::UnrecognizedEOF(Location::new(file_idx, location), expected)
}
ParseError::UnrecognizedToken {
@@ -83,6 +136,10 @@ impl ParserError {
}
}
/// This is just a nice little function to print out what we expected, if
/// we had some expectations. Because English is a little wonky, there's
/// some odd stuff with whether we get 0, 1, 2, or more, and it's nice to
/// just split that bit of logic out.
fn display_expected(expected: &[String]) -> String {
match expected.len() {
0 => "".to_string(),
@@ -96,6 +153,8 @@ fn display_expected(expected: &[String]) -> String {
}
}
/// Given a list of strings, comma separate (with a space) them, as in an
/// English list.
fn comma_separate(strings: &[String]) -> String {
let mut result = String::new();
@@ -125,12 +184,12 @@ impl<'a> From<&'a ParserError> for Diagnostic<usize> {
let expected_str =
format!("unexpected token {}{}", token, display_expected(expected));
let unexpected_str = format!("unexpected token {}", token);
let mut labels = start.range_label(end);
let labels = start.range_label(end);
Diagnostic::error()
.with_labels(
labels
.drain(..)
.into_iter()
.map(|l| l.with_message(unexpected_str.clone()))
.collect(),
)
@@ -142,12 +201,12 @@ impl<'a> From<&'a ParserError> for Diagnostic<usize> {
let expected_str =
format!("unexpected token {} after the expected end of file", token);
let unexpected_str = format!("unexpected token {}", token);
let mut labels = start.range_label(end);
let labels = start.range_label(end);
Diagnostic::error()
.with_labels(
labels
.drain(..)
.into_iter()
.map(|l| l.with_message(unexpected_str.clone()))
.collect(),
)
@@ -167,6 +226,14 @@ impl<'a> From<&'a ParserError> for Diagnostic<usize> {
}
impl Program {
/// Parse the given file, adding it to the database as part of the process.
///
/// This operation reads the file from disk and adds it to the database for future
/// reference. If you get an error, we strongly suggest conversion to [`Diagnostic`]
/// and then reporting it to the user via [`codespan_reporting`]. You should use
/// this function if you're pretty sure that you've never seen this file before,
/// and [`Program::parse`] if you have and know its index and already have it in
/// memory.
pub fn parse_file(
file_database: &mut SimpleFiles<String, String>,
file_name: &str,
@@ -177,6 +244,11 @@ impl Program {
Program::parse(file_handle, file_db_info.source())
}
/// Parse a block of text you have in memory, using the given index for [`Location`]s.
///
/// If you use a nonsensical file index, everything will work fine until you try to
/// report an error, at which point [`codespan_reporting`] may have some nasty things
/// to say to you.
pub fn parse(file_idx: usize, buffer: &str) -> Result<Program, ParserError> {
let lexer = Token::lexer(buffer)
.spanned()
@@ -188,6 +260,12 @@ impl Program {
}
impl Statement {
/// Parse a statement that you have in memory, using the given index for [`Location`]s.
///
/// As with [`Program::parse`], if you use a bad file index, you'll get weird behaviors
/// when you try to print errors, but things should otherwise work fine. This function
/// will only parse a single statement, which is useful in the REPL, but probably shouldn't
/// be used when reading in whole files.
pub fn parse(file_idx: usize, buffer: &str) -> Result<Statement, ParserError> {
let lexer = Token::lexer(buffer)
.spanned()

View File

@@ -1,12 +1,32 @@
use crate::syntax::Location;
/// The set of valid binary operators.
pub static BINARY_OPERATORS: &[&str] = &["+", "-", "*", "/"];
/// A structure representing a parsed program.
///
/// One `Program` is associated with exactly one input file, and the
/// vector is arranged in exactly the same order as the parsed file.
/// Because this is the syntax layer, the program is guaranteed to be
/// syntactically valid, but may be nonsense. There could be attempts
/// to use unbound variables, for example, until after someone runs
/// `validate` and it comes back without errors.
#[derive(Clone, Debug, PartialEq)]
pub struct Program {
    // The program's statements, in source order.
    pub statements: Vec<Statement>,
}
/// A parsed statement.
///
/// Statements are guaranteed to be syntactically valid, but may be
/// complete nonsense at the semantic level. Which is to say, all the
/// print statements were correctly formatted, and all the variables
/// referenced are definitely valid symbols, but they may not have
/// been defined or anything.
///
/// Note that equivalence testing on statements is independent of
/// source location; it is testing if the two statements say the same
/// thing, not if they are the exact same statement.
#[derive(Clone, Debug)]
pub enum Statement {
Binding(Location, String, Expression),
@@ -28,6 +48,12 @@ impl PartialEq for Statement {
}
}
/// An expression in the underlying syntax.
///
/// Like statements, these expressions are guaranteed to have been
/// formatted correctly, but may not actually make any sense. Also
/// like Statements, the [`PartialEq`] implementation does not take
/// source positions into account.
#[derive(Clone, Debug)]
pub enum Expression {
Value(Location, Value),
@@ -54,7 +80,9 @@ impl PartialEq for Expression {
}
}
/// A literal value from the source syntax.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Value {
    /// The value of the number, and an optional base that it was written in.
    /// NOTE(review): presumably `None` means the default (decimal) base —
    /// confirm against the lexer.
    Number(Option<u8>, i64),
}

View File

@@ -4,11 +4,23 @@ use crate::eval::{EvalEnvironment, EvalError, Value};
use crate::syntax::{Expression, Program, Statement};
impl Program {
/// Evaluate the program, returning either an error or what it prints out when run.
///
/// Doing this evaluation is particularly useful for testing, to ensure that if we
/// modify a program in some way it does the same thing on both sides of the
/// transformation. It's also sometimes just nice to know what a program will be
/// doing.
///
/// Note that the errors here are slightly more strict than what we enforce at runtime.
/// For example, we check for overflow and underflow errors during evaluation, and
/// we don't check for those in the compiled code.
pub fn eval(&self) -> Result<String, EvalError> {
let mut env = EvalEnvironment::empty();
let mut stdout = String::new();
for stmt in self.statements.iter() {
// at this point, evaluation is pretty simple. just walk through each
// statement, in order, and record printouts as we come to them.
match stmt {
Statement::Binding(_, name, value) => {
let actual_value = value.eval(&env)?;
@@ -40,6 +52,7 @@ impl Expression {
let mut arg_values = Vec::with_capacity(args.len());
for arg in args.iter() {
// yay, recursion! makes this pretty straightforward
arg_values.push(arg.eval(env)?);
}

View File

@@ -1,5 +1,9 @@
use codespan_reporting::diagnostic::{Diagnostic, Label};
/// A source location, for use in pointing users towards warnings and errors.
///
/// Internally, locations are very tied to the `codespan_reporting` library,
/// and the primary use of them is to serve as anchors within that library.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Location {
file_idx: usize,
@@ -7,10 +11,22 @@ pub struct Location {
}
impl Location {
    /// Generate a new `Location` from a file index and an offset from the
    /// start of the file.
    ///
    /// The file index is based on the file database being used. See the
    /// `codespan_reporting::files::SimpleFiles::add` function, which is
    /// normally where we get this index. The offset is measured from the
    /// beginning of that file, as produced by the lexer.
    pub fn new(file_idx: usize, offset: usize) -> Self {
        Location { file_idx, offset }
    }
/// Generate a `Location` for a completely manufactured bit of code.
///
/// Ideally, this is used only in testing, as any code we generate as
/// part of the compiler should, theoretically, be tied to some actual
/// location in the source code. That being said, this can be used in
/// a pinch ... just maybe try to avoid it if you can.
pub fn manufactured() -> Self {
Location {
file_idx: 0,
@@ -18,27 +34,73 @@ impl Location {
}
}
/// Generate a primary label for a [`Diagnostic`], based on this source
/// location.
///
/// Note, this is just the [`Label`], you'll want to fill in the [`Diagnostic`]
/// with a lot more information.
///
/// Primary labels are the things that are the key cause of the message.
/// If, for example, it was an error to bind a variable named "x", and
/// then have another binding of a variable named "x", the second one
/// would likely be the primary label (because that's where the error
/// actually happened), but you'd probably want to make the first location
/// the secondary label to help users find it.
pub fn primary_label(&self) -> Label<usize> {
Label::primary(self.file_idx, self.offset..self.offset)
}
/// Generate a secondary label for a [`Diagnostic`], based on this source
/// location.
///
/// Note, this is just the [`Label`], you'll want to fill in the [`Diagnostic`]
/// with a lot more information.
///
/// Secondary labels are the things that are involved in the message, but
/// aren't necessarily a problem in and of themselves. If, for example, it
/// was an error to bind a variable named "x", and then have another binding
/// of a variable named "x", the second one would likely be the primary
/// label (because that's where the error actually happened), but you'd
/// probably want to make the first location the secondary label to help
/// users find it.
pub fn secondary_label(&self) -> Label<usize> {
Label::secondary(self.file_idx, self.offset..self.offset)
}
pub fn range_label(&self, end: &Location) -> Vec<Label<usize>> {
if self.file_idx == end.file_idx {
vec![Label::primary(self.file_idx, self.offset..end.offset)]
} else if self.file_idx == 0 {
// if this is a manufactured item, then ... just try the other one
vec![Label::primary(end.file_idx, end.offset..end.offset)]
/// Given this location and another, generate a primary label that
/// specifies the area between those two locations.
///
/// See [`Self::primary_label`] for some discussion of primary versus
/// secondary labels. If the two locations are the same, this method does
/// the exact same thing as [`Self::primary_label`]. If this item was
/// generated by [`Self::manufactured`], it will act as if you'd called
/// `primary_label` on the argument. Otherwise, it will generate the obvious
/// span.
///
/// This function will return `None` only in the case that you provide
/// labels from two different files, which it cannot sensibly handle.
pub fn range_label(&self, end: &Location) -> Option<Label<usize>> {
if self.file_idx == 0 {
return Some(end.primary_label());
}
if self.file_idx != end.file_idx {
return None;
}
if self.offset > end.offset {
Some(Label::primary(self.file_idx, end.offset..self.offset))
} else {
// we'll just pick the first location if this is in two different
// files
vec![Label::primary(self.file_idx, self.offset..self.offset)]
Some(Label::primary(self.file_idx, self.offset..end.offset))
}
}
/// Return an error diagnostic centered at this location.
///
/// Note that this [`Diagnostic`] will have no information associated with
/// it other than that (a) there is an error, and (b) that the error is at
/// this particular location. You'll need to extend it with actually useful
/// information, like what kind of error it is.
pub fn error(&self) -> Diagnostic<usize> {
Diagnostic::error().with_labels(vec![Label::primary(
self.file_idx,
@@ -46,6 +108,12 @@ impl Location {
)])
}
/// Return an error diagnostic centered at this location, with the given message.
///
/// This is much more useful than [`Self::error`], because it actually provides
/// the user with some guidance. That being said, you still might want to add
/// even more information to it, using [`Diagnostic::with_labels`],
/// [`Diagnostic::with_notes`], or [`Diagnostic::with_code`].
pub fn labelled_error(&self, msg: &str) -> Diagnostic<usize> {
Diagnostic::error().with_labels(vec![Label::primary(
self.file_idx,

View File

@@ -1,14 +1,32 @@
//! The parser for NGR!
//!
//! This file contains the grammar for the NGR language; a grammar is a nice,
//! machine-readable way to describe how your language's syntax works. For
//! example, here we describe a program as a series of statements, statements
//! as either variable binding or print statements, etc. As the grammar gets
//! more complicated, using tools like [`lalrpop`] becomes even more important.
//! (Although, at some point, things can become so complicated that you might
//! eventually want to leave lalrpop behind.)
//!
use crate::syntax::{LexerError, Location};
use crate::syntax::ast::{Program,Statement,Expression,Value};
use crate::syntax::tokens::Token;
use internment::ArcIntern;
// one cool thing about lalrpop: we can pass arguments. in this case, the
// file index of the file we're parsing. we combine this with the file offset
// that Logos gives us to make a [`crate::syntax::Location`].
grammar(file_idx: usize);
// this is a slightly odd way to describe this, but: consider this section
// as describing the stuff that is external to the lalrpop grammar that it
// needs to know to do its job.
extern {
type Location = usize;
type Location = usize; // Logos, our lexer, implements locations as
// offsets from the start of the file.
type Error = LexerError;
// here we redeclare all of the tokens.
enum Token {
"=" => Token::Equals,
";" => Token::Semi,
@@ -22,57 +40,123 @@ extern {
"*" => Token::Operator('*'),
"/" => Token::Operator('/'),
// the previous items just match their tokens, and if you try
// to name and use "their value", you get their source location.
// For these, we want "their value" to be their actual contents,
// which is why we put their types in angle brackets.
"<num>" => Token::Number((<Option<u8>>,<i64>)),
"<var>" => Token::Variable(<ArcIntern<String>>),
}
}
pub Program: Program = {
// a program is just a set of statements
<stmts:Statements> => Program {
statements: stmts
}
}
Statements: Vec<Statement> = {
// a statement is either a set of statements followed by another
// statement (note, here, that you can name the result of a sub-parse
// using <name: subrule>) ...
<mut stmts:Statements> <stmt:Statement> => {
stmts.push(stmt);
stmts
},
// ... or it's nothing. This may feel like an awkward way to define
// lists of things -- and it is a bit awkward -- but there are actual
// technical reasons that you want to (a) use recursion to define
// these, and (b) use *left* recursion, specifically. That's why, in
// this file, all of the recursive cases are to the left, like they
// are above.
//
// the details of why left recursion is better is actually pretty
// fiddly and in the weeds, and if you're interested you should look
// up LALR parsers versus LL parsers; both their differences and how
// they're constructed, as they're kind of neat.
//
// but if you're just writing grammars with lalrpop, then you should
// just remember that you should always use left recursion, and be
// done with it.
=> {
Vec::new()
}
}
pub Statement: Statement = {
// A statement can be a variable binding. Note, here, that we use this
// funny @L thing to get the source location before the variable, so that
// we can say that this statement spans across everything.
<l:@L> <v:"<var>"> "=" <e:Expression> ";" => Statement::Binding(Location::new(file_idx, l), v.to_string(), e),
// Alternatively, a statement can just be a print statement.
"print" <l:@L> <v:"<var>"> ";" => Statement::Print(Location::new(file_idx, l), v.to_string()),
}
// Expressions! Expressions are a little fiddly, because we're going to
// use a little bit of a trick to make sure that we get operator precedence
// right. The trick works by creating a top-level `Expression` grammar entry
// that just points to the thing with the *weakest* precedence. In this case,
// we have addition, subtraction, multiplication, and division, so addition
// and subtraction have the weakest precedence.
//
// Then, as we go down the precedence tree, each item will recurse (left!)
// to other items at the same precedence level. The right hand operator, for
// binary operators (which is all of ours, at the moment) will then be one
// level stronger precedence. In addition, we'll let people just fall through
// to the next level; so if there isn't an addition or subtraction, we'll just
// fall through to the multiplication/division case.
//
// Finally, at the bottom, we'll have the core expressions (like constants,
// variables, etc.) as well as a parenthesized version of `Expression`, which
// gets us right up top again.
//
// Understanding why this works to solve all your operator precedence problems
// is a little hard to give an easy intuition for, but for myself it helped
// to run through a few examples. Consider thinking about how you want to
// parse something like "1 + 2 * 3", for example, versus "1 + 2 + 3" or
// "1 * 2 + 3", and hopefully that'll help.
Expression: Expression = {
AdditiveExpression,
}
// we group addition and subtraction under the heading "additive"
AdditiveExpression: Expression = {
<e1:AdditiveExpression> <l:@L> "+" <e2:MultiplicativeExpression> => Expression::Primitive(Location::new(file_idx, l), "+".to_string(), vec![e1, e2]),
<e1:AdditiveExpression> <l:@L> "-" <e2:MultiplicativeExpression> => Expression::Primitive(Location::new(file_idx, l), "-".to_string(), vec![e1, e2]),
MultiplicativeExpression,
}
// similarly, we group multiplication and division under "multiplicative"
MultiplicativeExpression: Expression = {
<e1:MultiplicativeExpression> <l:@L> "*" <e2:AtomicExpression> => Expression::Primitive(Location::new(file_idx, l), "*".to_string(), vec![e1, e2]),
<e1:MultiplicativeExpression> <l:@L> "/" <e2:AtomicExpression> => Expression::Primitive(Location::new(file_idx, l), "/".to_string(), vec![e1, e2]),
AtomicExpression,
}
// finally, we describe our lowest-level expressions as "atomic", because
// they cannot be further divided into parts
AtomicExpression: Expression = {
// just a variable reference
<l:@L> <v:"<var>"> => Expression::Reference(Location::new(file_idx, l), v.to_string()),
// just a number
<l:@L> <n:"<num>"> => {
let val = Value::Number(n.0, n.1);
Expression::Value(Location::new(file_idx, l), val)
},
// a tricky case: also just a number, but using a negative sign. an
// alternative way to do this -- and we may do this eventually -- is
// to implement a unary negation expression. this has the odd effect
// that the user never actually writes down a negative number; they just
// write positive numbers which are immediately sent to a negation
// primitive!
<l:@L> "-" <n:"<num>"> => {
let val = Value::Number(n.0, -n.1);
Expression::Value(Location::new(file_idx, l), val)
},
// finally, let people parenthesize expressions and get back to a
// lower precedence
"(" <e:Expression> ")" => e,
}

View File

@@ -1,63 +0,0 @@
use crate::syntax::ast::{Expression, Program, Statement};
impl Program {
    /// Flatten every statement in the program into simple bindings.
    ///
    /// Compound expressions are broken apart so that each binding's
    /// right-hand side is at most one primitive application deep; the
    /// intermediate results are bound to freshly generated names.
    pub fn simplify(mut self) -> Self {
        // the counter that keeps generated names unique across the whole
        // program; it is threaded through every statement we rewrite.
        let mut gensym_index = 1;
        let originals = std::mem::take(&mut self.statements);
        self.statements = originals
            .into_iter()
            .flat_map(|stmt| stmt.simplify(&mut gensym_index))
            .collect();
        self
    }
}
impl Statement {
    /// Rewrite this statement into a sequence of equivalent "simple"
    /// statements, in which no binding's right-hand side nests
    /// expressions inside expressions.
    ///
    /// `gensym_index` is the shared counter used to generate fresh names
    /// for intermediate bindings; it advances as names are consumed.
    pub fn simplify(self, gensym_index: &mut usize) -> Vec<Statement> {
        match self {
            // the only interesting case: a binding whose right-hand side is
            // a primitive application, which may nest further expressions
            // that need to be pulled out into their own bindings first.
            Statement::Binding(loc, name, value @ Expression::Primitive(_, _, _)) => {
                let (mut result, simplified) = value.rebind(&name, gensym_index);
                result.push(Statement::Binding(loc, name, simplified));
                result
            }
            // prints, and bindings of plain values or references, are
            // already as simple as they can get.
            simple => vec![simple],
        }
    }
}
impl Expression {
    /// Pull any nested sub-expressions of a primitive application out into
    /// their own bindings, returning those prerequisite statements together
    /// with a trivially-simple replacement expression.
    ///
    /// Values and references are returned untouched (with no prerequisites);
    /// a primitive application is itself bound to a freshly generated name
    /// derived from `base_name`, and a reference to that name is returned in
    /// its place.
    fn rebind(self, base_name: &str, gensym_index: &mut usize) -> (Vec<Statement>, Expression) {
        // already-simple expressions need no rebinding at all
        let (loc, prim, args) = match self {
            Expression::Value(_, _) | Expression::Reference(_, _) => return (Vec::new(), self),
            Expression::Primitive(loc, prim, args) => (loc, prim, args),
        };
        // recursively simplify every argument, collecting the statements
        // each one needs to have run first
        let mut prereqs = Vec::new();
        let mut simple_args = Vec::with_capacity(args.len());
        for arg in args {
            let (mut arg_prereqs, simple) = arg.rebind(base_name, gensym_index);
            prereqs.append(&mut arg_prereqs);
            simple_args.push(simple);
        }
        // bind the (now flat) primitive application to a fresh name ...
        let fresh_name = format!("<{}:{}>", base_name, *gensym_index);
        *gensym_index += 1;
        prereqs.push(Statement::Binding(
            loc.clone(),
            fresh_name.clone(),
            Expression::Primitive(loc.clone(), prim, simple_args),
        ));
        // ... and stand a reference to that name in for the original
        (prereqs, Expression::Reference(loc, fresh_name))
    }
}

View File

@@ -4,8 +4,30 @@ use std::fmt;
use std::num::ParseIntError;
use thiserror::Error;
/// A single token of the input stream; used to help the parsing go down
/// more easily.
///
/// The key way to generate this structure is via the [`Logos`] trait.
/// See the [`logos`] documentation for more information; we use the
/// [`Token::lexer`] function internally.
///
/// The first step in the compilation process is turning the raw string
/// data (in UTF-8, which is its own joy) into a sequence of more sensible
/// tokens. Here, for example, we turn "x=5" into three tokens: a
/// [`Token::Variable`] for "x", a [`Token::Equals`] for the "=", and
/// then a [`Token::Number`] for the "5". Later on, we'll worry about
/// making sense of those three tokens.
///
/// For now, our list of tokens is relatively straightforward. We'll
/// need/want to extend these later.
///
/// The [`std::fmt::Display`] implementation for [`Token`] should
/// round-trip; if you lex a string generated with the [`std::fmt::Display`]
/// trait, you should get back the exact same token.
#[derive(Logos, Clone, Debug, PartialEq, Eq)]
pub enum Token {
// Our first set of tokens are simple characters that we're
// going to use to structure NGR programs.
#[token("=")]
Equals,
@@ -18,12 +40,20 @@ pub enum Token {
#[token(")")]
RightParen,
// Next we take care of any reserved words; I always like to put
// these before we start recognizing more complicated regular
// expressions. I don't think it matters, but it works for me.
#[token("print")]
Print,
// Next are the operators for NGR. We only have 4, now, but
// we might extend these later, or even make them user-definable!
#[regex(r"[+\-*/]", |v| v.slice().chars().next())]
Operator(char),
/// Numbers capture both the value we read from the input,
/// converted to an `i64`, as well as the base the user used
/// to write the number, if they did so.
#[regex(r"0b[01]+", |v| parse_number(Some(2), v))]
#[regex(r"0o[0-7]+", |v| parse_number(Some(8), v))]
#[regex(r"0d[0-9]+", |v| parse_number(Some(10), v))]
@@ -31,12 +61,23 @@ pub enum Token {
#[regex(r"[0-9]+", |v| parse_number(None, v))]
Number((Option<u8>, i64)),
// Variables; this is a very standard, simple set of characters
// for variables, but feel free to experiment with more complicated
// things. I chose to force variables to start with a lower case
// letter, too.
#[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))]
Variable(ArcIntern<String>),
// the next token will be an error token
#[error]
// we're actually just going to skip whitespace, though
#[regex(r"[ \t\r\n\f]+", logos::skip)]
// this is an extremely simple version of comments, just line
// comments. More complicated /* */ comments can be harder to
// implement, and didn't seem worth it at the time.
#[regex(r"//.*", logos::skip)]
/// This token represents that some core error happened in lexing;
/// possibly that something didn't match anything at all.
Error,
}
@@ -63,19 +104,28 @@ impl fmt::Display for Token {
}
}
/// A sudden and unexpected error in the lexer.
#[derive(Debug, Error, PartialEq, Eq)]
pub enum LexerError {
/// The `usize` here is the offset that we ran into the problem, given
/// from the start of the file.
#[error("Failed lexing at {0}")]
LexFailure(usize),
}
#[cfg(test)]
impl Token {
/// Create a variable token with the given name. Very handy for
/// testing.
pub(crate) fn var(s: &str) -> Token {
Token::Variable(ArcIntern::new(s.to_string()))
}
}
/// Parse a number in the given base, return a pair of the base and the
/// parsed number. This is just a helper used for all of the number
/// regular expression cases, which kicks off to the obvious Rust
/// standard library function.
fn parse_number(
base: Option<u8>,
value: &Lexer<Token>,

View File

@@ -2,6 +2,13 @@ use crate::syntax::{Expression, Location, Program, Statement};
use codespan_reporting::diagnostic::Diagnostic;
use std::collections::HashMap;
/// An error we found while validating the input program.
///
/// These errors indicate that we should stop trying to compile
/// the program, because it's just fundamentally broken in a way
/// that we're not going to be able to work through. As with most
/// of these errors, we recommend converting this to a [`Diagnostic`]
/// and using [`codespan_reporting`] to present them to the user.
pub enum Error {
UnboundVariable(Location, String),
}
@@ -16,6 +23,13 @@ impl From<Error> for Diagnostic<usize> {
}
}
/// A problem we found validating the input that isn't critical.
///
/// These are things that the user might want to do something about,
/// but we can keep going without it being a problem. As with most of
/// these things, if you want to present this information to the user,
/// the best way to do so is via [`From`] and [`Diagnostic`], and then
/// interactions via [`codespan_reporting`].
#[derive(Debug, PartialEq, Eq)]
pub enum Warning {
ShadowedVariable(Location, Location, String),
@@ -37,6 +51,11 @@ impl From<Warning> for Diagnostic<usize> {
}
impl Program {
/// Validate that the program makes semantic sense, not just syntactic sense.
///
/// This checks for things like references to variables that don't exist, for
/// example, and generates warnings for things that are inadvisable but not
/// actually a problem.
pub fn validate(&self) -> (Vec<Error>, Vec<Warning>) {
let mut errors = vec![];
let mut warnings = vec![];
@@ -53,6 +72,15 @@ impl Program {
}
impl Statement {
/// Validate that the statement makes semantic sense, not just syntactic sense.
///
/// This checks for things like references to variables that don't exist, for
/// example, and generates warnings for things that are inadvisable but not
/// actually a problem. Since statements appear in a broader context, you'll
/// need to provide the set of variables that are bound where this statement
/// occurs. We use a `HashMap` to map these bound variables to the locations
/// where they're bound, because these locations are handy when generating errors
/// and warnings.
pub fn validate(
&self,
bound_variables: &mut HashMap<String, Location>,