commit b2f6b12ced1ac52739a36b50c2bc25c854c77307 Author: Adam Wick Date: Sat Aug 1 20:45:33 2020 -0700 🤷 The initial version of the compiler, both static and JIT. This implements a full compiler, with both static compilation and JIT support, for the world's simplest and silliest programming language. You can do math, and print variables. That's it. On the bright side, it implements every part of the compiler, from the lexer and parser; through analysis and simplification; and into a reasonable code generator. This should be a good jumping off point for adding more advanced features. Tests, including proptests, are included to help avoid regressions. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..55de121 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +/target +Cargo.lock +**/*.o +test +*.dSYM +.vscode +proptest-regressions/ \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..e69de29 diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..41e2435 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "ngr" +version = "0.1.0" +authors = ["awick"] +edition = "2021" + +[lib] +name = "ngr" +path = "src/lib.rs" + +[dependencies] +clap = { version = "^3.0.14", features = ["derive"] } +codespan = "0.11.1" +codespan-reporting = "0.11.1" +cranelift-codegen = "0.94.0" +cranelift-jit = "0.94.0" +cranelift-frontend = "0.94.0" +cranelift-module = "0.94.0" +cranelift-native = "0.94.0" +cranelift-object = "0.94.0" +internment = { version = "0.7.0", default-features = false, features = ["arc"] } +lalrpop-util = "^0.19.7" +lazy_static = "^1.4.0" +logos = "^0.12.0" +pretty = { version = "^0.11.2", features = ["termcolor"] } +proptest = "^1.0.0" +rustyline = "^11.0.0" +target-lexicon = "^0.12.5" +thiserror = "^1.0.30" + +[build-dependencies] +lalrpop = "^0.19.7" diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..23c7d3f --- /dev/null +++ b/build.rs @@ -0,0 +1,5 @@ 
+extern crate lalrpop; + +fn main() { + lalrpop::process_root().unwrap(); +} diff --git a/examples/basic/test1.ngr b/examples/basic/test1.ngr new file mode 100644 index 0000000..b8d66e1 --- /dev/null +++ b/examples/basic/test1.ngr @@ -0,0 +1,4 @@ +x = 5; +y = 4*x + 3; +print x; +print y; diff --git a/examples/basic/test2.ngr b/examples/basic/test2.ngr new file mode 100644 index 0000000..775e24a --- /dev/null +++ b/examples/basic/test2.ngr @@ -0,0 +1,4 @@ +x = 5; +x = 4*x + 3; +print x; +print y; diff --git a/runtime/rts.c b/runtime/rts.c new file mode 100644 index 0000000..4f955e6 --- /dev/null +++ b/runtime/rts.c @@ -0,0 +1,17 @@ +#include <stdio.h> +#include <stdint.h> + +void print(char *variable_name, uint64_t value) { + printf("%s = %llu\n", variable_name, (unsigned long long)value); /* cast: uint64_t is not unsigned long long on every ABI */ +} + +void caller() { + print("x", 4); +} + +extern void gogogo(); + +int main(int argc, char **argv) { + gogogo(); + return 0; +} \ No newline at end of file diff --git a/runtime/sample.c b/runtime/sample.c new file mode 100644 index 0000000..c7c18b1 --- /dev/null +++ b/runtime/sample.c @@ -0,0 +1,5 @@ +extern void print(char *variable_name, unsigned long long value); + +void gogogo() { + print("x", 4); +} \ No newline at end of file diff --git a/src/backend.rs b/src/backend.rs new file mode 100644 index 0000000..54622dc --- /dev/null +++ b/src/backend.rs @@ -0,0 +1,104 @@ +mod error; +mod into_crane; +mod runtime; + +use std::collections::HashMap; + +pub use self::error::BackendError; +pub use self::runtime::{RuntimeFunctionError, RuntimeFunctions}; +use cranelift_codegen::settings::Configurable; +use cranelift_codegen::{isa, settings}; +use cranelift_jit::{JITBuilder, JITModule}; +use cranelift_module::{default_libcall_names, DataContext, DataId, FuncId, Linkage, Module}; +use cranelift_object::{object, ObjectBuilder, ObjectModule}; +use target_lexicon::Triple; + +const EMPTY_DATUM: [u8; 8] = [0; 8]; + +pub struct Backend<M: Module> { + pub module: M, + data_ctx: DataContext, + runtime_functions: RuntimeFunctions, + 
defined_strings: HashMap<String, DataId>, + defined_symbols: HashMap<String, DataId>, +} + +impl Backend<JITModule> { + pub fn jit() -> Result<Self, BackendError> { + let platform = Triple::host(); + let isa_builder = isa::lookup(platform.clone())?; + let mut settings_builder = settings::builder(); + settings_builder.set("use_colocated_libcalls", "false")?; + settings_builder.set("is_pic", "false")?; + let isa = isa_builder.finish(settings::Flags::new(settings_builder))?; + let mut builder = JITBuilder::with_isa(isa, cranelift_module::default_libcall_names()); + + RuntimeFunctions::register_jit_implementations(&mut builder); + + let mut module = JITModule::new(builder); + let runtime_functions = RuntimeFunctions::new(&platform, &mut module)?; + + Ok(Backend { + module, + data_ctx: DataContext::new(), + runtime_functions, + defined_strings: HashMap::new(), + defined_symbols: HashMap::new(), + }) + } + + pub fn bytes(&self, function_id: FuncId) -> *const u8 { + self.module.get_finalized_function(function_id) + } +} + +impl Backend<ObjectModule> { + pub fn object_file(platform: Triple) -> Result<Self, BackendError> { + let isa_builder = isa::lookup(platform.clone())?; + let mut settings_builder = settings::builder(); + settings_builder.set("is_pic", "true")?; + let isa = isa_builder.finish(settings::Flags::new(settings_builder))?; + + let object_builder = ObjectBuilder::new(isa, "example", default_libcall_names())?; + let mut module = ObjectModule::new(object_builder); + let runtime_functions = RuntimeFunctions::new(&platform, &mut module)?; + + Ok(Backend { + module, + data_ctx: DataContext::new(), + runtime_functions, + defined_strings: HashMap::new(), + defined_symbols: HashMap::new(), + }) + } + + pub fn bytes(self) -> Result<Vec<u8>, object::write::Error> { + self.module.finish().emit() + } +} + +impl<M: Module> Backend<M> { + pub fn define_string(&mut self, s: &str) -> Result<DataId, BackendError> { + let name = format!("<string_constant>{}", s); // prefix keeps the constant's symbol distinct from the variable symbol of the same name + let global_id = self + .module + .declare_data(&name, Linkage::Local, false, false)?; + let mut data_context = DataContext::new(); + data_context.set_align(8); + 
data_context.define(format!("{}\0", s).into_boxed_str().into_boxed_bytes()); // NUL-terminated: the runtime print reads this through CStr::from_ptr + self.module.define_data(global_id, &data_context)?; + self.defined_strings.insert(s.to_owned(), global_id); + Ok(global_id) + } + + pub fn define_variable(&mut self, name: String) -> Result<DataId, BackendError> { + self.data_ctx.define(Box::new(EMPTY_DATUM)); + let id = self + .module + .declare_data(&name, Linkage::Export, true, false)?; + self.module.define_data(id, &self.data_ctx)?; + self.data_ctx.clear(); + self.defined_symbols.insert(name, id); + Ok(id) + } +} diff --git a/src/backend/error.rs b/src/backend/error.rs new file mode 100644 index 0000000..26b7bf0 --- /dev/null +++ b/src/backend/error.rs @@ -0,0 +1,46 @@ +use crate::backend::runtime::RuntimeFunctionError; +use codespan_reporting::diagnostic::Diagnostic; +use cranelift_codegen::{isa::LookupError, settings::SetError, CodegenError}; +use cranelift_module::ModuleError; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum BackendError { + #[error("Cranelift module error: {0}")] + Cranelift(#[from] ModuleError), + #[error("Builtin function error: {0}")] + BuiltinError(#[from] RuntimeFunctionError), + #[error("Internal variable lookup error")] + VariableLookupFailure, + #[error(transparent)] + CodegenError(#[from] CodegenError), + #[error(transparent)] + SetError(#[from] SetError), + #[error(transparent)] + LookupError(#[from] LookupError), +} + +impl From<BackendError> for Diagnostic<usize> { + fn from(value: BackendError) -> Self { + match value { + BackendError::Cranelift(me) => { + Diagnostic::error().with_message(format!("Internal cranelift error: {}", me)) + } + BackendError::BuiltinError(me) => { + Diagnostic::error().with_message(format!("Internal runtime function error: {}", me)) + } + BackendError::VariableLookupFailure => { + Diagnostic::error().with_message("Internal variable lookup error!") + } + BackendError::CodegenError(me) => { + Diagnostic::error().with_message(format!("Internal codegen error: {}", me)) + } + BackendError::SetError(me) => { + 
Diagnostic::error().with_message(format!("Internal backend setup error: {}", me)) + } + BackendError::LookupError(me) => { + Diagnostic::error().with_message(format!("Internal error: {}", me)) + } + } + } +} diff --git a/src/backend/into_crane.rs b/src/backend/into_crane.rs new file mode 100644 index 0000000..8605feb --- /dev/null +++ b/src/backend/into_crane.rs @@ -0,0 +1,174 @@ +use std::collections::HashMap; + +use crate::ir::{Expression, Primitive, Program, Statement, Value, ValueOrRef}; +use cranelift_codegen::entity::EntityRef; +use cranelift_codegen::ir::{ + entities, types, Function, GlobalValue, InstBuilder, MemFlags, Signature, UserFuncName, +}; +use cranelift_codegen::isa::CallConv; +use cranelift_codegen::Context; +use cranelift_frontend::{FunctionBuilder, FunctionBuilderContext, Variable}; +use cranelift_module::{FuncId, Linkage, Module, ModuleError}; +use internment::ArcIntern; + +use crate::backend::error::BackendError; +use crate::backend::Backend; + +type StringTable = HashMap, GlobalValue>; + +impl Backend { + pub fn compile_function( + &mut self, + function_name: &str, + mut program: Program, + ) -> Result { + let basic_signature = Signature { + params: vec![], + returns: vec![], + call_conv: CallConv::SystemV, + }; + + let func_id = + self.module + .declare_function(function_name, Linkage::Export, &basic_signature)?; + let mut ctx = Context::new(); + ctx.func = + Function::with_name_signature(UserFuncName::user(0, func_id.as_u32()), basic_signature); + + let string_table = self.build_string_table(&mut ctx.func, &program)?; + let mut variable_table = HashMap::new(); + let mut next_var_num = 1; + let print_func_ref = self.runtime_functions.include_runtime_function( + "print", + &mut self.module, + &mut ctx.func, + )?; + let pre_defined_symbols: HashMap = self + .defined_symbols + .iter() + .map(|(k, v)| { + let local_data = self.module.declare_data_in_func(*v, &mut ctx.func); + (k.clone(), local_data) + }) + .collect(); + + let mut fctx = 
FunctionBuilderContext::new(); + let mut builder = FunctionBuilder::new(&mut ctx.func, &mut fctx); + let main_block = builder.create_block(); + builder.switch_to_block(main_block); + + for stmt in program.statements.drain(..) { + match stmt { + Statement::Print(ann, var) => { + let local_name_ref = string_table.get(&var).unwrap(); + let name_ptr = builder.ins().symbol_value(types::I64, *local_name_ref); + let val = ValueOrRef::Ref(ann, var).into_cranelift( + &mut builder, + &variable_table, + &pre_defined_symbols, + )?; + builder.ins().call(print_func_ref, &[name_ptr, val]); + } + + Statement::Binding(_, var_name, value) => { + let val = match value { + Expression::Value(_, Value::Number(_, v)) => { + builder.ins().iconst(types::I64, v) + } + + Expression::Reference(loc, name) => { + ValueOrRef::Ref(loc, name) // same lookup as Print: locals first, then pre-defined globals, error instead of panic + .into_cranelift(&mut builder, &variable_table, &pre_defined_symbols)? + } + + Expression::Primitive(_, prim, mut vals) => { + let right = vals.pop().unwrap().into_cranelift( + &mut builder, + &variable_table, + &pre_defined_symbols, + )?; + let left = vals.pop().unwrap().into_cranelift( + &mut builder, + &variable_table, + &pre_defined_symbols, + )?; + + match prim { + Primitive::Plus => builder.ins().iadd(left, right), + Primitive::Minus => builder.ins().isub(left, right), + Primitive::Times => builder.ins().imul(left, right), + Primitive::Divide => builder.ins().sdiv(left, right), + } + } + }; + + if let Some(global_id) = pre_defined_symbols.get(var_name.as_str()) { + let val_ptr = builder.ins().symbol_value(types::I64, *global_id); + builder.ins().store(MemFlags::new(), val, val_ptr, 0); + } else { + let var = Variable::new(next_var_num); + variable_table.insert(var_name, next_var_num); + next_var_num += 1; + builder.declare_var(var, types::I64); + builder.def_var(var, val); + } + } + } + } + + builder.ins().return_(&[]); + builder.seal_block(main_block); + builder.finalize(); + + let _ = self.module.define_function(func_id, &mut ctx)?; + + 
Ok(func_id) + } + + fn build_string_table( + &mut self, + func: &mut Function, + program: &Program, + ) -> Result { + let mut string_table = HashMap::new(); + + for interned_value in program.strings().drain() { + let global_id = match self.defined_strings.get(interned_value.as_str()) { + Some(x) => *x, + None => self.define_string(interned_value.as_str())?, + }; + let local_data = self.module.declare_data_in_func(global_id, func); + string_table.insert(interned_value, local_data); + } + + Ok(string_table) + } +} + +impl ValueOrRef { + fn into_cranelift( + self, + builder: &mut FunctionBuilder, + local_variables: &HashMap, usize>, + global_variables: &HashMap, + ) -> Result { + match self { + ValueOrRef::Value(_, value) => match value { + Value::Number(_base, numval) => Ok(builder.ins().iconst(types::I64, numval)), + }, + + ValueOrRef::Ref(_, name) => { + if let Some(local_num) = local_variables.get(&name) { + return Ok(builder.use_var(Variable::new(*local_num))); + } + + if let Some(global_id) = global_variables.get(name.as_str()) { + let val_ptr = builder.ins().symbol_value(types::I64, *global_id); + return Ok(builder.ins().load(types::I64, MemFlags::new(), val_ptr, 0)); + } + + Err(ModuleError::Undeclared(name.to_string())) + } + } + } +} diff --git a/src/backend/object.rs b/src/backend/object.rs new file mode 100644 index 0000000..d17fd03 --- /dev/null +++ b/src/backend/object.rs @@ -0,0 +1,8 @@ +struct BackendObject { +} + +impl BackendObject { + pub fn new() -> Result { + unimplemented!() + } +} \ No newline at end of file diff --git a/src/backend/runtime.rs b/src/backend/runtime.rs new file mode 100644 index 0000000..ecf0e7b --- /dev/null +++ b/src/backend/runtime.rs @@ -0,0 +1,69 @@ +use cranelift_codegen::ir::{types, AbiParam, FuncRef, Function, Signature}; +use cranelift_codegen::isa::CallConv; +use cranelift_jit::JITBuilder; +use cranelift_module::{FuncId, Linkage, Module, ModuleResult}; +use std::collections::HashMap; +use std::ffi::CStr; +use 
target_lexicon::Triple; +use thiserror::Error; + +pub struct RuntimeFunctions { + builtin_functions: HashMap<String, FuncId>, + _referenced_functions: Vec<String>, // NOTE(review): element type was lost in extraction; String is an assumption, confirm +} + +#[derive(Debug, Error)] +pub enum RuntimeFunctionError { + #[error("Could not find runtime function named '{0}'")] + CannotFindRuntimeFunction(String), +} + +extern "C" fn runtime_print(name: *const std::os::raw::c_char, value: u64) { + let cstr = unsafe { CStr::from_ptr(name) }; // SAFETY: relies on the caller passing a valid, NUL-terminated string pointer + let reconstituted = cstr.to_string_lossy(); + println!("{} = {}", reconstituted, value); +} + +impl RuntimeFunctions { + pub fn new<M: Module>(platform: &Triple, module: &mut M) -> ModuleResult<RuntimeFunctions> { + let mut builtin_functions = HashMap::new(); + let _referenced_functions = Vec::new(); + + let string_param = AbiParam::new(types::I64); + let int64_param = AbiParam::new(types::I64); + + let print_id = module.declare_function( + "print", + Linkage::Import, + &Signature { + params: vec![string_param, int64_param], + returns: vec![], + call_conv: CallConv::triple_default(platform), + }, + )?; + builtin_functions.insert("print".to_string(), print_id); + + Ok(RuntimeFunctions { + builtin_functions, + _referenced_functions, + }) + } + + pub fn include_runtime_function<M: Module>( + &self, + name: &str, + module: &mut M, + func: &mut Function, + ) -> Result<FuncRef, RuntimeFunctionError> { + match self.builtin_functions.get(name) { + None => Err(RuntimeFunctionError::CannotFindRuntimeFunction( + name.to_string(), + )), + Some(func_id) => Ok(module.declare_func_in_func(*func_id, func)), + } + } + + pub fn register_jit_implementations(builder: &mut JITBuilder) { + builder.symbol("print", runtime_print as *const u8); + } +} diff --git a/src/bin/ngrc.rs b/src/bin/ngrc.rs new file mode 100644 index 0000000..23a9021 --- /dev/null +++ b/src/bin/ngrc.rs @@ -0,0 +1,98 @@ +use clap::Parser; +use codespan_reporting::diagnostic::Diagnostic; +use codespan_reporting::files::SimpleFiles; +use codespan_reporting::term; +use codespan_reporting::term::termcolor::{ColorChoice, StandardStream}; +use cranelift_object::object; + +use 
ngr::backend::Backend; +use ngr::backend::BackendError; +use ngr::ir::Program as IR; +use ngr::syntax::{ParserError, Program as Syntax}; +use target_lexicon::Triple; +use thiserror::Error; + +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct CommandLineArguments { + /// Optional output file name + #[clap(short, long)] + output: Option, + + /// The file to parse + file: String, +} + +#[derive(Debug, Error)] +enum MainError { + #[error(transparent)] + Backend(#[from] BackendError), + #[error("Parser error")] + ParserError(#[from] ParserError), + #[error("IO error")] + IoError(#[from] std::io::Error), + #[error("write error")] + WriteError(#[from] object::write::Error), +} + +impl From for Diagnostic { + fn from(value: MainError) -> Self { + match value { + MainError::Backend(be) => be.into(), + MainError::ParserError(pe) => (&pe).into(), + MainError::IoError(e) => Diagnostic::error().with_message(format!("IO error: {}", e)), + MainError::WriteError(e) => { + Diagnostic::error().with_message(format!("Module write error: {}", e)) + } + } + } +} + +fn compile(file_database: &mut SimpleFiles) -> Result<(), MainError> { + let args = CommandLineArguments::parse(); + + let syntax = Syntax::parse_file(file_database, &args.file)?; + let (mut errors, mut warnings) = syntax.validate(); + let stop = !errors.is_empty(); + let messages = errors + .drain(..) 
+ .map(Into::into) + .chain(warnings.drain(..).map(Into::into)); + let writer = StandardStream::stderr(ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + + for message in messages { + term::emit(&mut writer.lock(), &config, file_database, &message).unwrap(); + } + + if stop { + return Ok(()); + } + + let ir = IR::from(syntax.simplify()); + let mut backend = Backend::object_file(Triple::host())?; + backend.compile_function("gogogo", ir)?; + let bytes = backend.bytes()?; + std::fs::write(args.output.unwrap_or_else(|| "output.o".to_string()), bytes)?; + Ok(()) +} + +fn main() { + let mut file_database = SimpleFiles::new(); + + match compile(&mut file_database) { + Ok(()) => {} + Err(e) => { + let writer = StandardStream::stderr(ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + + term::emit( + &mut writer.lock(), + &config, + &file_database, + &Diagnostic::from(e), + ) + .unwrap(); + } + } +} diff --git a/src/bin/ngri.rs b/src/bin/ngri.rs new file mode 100644 index 0000000..772016b --- /dev/null +++ b/src/bin/ngri.rs @@ -0,0 +1,157 @@ +use codespan_reporting::diagnostic::Diagnostic; +use codespan_reporting::files::SimpleFiles; +use codespan_reporting::term::{self, Config}; +use cranelift_jit::JITModule; +use cranelift_module::ModuleError; +use ngr::backend::{Backend, BackendError}; +use ngr::ir::Program as IR; +use ngr::syntax::{Location, ParserError, Statement}; +use pretty::termcolor::{ColorChoice, StandardStream, WriteColor}; +use rustyline::error::ReadlineError; +use rustyline::DefaultEditor; +use std::collections::HashMap; + +pub struct RunLoop<'a> { + file_database: SimpleFiles<&'a str, String>, + jitter: Backend, + variable_binding_sites: HashMap, + gensym_index: usize, + writer: &'a mut dyn WriteColor, + config: Config, +} + +#[allow(clippy::upper_case_acronyms)] +#[derive(Debug, thiserror::Error)] +enum REPLError { + #[error("Error parsing statement: {0}")] + Parser(#[from] ParserError), + 
#[error("JIT error: {0}")] + JIT(#[from] BackendError), + #[error("Internal cranelift error: {0}")] + Cranelift(#[from] ModuleError), + #[error(transparent)] + Reporting(#[from] codespan_reporting::files::Error), +} + +impl From for Diagnostic { + fn from(value: REPLError) -> Self { + match value { + REPLError::Parser(err) => Diagnostic::from(&err), + REPLError::JIT(err) => Diagnostic::from(err), + REPLError::Cranelift(err) => Diagnostic::bug().with_message(format!("{}", err)), + REPLError::Reporting(err) => Diagnostic::bug().with_message(format!("{}", err)), + } + } +} + +impl<'a> RunLoop<'a> { + pub fn new(writer: &'a mut dyn WriteColor, config: Config) -> Result { + Ok(RunLoop { + file_database: SimpleFiles::new(), + jitter: Backend::jit()?, + variable_binding_sites: HashMap::new(), + gensym_index: 1, + writer, + config, + }) + } + + fn emit_diagnostic( + &mut self, + diagnostic: Diagnostic, + ) -> Result<(), codespan_reporting::files::Error> { + term::emit(self.writer, &self.config, &self.file_database, &diagnostic) + } + + fn process_input(&mut self, line_no: usize, command: String) { + if let Err(err) = self.process(line_no, command) { + if let Err(e) = self.emit_diagnostic(Diagnostic::from(err)) { + eprintln!( + "WOAH! System having trouble printing error messages. This is very bad. ({})", + e + ); + } + } + } + + fn process(&mut self, line_no: usize, command: String) -> Result<(), REPLError> { + let entry = self.file_database.add("entry", command); + let source = self + .file_database + .get(entry) + .expect("entry exists") + .source(); + let syntax = Statement::parse(entry, source)?; + + // if this is a variable binding, and we've never defined this variable before, + // we should tell cranelift about it. this is optimistic; if we fail to compile, + // then we won't use this definition until someone tries again. 
+ if let Statement::Binding(_, ref name, _) = syntax { + if !self.variable_binding_sites.contains_key(name.as_str()) { + self.jitter.define_string(name)?; + self.jitter.define_variable(name.clone())?; + } + }; + + let (mut errors, mut warnings) = syntax.validate(&mut self.variable_binding_sites); + let stop = !errors.is_empty(); + let messages = errors + .drain(..) + .map(Into::into) + .chain(warnings.drain(..).map(Into::into)); + + for message in messages { + self.emit_diagnostic(message)?; + } + + if stop { + return Ok(()); + } + + let ir = IR::from(syntax.simplify(&mut self.gensym_index)); + let name = format!("line{}", line_no); + let function_id = self.jitter.compile_function(&name, ir)?; + self.jitter.module.finalize_definitions()?; + let compiled_bytes = self.jitter.bytes(function_id); + let compiled_function = unsafe { std::mem::transmute::<_, extern "C" fn()>(compiled_bytes) }; // SAFETY: finalized entry point of a function compiled with no parameters and no returns; called through a C-ABI pointer rather than the unspecified Rust ABI + compiled_function(); + Ok(()) + } +} + +fn main() -> Result<(), BackendError> { + let mut editor = DefaultEditor::new().expect("rustyline works"); + let mut line_no = 0; + let mut writer = StandardStream::stdout(ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + let mut state = RunLoop::new(&mut writer, config)?; + + println!("No Good Reason, the Interpreter!"); + loop { + line_no += 1; + match editor.readline("> ") { + Ok(command) => match command.trim() { + "" => continue, + ":quit" => break, + _ => state.process_input(line_no, command), + }, + Err(ReadlineError::Io(e)) => { + eprintln!("IO error: {}", e); + break; + } + Err(ReadlineError::Eof) => break, + Err(ReadlineError::Interrupted) => break, + Err(ReadlineError::Errno(e)) => { + eprintln!("Unknown syscall error: {}", e); + break; + } + Err(ReadlineError::WindowResized) => continue, + Err(e) => { + eprintln!("Unknown internal error: {}", e); + break; + } + } + } + + Ok(()) +} diff --git a/src/ir.rs b/src/ir.rs new file mode 100644 index 0000000..9b5157d --- /dev/null +++ b/src/ir.rs @@ -0,0 +1,5 @@ 
+mod ast; +mod from_syntax; +mod strings; + +pub use ast::*; diff --git a/src/ir/ast.rs b/src/ir/ast.rs new file mode 100644 index 0000000..97cda55 --- /dev/null +++ b/src/ir/ast.rs @@ -0,0 +1,172 @@ +use internment::ArcIntern; +use pretty::{DocAllocator, Pretty}; + +use crate::syntax::Location; + +type Variable = ArcIntern; + +pub struct Program { + pub statements: Vec, +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Program +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> pretty::DocBuilder<'a, D, A> { + let mut result = allocator.nil(); + + for stmt in self.statements.iter() { + result = result + .append(stmt.pretty(allocator)) + .append(allocator.text(";")) + .append(allocator.hardline()); + } + + result + } +} + +pub enum Statement { + Binding(Location, Variable, Expression), + Print(Location, Variable), +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Statement +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> pretty::DocBuilder<'a, D, A> { + match self { + Statement::Binding(_, var, expr) => allocator + .text(var.as_ref().to_string()) + .append(allocator.space()) + .append(allocator.text("=")) + .append(allocator.space()) + .append(expr.pretty(allocator)), + Statement::Print(_, var) => allocator + .text("print") + .append(allocator.space()) + .append(allocator.text(var.as_ref().to_string())), + } + } +} + +pub enum Expression { + Value(Location, Value), + Reference(Location, Variable), + Primitive(Location, Primitive, Vec), +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Expression +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> pretty::DocBuilder<'a, D, A> { + match self { + Expression::Value(_, val) => val.pretty(allocator), + Expression::Reference(_, var) => allocator.text(var.as_ref().to_string()), + Expression::Primitive(_, op, exprs) if exprs.len() == 1 => { + op.pretty(allocator).append(exprs[0].pretty(allocator)) + 
} + Expression::Primitive(_, op, exprs) if exprs.len() == 2 => { + let left = exprs[0].pretty(allocator); + let right = exprs[1].pretty(allocator); + + left.append(allocator.space()) + .append(op.pretty(allocator)) + .append(allocator.space()) + .append(right) + .parens() + } + Expression::Primitive(_, op, exprs) => { + allocator.text(format!("!!{:?} with {} arguments!!", op, exprs.len())) + } + } + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Primitive { + Plus, + Minus, + Times, + Divide, +} + +impl<'a> TryFrom<&'a str> for Primitive { + type Error = String; + + fn try_from(value: &str) -> Result { + match value { + "+" => Ok(Primitive::Plus), + "-" => Ok(Primitive::Minus), + "*" => Ok(Primitive::Times), + "/" => Ok(Primitive::Divide), + _ => Err(format!("Illegal primitive {}", value)), + } + } +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Primitive +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> pretty::DocBuilder<'a, D, A> { + match self { + Primitive::Plus => allocator.text("+"), + Primitive::Minus => allocator.text("-"), + Primitive::Times => allocator.text("*"), + Primitive::Divide => allocator.text("/"), + } + } +} + +pub enum ValueOrRef { + Value(Location, Value), + Ref(Location, ArcIntern), +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b ValueOrRef +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> pretty::DocBuilder<'a, D, A> { + match self { + ValueOrRef::Value(_, v) => v.pretty(allocator), + ValueOrRef::Ref(_, v) => allocator.text(v.as_ref().to_string()), + } + } +} + +pub enum Value { + Number(Option, i64), +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Value +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> pretty::DocBuilder<'a, D, A> { + match self { + Value::Number(opt_base, value) => { + let value_str = match opt_base { + None => format!("{}", value), + Some(2) => format!("0b{:b}", value), + 
Some(8) => format!("0o{:o}", value), + Some(10) => format!("0d{}", value), + Some(16) => format!("0x{:x}", value), + Some(_) => format!("!!{:x}!!", value), + }; + + allocator.text(value_str) + } + } + } +} diff --git a/src/ir/from_syntax.rs b/src/ir/from_syntax.rs new file mode 100644 index 0000000..c4d710d --- /dev/null +++ b/src/ir/from_syntax.rs @@ -0,0 +1,73 @@ +use internment::ArcIntern; + +use crate::ir::ast as ir; +use crate::syntax::ast as syntax; + +impl From for ir::Program { + fn from(mut value: syntax::Program) -> Self { + ir::Program { + statements: value.statements.drain(..).map(Into::into).collect(), + } + } +} + +impl From> for ir::Program { + fn from(mut value: Vec) -> Self { + ir::Program { + statements: value.drain(..).map(Into::into).collect(), + } + } +} + +impl From for ir::Statement { + fn from(value: syntax::Statement) -> Self { + match value { + syntax::Statement::Binding(loc, name, expr) => { + ir::Statement::Binding(loc, ArcIntern::from(name), ir::Expression::from(expr)) + } + syntax::Statement::Print(loc, name) => ir::Statement::Print(loc, ArcIntern::from(name)), + } + } +} + +impl From for ir::Expression { + fn from(value: syntax::Expression) -> Self { + match value { + syntax::Expression::Primitive(loc, name, mut exprs) => ir::Expression::Primitive( + loc, + ir::Primitive::try_from(name.as_str()).unwrap(), + exprs.drain(..).map(Into::into).collect(), + ), + syntax::Expression::Reference(loc, name) => { + ir::Expression::Reference(loc, ArcIntern::from(name)) + } + syntax::Expression::Value(loc, value) => { + ir::Expression::Value(loc, ir::Value::from(value)) + } + } + } +} + +impl From for ir::ValueOrRef { + fn from(value: syntax::Expression) -> Self { + match value { + syntax::Expression::Primitive(loc, _, _) => { + panic!("{:?}: couldn't convert to valueorref", loc) + } + + syntax::Expression::Reference(loc, var) => { + ir::ValueOrRef::Ref(loc, ArcIntern::new(var)) + } + + syntax::Expression::Value(loc, val) => 
ir::ValueOrRef::Value(loc, val.into()), + } + } +} + +impl From for ir::Value { + fn from(x: syntax::Value) -> Self { + match x { + syntax::Value::Number(base, value) => ir::Value::Number(base, value), + } + } +} diff --git a/src/ir/strings.rs b/src/ir/strings.rs new file mode 100644 index 0000000..d0e57a2 --- /dev/null +++ b/src/ir/strings.rs @@ -0,0 +1,36 @@ +use super::ast::{Expression, Program, Statement}; +use internment::ArcIntern; +use std::collections::HashSet; + +impl Program { + pub fn strings(&self) -> HashSet> { + let mut result = HashSet::new(); + + for stmt in self.statements.iter() { + stmt.register_strings(&mut result); + } + + result + } +} + +impl Statement { + fn register_strings(&self, string_set: &mut HashSet>) { + match self { + Statement::Binding(_, name, expr) => { + string_set.insert(name.clone()); + expr.register_strings(string_set); + } + + Statement::Print(_, name) => { + string_set.insert(name.clone()); + } + } + } +} + +impl Expression { + fn register_strings(&self, _string_set: &mut HashSet>) { + // nothing has a string in here, at the moment + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..6ed733f --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,3 @@ +pub mod backend; +pub mod ir; +pub mod syntax; diff --git a/src/syntax.rs b/src/syntax.rs new file mode 100644 index 0000000..39e992d --- /dev/null +++ b/src/syntax.rs @@ -0,0 +1,272 @@ +use codespan_reporting::{diagnostic::Diagnostic, files::SimpleFiles}; +use lalrpop_util::lalrpop_mod; +use logos::Logos; + +mod arbitrary; +pub mod ast; +mod location; +mod simplify; +mod tokens; +lalrpop_mod!( + #[allow(clippy::just_underscores_and_digits, clippy::clone_on_copy)] + parser, + "/syntax/parser.rs" +); +mod pretty; +mod validate; + +pub use crate::syntax::ast::*; +pub use crate::syntax::location::Location; +use crate::syntax::parser::ProgramParser; +pub use crate::syntax::tokens::{LexerError, Token}; +#[cfg(test)] +use ::pretty::{Arena, Pretty}; +use 
lalrpop_util::ParseError; +#[cfg(test)] +use proptest::{prop_assert, prop_assert_eq}; +#[cfg(test)] +use std::str::FromStr; +use thiserror::Error; + +use self::parser::StatementParser; + +#[derive(Debug, Error)] +pub enum ParserError { + #[error("Invalid token")] + InvalidToken(Location), + #[error("Unrecognized EOF")] + UnrecognizedEOF(Location, Vec), + #[error("Unrecognized token")] + UnrecognizedToken(Location, Location, Token, Vec), + #[error("Extra token")] + ExtraToken(Location, Token, Location), + #[error("Lexing failure")] + LexFailure(Location), + #[error("File database error")] + FileDatabaseError(#[from] codespan_reporting::files::Error), + #[error("Read error")] + ReadError(#[from] std::io::Error), +} + +impl ParserError { + fn convert(file_idx: usize, err: ParseError) -> Self { + match err { + ParseError::InvalidToken { location } => { + ParserError::InvalidToken(Location::new(file_idx, location)) + } + ParseError::UnrecognizedEOF { location, expected } => { + ParserError::UnrecognizedEOF(Location::new(file_idx, location), expected) + } + ParseError::UnrecognizedToken { + token: (start, token, end), + expected, + } => ParserError::UnrecognizedToken( + Location::new(file_idx, start), + Location::new(file_idx, end), + token, + expected, + ), + ParseError::ExtraToken { + token: (start, token, end), + } => ParserError::ExtraToken( + Location::new(file_idx, start), + token, + Location::new(file_idx, end), + ), + ParseError::User { error } => match error { + LexerError::LexFailure(offset) => { + ParserError::LexFailure(Location::new(file_idx, offset)) + } + }, + } + } +} + +fn display_expected(expected: &[String]) -> String { + match expected.len() { + 0 => "".to_string(), + 1 => format!("; expected {}", expected[0]), + 2 => format!("; expected {} or {}", expected[0], expected[1]), + n => format!( + "; expected {}or {}", + comma_separate(&expected[0..n - 1]), + expected[n - 1] + ), + } +} + +fn comma_separate(strings: &[String]) -> String { + let mut result 
= String::new(); + + for s in strings.iter() { + result.push_str(s); + result.push_str(", "); + } + + result +} + +impl<'a> From<&'a ParserError> for Diagnostic { + fn from(value: &ParserError) -> Self { + match value { + // this was just a token we didn't understand + ParserError::InvalidToken(location) => location + .labelled_error("extremely odd token") + .with_message("encountered extremely confusing token"), + + // unexpected EOF! + ParserError::UnrecognizedEOF(location, expected) => location.error().with_message( + format!("expected enf of file{}", display_expected(expected)), + ), + + // encountered a token where it shouldn't be + ParserError::UnrecognizedToken(start, end, token, expected) => { + let expected_str = + format!("unexpected token {}{}", token, display_expected(expected)); + let unexpected_str = format!("unexpected token {}", token); + let mut labels = start.range_label(end); + + Diagnostic::error() + .with_labels( + labels + .drain(..) + .map(|l| l.with_message(unexpected_str.clone())) + .collect(), + ) + .with_message(expected_str) + } + + // I think we get this when we get a token, but were expected EOF + ParserError::ExtraToken(start, token, end) => { + let expected_str = + format!("unexpected token {} after the expected end of file", token); + let unexpected_str = format!("unexpected token {}", token); + let mut labels = start.range_label(end); + + Diagnostic::error() + .with_labels( + labels + .drain(..) 
+ .map(|l| l.with_message(unexpected_str.clone())) + .collect(), + ) + .with_message(expected_str) + } + + // simple lexer errors + ParserError::LexFailure(location) => { + location.error().with_message("unexpected character") + } + + ParserError::FileDatabaseError(e) => Diagnostic::error().with_message(e.to_string()), + + ParserError::ReadError(e) => Diagnostic::error().with_message(e.to_string()), + } + } +} + +impl Program { + pub fn parse_file( + file_database: &mut SimpleFiles, + file_name: &str, + ) -> Result { + let file_contents = std::fs::read_to_string(file_name)?; + let file_handle = file_database.add(file_name.to_string(), file_contents); + let file_db_info = file_database.get(file_handle)?; + Program::parse(file_handle, file_db_info.source()) + } + + pub fn parse(file_idx: usize, buffer: &str) -> Result { + let lexer = Token::lexer(buffer) + .spanned() + .map(|(token, range)| (range.start, token, range.end)); + ProgramParser::new() + .parse(file_idx, lexer) + .map_err(|e| ParserError::convert(file_idx, e)) + } +} + +impl Statement { + pub fn parse(file_idx: usize, buffer: &str) -> Result { + let lexer = Token::lexer(buffer) + .spanned() + .map(|(token, range)| (range.start, token, range.end)); + StatementParser::new() + .parse(file_idx, lexer) + .map_err(|e| ParserError::convert(file_idx, e)) + } +} + +#[cfg(test)] +impl FromStr for Program { + type Err = ParserError; + + fn from_str(s: &str) -> Result { + Program::parse(0, s) + } +} + +#[test] +fn order_of_operations() { + let muladd1 = "x = 1 + 2 * 3;"; + let testfile = 0; + assert_eq!( + Program::from_str(muladd1).unwrap(), + Program { + statements: vec![Statement::Binding( + Location::new(testfile, 0), + "x".to_string(), + Expression::Primitive( + Location::new(testfile, 6), + "+".to_string(), + vec![ + Expression::Value(Location::new(testfile, 4), Value::Number(None, 1)), + Expression::Primitive( + Location::new(testfile, 10), + "*".to_string(), + vec![ + Expression::Value( + 
Location::new(testfile, 8), + Value::Number(None, 2), + ), + Expression::Value( + Location::new(testfile, 12), + Value::Number(None, 3), + ), + ] + ) + ] + ) + ),], + } + ); +} + +proptest::proptest! { + #[test] + fn random_render_parses_equal(program: Program) { + let mut file_database = SimpleFiles::new(); + let writer = ::pretty::termcolor::StandardStream::stderr(::pretty::termcolor::ColorChoice::Auto); + let config = codespan_reporting::term::Config::default(); + let allocator = Arena::<()>::new(); + + let mut out_vector = vec![]; + prop_assert!(program.pretty(&allocator).render(80, &mut out_vector).is_ok()); + let string = std::str::from_utf8(&out_vector).expect("emitted valid string"); + let file_handle = file_database.add("test", string); + let file_db_info = file_database.get(file_handle).expect("find thing just inserted"); + let parsed = Program::parse(file_handle, file_db_info.source()); + + if let Err(e) = &parsed { + eprintln!("failed to parse:\n{}", string); + codespan_reporting::term::emit(&mut writer.lock(), &config, &file_database, &e.into()).unwrap(); + } + prop_assert_eq!(program, parsed.unwrap()); + } + + #[test] + fn random_syntaxes_validate(program: Program) { + let (errors, _) = program.validate(); + prop_assert!(errors.is_empty()); + } +} diff --git a/src/syntax/arbitrary.rs b/src/syntax/arbitrary.rs new file mode 100644 index 0000000..52f43ab --- /dev/null +++ b/src/syntax/arbitrary.rs @@ -0,0 +1,159 @@ +use std::collections::HashSet; + +use crate::syntax::ast::{Expression, Program, Statement, Value}; +use crate::syntax::location::Location; +use proptest::sample::select; +use proptest::{ + prelude::{Arbitrary, BoxedStrategy, Strategy}, + strategy::{Just, Union}, +}; + +const VALID_VARIABLE_NAMES: &str = r"[a-z][a-zA-Z0-9_]*"; + +#[derive(Debug)] +struct Name(String); + +impl Arbitrary for Name { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { + 
VALID_VARIABLE_NAMES.prop_map(Name).boxed() + } +} + +impl Arbitrary for Program { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { + let optionals = Vec::>::arbitrary(); + + optionals + .prop_flat_map(|mut possible_names| { + let mut statements = Vec::new(); + let mut defined_variables: HashSet = HashSet::new(); + + for possible_name in possible_names.drain(..) { + match possible_name { + None if defined_variables.is_empty() => continue, + None => statements.push( + Union::new(defined_variables.iter().map(|name| { + Just(Statement::Print(Location::manufactured(), name.to_string())) + })) + .boxed(), + ), + Some(new_name) => { + let closures_name = new_name.0.clone(); + let retval = + Expression::arbitrary_with(Some(defined_variables.clone())) + .prop_map(move |exp| { + Statement::Binding( + Location::manufactured(), + closures_name.clone(), + exp, + ) + }) + .boxed(); + + defined_variables.insert(new_name.0); + statements.push(retval); + } + } + } + + statements + }) + .prop_map(|statements| Program { statements }) + .boxed() + } +} + +impl Arbitrary for Statement { + type Parameters = Option>; + type Strategy = BoxedStrategy; + + fn arbitrary_with(args: Self::Parameters) -> Self::Strategy { + let duplicated_args = args.clone(); + let defined_variables = args.unwrap_or_default(); + + let binding_strategy = ( + VALID_VARIABLE_NAMES, + Expression::arbitrary_with(duplicated_args), + ) + .prop_map(|(name, exp)| Statement::Binding(Location::manufactured(), name, exp)) + .boxed(); + + if defined_variables.is_empty() { + binding_strategy + } else { + let print_strategy = Union::new( + defined_variables + .iter() + .map(|x| Just(Statement::Print(Location::manufactured(), x.to_string()))), + ) + .boxed(); + + Union::new([binding_strategy, print_strategy]).boxed() + } + } +} + +impl Arbitrary for Expression { + type Parameters = Option>; + type Strategy = BoxedStrategy; + + fn arbitrary_with(args: 
Self::Parameters) -> Self::Strategy { + let defined_variables = args.unwrap_or_default(); + + let value_strategy = Value::arbitrary() + .prop_map(move |x| Expression::Value(Location::manufactured(), x)) + .boxed(); + + let leaf_strategy = if defined_variables.is_empty() { + value_strategy + } else { + let reference_strategy = Union::new(defined_variables.iter().map(|x| { + Just(Expression::Reference( + Location::manufactured(), + x.to_owned(), + )) + })) + .boxed(); + Union::new([value_strategy, reference_strategy]).boxed() + }; + + leaf_strategy + .prop_recursive(3, 64, 2, move |inner| { + ( + select(super::BINARY_OPERATORS), + proptest::collection::vec(inner, 2), + ) + .prop_map(move |(operator, exprs)| { + Expression::Primitive(Location::manufactured(), operator.to_string(), exprs) + }) + }) + .boxed() + } +} + +impl Arbitrary for Value { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { + let base_strategy = Union::new([ + Just(None::), + Just(Some(2)), + Just(Some(8)), + Just(Some(10)), + Just(Some(16)), + ]); + + let value_strategy = i64::arbitrary(); + + (base_strategy, value_strategy) + .prop_map(move |(base, value)| Value::Number(base, value)) + .boxed() + } +} diff --git a/src/syntax/ast.rs b/src/syntax/ast.rs new file mode 100644 index 0000000..ad28025 --- /dev/null +++ b/src/syntax/ast.rs @@ -0,0 +1,60 @@ +use crate::syntax::Location; + +pub static BINARY_OPERATORS: &[&str] = &["+", "-", "*", "/"]; + +#[derive(Clone, Debug, PartialEq)] +pub struct Program { + pub statements: Vec, +} + +#[derive(Clone, Debug)] +pub enum Statement { + Binding(Location, String, Expression), + Print(Location, String), +} + +impl PartialEq for Statement { + fn eq(&self, other: &Self) -> bool { + match self { + Statement::Binding(_, name1, expr1) => match other { + Statement::Binding(_, name2, expr2) => name1 == name2 && expr1 == expr2, + _ => false, + }, + Statement::Print(_, name1) => match other { + 
Statement::Print(_, name2) => name1 == name2, + _ => false, + }, + } + } +} + +#[derive(Clone, Debug)] +pub enum Expression { + Value(Location, Value), + Reference(Location, String), + Primitive(Location, String, Vec), +} + +impl PartialEq for Expression { + fn eq(&self, other: &Self) -> bool { + match self { + Expression::Value(_, val1) => match other { + Expression::Value(_, val2) => val1 == val2, + _ => false, + }, + Expression::Reference(_, var1) => match other { + Expression::Reference(_, var2) => var1 == var2, + _ => false, + }, + Expression::Primitive(_, prim1, args1) => match other { + Expression::Primitive(_, prim2, args2) => prim1 == prim2 && args1 == args2, + _ => false, + }, + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Value { + Number(Option, i64), +} diff --git a/src/syntax/location.rs b/src/syntax/location.rs new file mode 100644 index 0000000..65e1402 --- /dev/null +++ b/src/syntax/location.rs @@ -0,0 +1,56 @@ +use codespan_reporting::diagnostic::{Diagnostic, Label}; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Location { + file_idx: usize, + offset: usize, +} + +impl Location { + pub fn new(file_idx: usize, offset: usize) -> Self { + Location { file_idx, offset } + } + + pub fn manufactured() -> Self { + Location { + file_idx: 0, + offset: 0, + } + } + + pub fn primary_label(&self) -> Label { + Label::primary(self.file_idx, self.offset..self.offset) + } + + pub fn secondary_label(&self) -> Label { + Label::secondary(self.file_idx, self.offset..self.offset) + } + + pub fn range_label(&self, end: &Location) -> Vec> { + if self.file_idx == end.file_idx { + vec![Label::primary(self.file_idx, self.offset..end.offset)] + } else if self.file_idx == 0 { + // if this is a manufactured item, then ... 
just try the other one + vec![Label::primary(end.file_idx, end.offset..end.offset)] + } else { + // we'll just pick the first location if this is in two different + // files + vec![Label::primary(self.file_idx, self.offset..self.offset)] + } + } + + pub fn error(&self) -> Diagnostic { + Diagnostic::error().with_labels(vec![Label::primary( + self.file_idx, + self.offset..self.offset, + )]) + } + + pub fn labelled_error(&self, msg: &str) -> Diagnostic { + Diagnostic::error().with_labels(vec![Label::primary( + self.file_idx, + self.offset..self.offset, + ) + .with_message(msg)]) + } +} diff --git a/src/syntax/parser.lalrpop b/src/syntax/parser.lalrpop new file mode 100644 index 0000000..85b21b3 --- /dev/null +++ b/src/syntax/parser.lalrpop @@ -0,0 +1,78 @@ +use crate::syntax::{LexerError, Location}; +use crate::syntax::ast::{Program,Statement,Expression,Value}; +use crate::syntax::tokens::Token; +use internment::ArcIntern; + +grammar(file_idx: usize); + +extern { + type Location = usize; + type Error = LexerError; + + enum Token { + "=" => Token::Equals, + ";" => Token::Semi, + "(" => Token::LeftParen, + ")" => Token::RightParen, + + "print" => Token::Print, + + "+" => Token::Operator('+'), + "-" => Token::Operator('-'), + "*" => Token::Operator('*'), + "/" => Token::Operator('/'), + + "" => Token::Number((>,)), + "" => Token::Variable(>), + } +} + +pub Program: Program = { + => Program { + statements: stmts + } +} + +Statements: Vec = { + => { + stmts.push(stmt); + stmts + }, + => { + Vec::new() + } +} + +pub Statement: Statement = { + "> "=" ";" => Statement::Binding(Location::new(file_idx, l), v.to_string(), e), + "print" "> ";" => Statement::Print(Location::new(file_idx, l), v.to_string()), +} + +Expression: Expression = { + AdditiveExpression, +} + +AdditiveExpression: Expression = { + "+" => Expression::Primitive(Location::new(file_idx, l), "+".to_string(), vec![e1, e2]), + "-" => Expression::Primitive(Location::new(file_idx, l), "-".to_string(), vec![e1, e2]), 
+ MultiplicativeExpression, +} + +MultiplicativeExpression: Expression = { + "*" => Expression::Primitive(Location::new(file_idx, l), "*".to_string(), vec![e1, e2]), + "/" => Expression::Primitive(Location::new(file_idx, l), "/".to_string(), vec![e1, e2]), + AtomicExpression, +} + +AtomicExpression: Expression = { + "> => Expression::Reference(Location::new(file_idx, l), v.to_string()), + "> => { + let val = Value::Number(n.0, n.1); + Expression::Value(Location::new(file_idx, l), val) + }, + "-" "> => { + let val = Value::Number(n.0, -n.1); + Expression::Value(Location::new(file_idx, l), val) + }, + "(" ")" => e, +} \ No newline at end of file diff --git a/src/syntax/pretty.rs b/src/syntax/pretty.rs new file mode 100644 index 0000000..46a59fb --- /dev/null +++ b/src/syntax/pretty.rs @@ -0,0 +1,115 @@ +use crate::syntax::ast::{Expression, Program, Statement, Value, BINARY_OPERATORS}; +use pretty::{DocAllocator, DocBuilder, Pretty}; + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Program +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> DocBuilder<'a, D, A> { + let mut result = allocator.nil(); + + for stmt in self.statements.iter() { + result = result + .append(stmt.pretty(allocator)) + .append(allocator.text(";")) + .append(allocator.hardline()); + } + + result + } +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Statement +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> DocBuilder<'a, D, A> { + match self { + Statement::Binding(_, var, expr) => allocator + .text(var.to_string()) + .append(allocator.space()) + .append(allocator.text("=")) + .append(allocator.space()) + .append(expr.pretty(allocator)), + Statement::Print(_, var) => allocator + .text("print") + .append(allocator.space()) + .append(allocator.text(var.to_string())), + } + } +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Expression +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: 
&'a D) -> DocBuilder<'a, D, A> { + match self { + Expression::Value(_, val) => val.pretty(allocator), + Expression::Reference(_, var) => allocator.text(var.to_string()), + Expression::Primitive(_, op, exprs) if BINARY_OPERATORS.contains(&op.as_ref()) => { + assert_eq!( + exprs.len(), + 2, + "Found binary operator with {} components?", + exprs.len() + ); + + let left = exprs[0].pretty(allocator); + let right = exprs[1].pretty(allocator); + + left.append(allocator.space()) + .append(allocator.text(op.to_string())) + .append(allocator.space()) + .append(right) + .parens() + } + Expression::Primitive(_, op, exprs) => { + let call = allocator.text(op.to_string()); + let args = exprs.iter().map(|x| x.pretty(allocator)); + let comma_sepped_args = allocator.intersperse(args, CommaSep {}); + call.append(comma_sepped_args.parens()) + } + } + } +} + +impl<'a, 'b, D, A> Pretty<'a, D, A> for &'b Value +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> DocBuilder<'a, D, A> { + match self { + Value::Number(opt_base, value) => { + let sign = if *value < 0 { "-" } else { "" }; + let value_str = match opt_base { + None => format!("{}", value), + Some(2) => format!("{}0b{:b}", sign, value.abs()), + Some(8) => format!("{}0o{:o}", sign, value.abs()), + Some(10) => format!("{}0d{}", sign, value.abs()), + Some(16) => format!("{}0x{:x}", sign, value.abs()), + Some(_) => format!("!!{}{:x}!!", sign, value.abs()), + }; + + allocator.text(value_str) + } + } + } +} + +#[derive(Clone, Copy)] +struct CommaSep {} + +impl<'a, D, A> Pretty<'a, D, A> for CommaSep +where + A: 'a, + D: ?Sized + DocAllocator<'a, A>, +{ + fn pretty(self, allocator: &'a D) -> DocBuilder<'a, D, A> { + allocator.text(",").append(allocator.space()) + } +} diff --git a/src/syntax/simplify.rs b/src/syntax/simplify.rs new file mode 100644 index 0000000..28ad377 --- /dev/null +++ b/src/syntax/simplify.rs @@ -0,0 +1,63 @@ +use crate::syntax::ast::{Expression, Program, Statement}; + +impl 
Program { + pub fn simplify(mut self) -> Self { + let mut new_statements = Vec::new(); + let mut gensym_index = 1; + + for stmt in self.statements.drain(..) { + new_statements.append(&mut stmt.simplify(&mut gensym_index)); + } + + self.statements = new_statements; + self + } +} + +impl Statement { + pub fn simplify(self, gensym_index: &mut usize) -> Vec { + let mut new_statements = vec![]; + + match self { + Statement::Print(_, _) => new_statements.push(self), + Statement::Binding(_, _, Expression::Reference(_, _)) => new_statements.push(self), + Statement::Binding(_, _, Expression::Value(_, _)) => new_statements.push(self), + Statement::Binding(loc, name, value) => { + let (mut prereqs, new_value) = value.rebind(&name, gensym_index); + new_statements.append(&mut prereqs); + new_statements.push(Statement::Binding(loc, name, new_value)) + } + } + + new_statements + } +} + +impl Expression { + fn rebind(self, base_name: &str, gensym_index: &mut usize) -> (Vec, Expression) { + match self { + Expression::Value(_, _) => (vec![], self), + Expression::Reference(_, _) => (vec![], self), + Expression::Primitive(loc, prim, mut expressions) => { + let mut prereqs = Vec::new(); + let mut new_exprs = Vec::new(); + + for expr in expressions.drain(..) 
{ + let (mut cur_prereqs, arg) = expr.rebind(base_name, gensym_index); + prereqs.append(&mut cur_prereqs); + new_exprs.push(arg); + } + + let new_name = format!("<{}:{}>", base_name, *gensym_index); + *gensym_index += 1; + prereqs.push(Statement::Binding( + loc.clone(), + new_name.clone(), + Expression::Primitive(loc.clone(), prim, new_exprs), + )); + + (prereqs, Expression::Reference(loc, new_name)) + } + } + } +} diff --git a/src/syntax/token_stream.rs b/src/syntax/token_stream.rs new file mode 100644 index 0000000..8743bb3 --- /dev/null +++ b/src/syntax/token_stream.rs @@ -0,0 +1,117 @@ +use crate::syntax::tokens::Token; +use logos::{Logos, SpannedIter}; +use std::fmt; +use thiserror::Error; + +pub struct TokenStream<'s> { + file_idx: usize, + lexer: SpannedIter<'s, Token>, +} + +impl<'s> TokenStream<'s> { + pub fn new(file_idx: usize, s: &'s str) -> TokenStream<'s> { + TokenStream { + file_idx, + lexer: Token::lexer(s).spanned(), + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Location { + InFile(usize, usize), + Manufactured, +} + +impl fmt::Display for Location { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Location::InFile(s, off) => write!(f, "{}:{}", s, off), + Location::Manufactured => write!(f, ""), + } + } +} + +impl Location { + fn new(file_idx: usize, offset: usize) -> Location { + Location::InFile(file_idx, offset) + } +} + +impl Default for Location { + fn default() -> Self { + Location::Manufactured + } +} + +type LocatedToken = Result<(Location, Token, Location), LexerError>; + +impl<'s> Iterator for TokenStream<'s> { + type Item = LocatedToken; + + fn next(&mut self) -> Option { + match self.lexer.next() { + None => None, + Some((Token::Error, span)) => Some(Err(LexerError::new(self.file_idx, span.start))), + Some((token, span)) => { + let start = Location::new(self.file_idx, span.start); + let end = Location::new(self.file_idx, span.end); + Some(Ok((start, token, end))) + } + } + } +} + +#[test] +fn 
stream_works() { + let fidx = 42; + let mut lex0 = TokenStream::new(42, "y = x + 1//foo"); + assert_eq!( + lex0.next(), + Some(Ok(( + Location::new(fidx, 0), + Token::var("y"), + Location::new(fidx, 1) + ))) + ); + assert_eq!( + lex0.next(), + Some(Ok(( + Location::new(fidx, 2), + Token::Equals, + Location::new(fidx, 3) + ))) + ); + assert_eq!( + lex0.next(), + Some(Ok(( + Location::new(fidx, 4), + Token::var("x"), + Location::new(fidx, 5) + ))) + ); + assert_eq!( + lex0.next(), + Some(Ok(( + Location::new(fidx, 6), + Token::Operator('+'), + Location::new(fidx, 7) + ))) + ); + assert_eq!( + lex0.next(), + Some(Ok(( + Location::new(fidx, 8), + Token::Number((None, 1)), + Location::new(fidx, 9) + ))) + ); + assert_eq!(lex0.next(), None); +} + +#[test] +fn errors_work() { + let fidx = 2; + let mut lex0 = TokenStream::new(2, "\u{2639}"); + assert_eq!(lex0.next(), Some(Err(LexerError::new(fidx, 0)))); +} diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs new file mode 100644 index 0000000..78d0c8a --- /dev/null +++ b/src/syntax/tokens.rs @@ -0,0 +1,123 @@ +use internment::ArcIntern; +use logos::{Lexer, Logos}; +use std::fmt; +use std::num::ParseIntError; +use thiserror::Error; + +#[derive(Logos, Clone, Debug, PartialEq, Eq)] +pub enum Token { + #[token("=")] + Equals, + + #[token(";")] + Semi, + + #[token("(")] + LeftParen, + + #[token(")")] + RightParen, + + #[token("print")] + Print, + + #[regex(r"[+\-*/]", |v| v.slice().chars().next())] + Operator(char), + + #[regex(r"0b[01]+", |v| parse_number(Some(2), v))] + #[regex(r"0o[0-7]+", |v| parse_number(Some(8), v))] + #[regex(r"0d[0-9]+", |v| parse_number(Some(10), v))] + #[regex(r"0x[0-9a-fA-F]+", |v| parse_number(Some(16), v))] + #[regex(r"[0-9]+", |v| parse_number(None, v))] + Number((Option, i64)), + + #[regex(r"[a-z][a-zA-Z0-9_]*", |v| ArcIntern::new(v.slice().to_string()))] + Variable(ArcIntern), + + #[error] + #[regex(r"[ \t\r\n\f]+", logos::skip)] + #[regex(r"//.*", logos::skip)] + Error, +} + +impl 
fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Token::Equals => write!(f, "'='"), + Token::Semi => write!(f, "';'"), + Token::LeftParen => write!(f, "'('"), + Token::RightParen => write!(f, "')'"), + Token::Print => write!(f, "'print'"), + Token::Operator(c) => write!(f, "'{}'", c), + Token::Number((None, v)) => write!(f, "'{}'", v), + Token::Number((Some(2), v)) => write!(f, "'0b{:b}'", v), + Token::Number((Some(8), v)) => write!(f, "'0o{:o}'", v), + Token::Number((Some(10), v)) => write!(f, "'{}'", v), + Token::Number((Some(16), v)) => write!(f, "'0x{:x}'", v), + Token::Number((Some(b), v)) => { + write!(f, "Invalidly-based-number", b, v) + } + Token::Variable(s) => write!(f, "'{}'", s), + Token::Error => write!(f, ""), + } + } +} + +#[derive(Debug, Error, PartialEq, Eq)] +pub enum LexerError { + #[error("Failed lexing at {0}")] + LexFailure(usize), +} + +#[cfg(test)] +impl Token { + pub(crate) fn var(s: &str) -> Token { + Token::Variable(ArcIntern::new(s.to_string())) + } +} + +fn parse_number( + base: Option, + value: &Lexer, +) -> Result<(Option, i64), ParseIntError> { + let (radix, strval) = match base { + None => (10, value.slice()), + Some(radix) => (radix, &value.slice()[2..]), + }; + + let intval = i64::from_str_radix(strval, radix as u32)?; + Ok((base, intval)) +} + +#[test] +fn lex_numbers() { + let mut lex0 = Token::lexer("12 0b1100 0o14 0d12 0xc // 9"); + assert_eq!(lex0.next(), Some(Token::Number((None, 12)))); + assert_eq!(lex0.next(), Some(Token::Number((Some(2), 12)))); + assert_eq!(lex0.next(), Some(Token::Number((Some(8), 12)))); + assert_eq!(lex0.next(), Some(Token::Number((Some(10), 12)))); + assert_eq!(lex0.next(), Some(Token::Number((Some(16), 12)))); + assert_eq!(lex0.next(), None); +} + +#[test] +fn lex_symbols() { + let mut lex0 = Token::lexer("x + \t y * \n z // rest"); + assert_eq!(lex0.next(), Some(Token::var("x"))); + assert_eq!(lex0.next(), Some(Token::Operator('+'))); + 
assert_eq!(lex0.next(), Some(Token::var("y"))); + assert_eq!(lex0.next(), Some(Token::Operator('*'))); + assert_eq!(lex0.next(), Some(Token::var("z"))); + assert_eq!(lex0.next(), None); +} + +#[test] +fn lexer_spans() { + let mut lex0 = Token::lexer("y = x + 1//foo").spanned(); + assert_eq!(lex0.next(), Some((Token::var("y"), 0..1))); + assert_eq!(lex0.next(), Some((Token::Equals, 2..3))); + assert_eq!(lex0.next(), Some((Token::var("x"), 4..5))); + assert_eq!(lex0.next(), Some((Token::Operator('+'), 6..7))); + assert_eq!(lex0.next(), Some((Token::Number((None, 1)), 8..9))); + assert_eq!(lex0.next(), None); +} diff --git a/src/syntax/validate.rs b/src/syntax/validate.rs new file mode 100644 index 0000000..da2410c --- /dev/null +++ b/src/syntax/validate.rs @@ -0,0 +1,116 @@ +use crate::syntax::{Expression, Location, Program, Statement}; +use codespan_reporting::diagnostic::Diagnostic; +use std::collections::HashMap; + +pub enum Error { + UnboundVariable(Location, String), +} + +impl From for Diagnostic { + fn from(x: Error) -> Self { + match &x { + Error::UnboundVariable(location, name) => location + .labelled_error("unbound here") + .with_message(format!("Unbound variable '{}'", name)), + } + } +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Warning { + ShadowedVariable(Location, Location, String), +} + +impl From for Diagnostic { + fn from(x: Warning) -> Self { + match &x { + Warning::ShadowedVariable(original, new, name) => Diagnostic::warning() + .with_labels(vec![ + new.primary_label().with_message("variable rebound here"), + original + .secondary_label() + .with_message("original binding site"), + ]) + .with_message(format!("Variable '{}' is rebound", name)), + } + } +} + +impl Program { + pub fn validate(&self) -> (Vec, Vec) { + let mut errors = vec![]; + let mut warnings = vec![]; + let mut bound_variables = HashMap::new(); + + for stmt in self.statements.iter() { + let (mut new_errors, mut new_warnings) = stmt.validate(&mut bound_variables); + 
errors.append(&mut new_errors); + warnings.append(&mut new_warnings); + } + + (errors, warnings) + } +} + +impl Statement { + pub fn validate( + &self, + bound_variables: &mut HashMap, + ) -> (Vec, Vec) { + let mut errors = vec![]; + let mut warnings = vec![]; + + match self { + Statement::Binding(loc, var, val) => { + // we're going to make the decision that a variable is not bound in the right + // hand side of its binding, which makes a lot of things easier. So we'll just + // immediately check the expression, and go from there. + let (mut exp_errors, mut exp_warnings) = val.validate(bound_variables); + + errors.append(&mut exp_errors); + warnings.append(&mut exp_warnings); + if let Some(original_binding_site) = bound_variables.get(var) { + warnings.push(Warning::ShadowedVariable( + original_binding_site.clone(), + loc.clone(), + var.clone(), + )); + } else { + bound_variables.insert(var.clone(), loc.clone()); + } + } + + Statement::Print(_, var) if bound_variables.contains_key(var) => {} + Statement::Print(loc, var) => { + errors.push(Error::UnboundVariable(loc.clone(), var.clone())) + } + } + + (errors, warnings) + } +} + +impl Expression { + fn validate(&self, variable_map: &HashMap) -> (Vec, Vec) { + match self { + Expression::Value(_, _) => (vec![], vec![]), + Expression::Reference(_, var) if variable_map.contains_key(var) => (vec![], vec![]), + Expression::Reference(loc, var) => ( + vec![Error::UnboundVariable(loc.clone(), var.clone())], + vec![], + ), + Expression::Primitive(_, _, args) => { + let mut errors = vec![]; + let mut warnings = vec![]; + + for expr in args.iter() { + let (mut err, mut warn) = expr.validate(variable_map); + errors.append(&mut err); + warnings.append(&mut warn); + } + + (errors, warnings) + } + } + } +}