Some basic parsing, with interned strings.

This commit is contained in:
2020-08-02 17:58:51 -07:00
parent 81f98cc2c9
commit 2881c5104a
9 changed files with 315 additions and 22 deletions

View File

@@ -13,8 +13,9 @@ name = "ngrc"
path = "src/bin.rs" path = "src/bin.rs"
[dependencies] [dependencies]
logos = "0.11.4"
lalrpop-util = "0.19.0" lalrpop-util = "0.19.0"
lazy_static = "1.4.0"
logos = "0.11.4"
[build-dependencies] [build-dependencies]
lalrpop = "0.19.0" lalrpop = "0.19.0"

View File

@@ -1 +1,2 @@
pub mod syntax; pub mod syntax;
pub mod util;

View File

@@ -1,5 +1,40 @@
//use lalrpop_util::lalrpop_mod; use lalrpop_util::lalrpop_mod;
pub mod tokens; mod tokens;
//lalrpop_mod!(pub parser); mod token_stream;
pub mod ast; lalrpop_mod!(parser, "/syntax/parser.rs");
mod ast;
pub use crate::syntax::ast::*;
use crate::syntax::parser::ProgramParser;
use crate::syntax::tokens::Token;
use crate::syntax::token_stream::{LexerError, Location, TokenStream};
use lalrpop_util::ParseError;
use std::fs;
use std::io;
/// Errors that can occur while loading a program from disk:
/// either the source file could not be read (`IOError`), or its
/// contents failed to lex/parse (`ParseError`).
// Debug is required so callers can `unwrap()`/`expect()` or return
// this from a `Result`-bearing `main`; all wrapped types are Debug.
#[derive(Debug)]
pub enum ParserError {
    IOError(io::Error),
    ParseError(ParseError<Location, Token, LexerError>),
}
impl From<io::Error> for ParserError {
fn from(x: io::Error) -> Self {
ParserError::IOError(x)
}
}
impl From<ParseError<Location,Token,LexerError>> for ParserError {
fn from(x: ParseError<Location, Token, LexerError>) -> Self {
ParserError::ParseError(x)
}
}
impl Program {
    /// Parse the named file into a `Program`.
    ///
    /// The source buffer is pre-sized from the file's metadata so the
    /// whole file is read without reallocation, then handed to the
    /// lexer and parser. Fails with `ParserError::IOError` on read
    /// problems and `ParserError::ParseError` on bad syntax.
    pub fn from_file(filename: &str) -> Result<Program, ParserError> {
        let size = fs::metadata(filename)?.len() as usize;
        let mut source = String::with_capacity(size);
        let tokens = TokenStream::from_file(filename, &mut source)?;
        let program = ProgramParser::new().parse(tokens)?;
        Ok(program)
    }
}

View File

@@ -1,12 +1,19 @@
pub enum Stmt { use crate::syntax::token_stream::Location;
Binding(String, Expr),
Expr(Expr), pub struct Program {
pub statements: Vec<Statement>,
pub result: Expression,
} }
pub enum Expr { pub enum Statement {
Value(Value), Binding(Location, String, Expression),
Reference(String), Expr(Location, Expression),
Primitive(String, Vec<Expr>), }
pub enum Expression {
Value(Location, Value),
Reference(Location, String),
Primitive(Location, String, Vec<Expression>),
} }
pub enum Value { pub enum Value {

70
src/syntax/parser.lalrpop Normal file
View File

@@ -0,0 +1,70 @@
// Grammar for the ngrc surface language: a sequence of statements
// followed by a trailing result expression.
use crate::syntax::ast::{Program,Statement,Expression,Value};
use crate::syntax::tokens::Token;
use crate::syntax::token_stream::{LexerError, Location};
use crate::util::istring::InternedString;

grammar;

// Tokens come from the external logos-based lexer (TokenStream),
// which yields (Location, Token, Location) triples or LexerError.
extern {
    type Location = Location;
    type Error = LexerError;

    enum Token {
        "=" => Token::Equals,
        ";" => Token::Semi,
        "+" => Token::Operator('+'),
        "-" => Token::Operator('-'),
        "*" => Token::Operator('*'),
        "/" => Token::Operator('/'),
        "<num>" => Token::Number((<Option<u8>>,<i128>)),
        "<var>" => Token::Variable(<InternedString>),
    }
}

pub Program: Program = {
    <stmts:Statements> <result:Expression> => Program {
        statements: stmts,
        result
    }
}

// Zero or more statements, accumulated left-recursively into a Vec.
Statements: Vec<Statement> = {
    <mut stmts:Statements> <stmt:Statement> => {
        stmts.push(stmt);
        stmts
    },
    => {
        Vec::new()
    }
}

Statement: Statement = {
    <l:@L> <v:"<var>"> "=" <e:Expression> ";" => Statement::Binding(l, v.to_string(), e),
    <l:@L> <e:Expression> ";" => Statement::Expr(l, e),
}

// Expressions use conventional arithmetic precedence: "*" and "/"
// bind tighter than "+" and "-", and all operators associate left.
// (Previously the tiers were inverted: additive sat *below*
// multiplicative, making "+"/"-" bind tighter than "*"/"/".)
Expression: Expression = {
    AdditiveExpression
}

AdditiveExpression: Expression = {
    // BUG FIX: these productions previously emitted the "*" and "/"
    // primitives for "+" and "-" (copy-paste error).
    <l:@L> <e1:AdditiveExpression> "+" <e2:MultiplicativeExpression> => Expression::Primitive(l, "+".to_string(), vec![e1, e2]),
    <l:@L> <e1:AdditiveExpression> "-" <e2:MultiplicativeExpression> => Expression::Primitive(l, "-".to_string(), vec![e1, e2]),
    MultiplicativeExpression,
}

MultiplicativeExpression: Expression = {
    <l:@L> <e1:MultiplicativeExpression> "*" <e2:AtomicExpression> => Expression::Primitive(l, "*".to_string(), vec![e1, e2]),
    <l:@L> <e1:MultiplicativeExpression> "/" <e2:AtomicExpression> => Expression::Primitive(l, "/".to_string(), vec![e1, e2]),
    AtomicExpression,
}

AtomicExpression: Expression = {
    <l:@L> <v:"<var>"> => Expression::Reference(l, v.to_string()),
    <l:@L> <n:"<num>"> => {
        let val = Value::Number(n.0, n.1);
        Expression::Value(l, val)
    }
}

View File

@@ -0,0 +1,99 @@
use crate::syntax::tokens::Token;
use crate::util::istring::InternedString;
use logos::{Logos,SpannedIter};
use std::fs::File;
use std::io;
use std::io::Read;
/// Iterator adapter over the logos lexer that attaches per-file
/// byte-offset locations to each token, in the triple form the
/// lalrpop external-lexer interface expects.
pub struct TokenStream<'s> {
    // Interned name of the source file, copied into every Location.
    filename: InternedString,
    // Underlying logos lexer, yielding (Token, byte span) pairs.
    lexer: SpannedIter<'s, Token>,
}
impl<'s> TokenStream<'s> {
    /// Build a token stream over an in-memory source string, tagging
    /// every token's location with `filename`.
    pub fn new(filename: &str, s: &'s str) -> TokenStream<'s> {
        let filename = InternedString::new(filename);
        let lexer = Token::lexer(s).spanned();
        TokenStream { filename, lexer }
    }

    /// Read `filename` into the caller-provided `buffer`, then lex it.
    /// The buffer lives with the caller so the stream can borrow from
    /// it for the `'s` lifetime.
    pub fn from_file(filename: &str, buffer: &'s mut String) -> io::Result<TokenStream<'s>> {
        File::open(filename)?.read_to_string(buffer)?;
        Ok(TokenStream::new(filename, buffer))
    }
}
/// A source position: either a byte offset in a named (interned)
/// file, or a synthetic position for values with no source — the
/// `Default` impl below produces `Manufactured`.
#[derive(Clone,Debug,PartialEq)]
pub enum Location {
    InFile(InternedString, usize),
    Manufactured
}
impl Location {
fn new(filename: InternedString, offset: usize) -> Location {
Location::InFile(filename, offset)
}
}
impl Default for Location {
fn default() -> Self {
Location::Manufactured
}
}
/// A lexing failure: the (interned) file name and the byte offset at
/// which the lexer could not produce a token.
#[derive(Debug,PartialEq)]
pub struct LexerError {
    filename: InternedString,
    offset: usize
}
// Test-only convenience constructor for building expected error
// values in assertions.
#[cfg(test)]
impl LexerError {
    fn new(filename: InternedString, offset: usize) -> LexerError {
        LexerError { filename, offset }
    }
}
// The item shape lalrpop's external-lexer interface expects:
// (start, token, end) on success, or a lexer error.
type LocatedToken = Result<(Location, Token, Location),LexerError>;
impl<'s> Iterator for TokenStream<'s> {
type Item = LocatedToken;
fn next(&mut self) -> Option<Self::Item> {
match self.lexer.next() {
None => None,
Some((Token::Error, span)) => {
Some(Err(LexerError {
filename: self.filename,
offset: span.start,
}))
}
Some((token, span)) => {
let start = Location::new(self.filename, span.start);
let end = Location::new(self.filename, span.end);
Some(Ok((start, token, end)))
}
}
}
}
// End-to-end check: tokens come out with correct start/end offsets,
// whitespace is skipped, and the trailing // comment is dropped.
#[test]
fn stream_works() {
    let fname = InternedString::new("<file>");
    let mut stream = TokenStream::new("<file>", "y = x + 1//foo");
    let expected = [
        (0, Token::var("y"), 1),
        (2, Token::Equals, 3),
        (4, Token::var("x"), 5),
        (6, Token::Operator('+'), 7),
        (8, Token::Number((None, 1)), 9),
    ];
    for (start, token, end) in expected.iter().cloned() {
        let located = (Location::new(fname, start), token, Location::new(fname, end));
        assert_eq!(stream.next(), Some(Ok(located)));
    }
    assert_eq!(stream.next(), None);
}
// A character matched by no lexer rule (U+2639) must surface as a
// LexerError at the offending byte offset.
#[test]
fn errors_work() {
    let fname = InternedString::new("<file>");
    let mut stream = TokenStream::new("<file>", "\u{2639}");
    assert_eq!(stream.next(), Some(Err(LexerError::new(fname, 0))));
}

View File

@@ -1,14 +1,18 @@
use crate::util::istring::InternedString;
use logos::{Lexer, Logos}; use logos::{Lexer, Logos};
use std::num::ParseIntError; use std::num::ParseIntError;
#[derive(Logos,Debug,PartialEq)] #[derive(Logos,Clone,Debug,PartialEq)]
enum Token<'src> { pub enum Token {
#[regex(r"[ \t\n\f]+", logos::skip)] #[regex(r"[ \t\n\f]+", logos::skip)]
#[regex(r"//.*", logos::skip)] #[regex(r"//.*", logos::skip)]
#[token("=")] #[token("=")]
Equals, Equals,
#[token(";")]
Semi,
#[regex(r"[+\-*/]", |v| v.slice().chars().nth(0))] #[regex(r"[+\-*/]", |v| v.slice().chars().nth(0))]
Operator(char), Operator(char),
@@ -19,14 +23,21 @@ enum Token<'src> {
#[regex(r"[0-9]+", |v| parse_number(None, v))] #[regex(r"[0-9]+", |v| parse_number(None, v))]
Number((Option<u8>, i128)), Number((Option<u8>, i128)),
#[regex(r"[a-z][a-zA-Z0-9_]*")] #[regex(r"[a-z][a-zA-Z0-9_]*", |v| InternedString::new(v.slice()))]
Variable(&'src str), Variable(InternedString),
#[error] #[error]
Error, Error,
} }
fn parse_number<'a,'src>(base: Option<u8>, value: &'a Lexer<'src, Token<'src>>) -> Result<(Option<u8>, i128), ParseIntError> { #[cfg(test)]
impl Token {
pub(crate) fn var(s: &str) -> Token {
Token::Variable(InternedString::new(s))
}
}
fn parse_number<'a,'src>(base: Option<u8>, value: &'a Lexer<'src, Token>) -> Result<(Option<u8>, i128), ParseIntError> {
let (radix, strval) = match base { let (radix, strval) = match base {
None => (10, value.slice()), None => (10, value.slice()),
Some(radix) => (radix, &value.slice()[2..]), Some(radix) => (radix, &value.slice()[2..]),
@@ -51,20 +62,20 @@ fn lex_numbers() {
#[test] #[test]
fn lex_symbols() { fn lex_symbols() {
let mut lex0 = Token::lexer("x + \t y * \n z // rest"); let mut lex0 = Token::lexer("x + \t y * \n z // rest");
assert_eq!(lex0.next(), Some(Token::Variable("x"))); assert_eq!(lex0.next(), Some(Token::var("x")));
assert_eq!(lex0.next(), Some(Token::Operator('+'))); assert_eq!(lex0.next(), Some(Token::Operator('+')));
assert_eq!(lex0.next(), Some(Token::Variable("y"))); assert_eq!(lex0.next(), Some(Token::var("y")));
assert_eq!(lex0.next(), Some(Token::Operator('*'))); assert_eq!(lex0.next(), Some(Token::Operator('*')));
assert_eq!(lex0.next(), Some(Token::Variable("z"))); assert_eq!(lex0.next(), Some(Token::var("z")));
assert_eq!(lex0.next(), None); assert_eq!(lex0.next(), None);
} }
#[test] #[test]
fn lexer_spans() { fn lexer_spans() {
let mut lex0 = Token::lexer("y = x + 1//foo").spanned(); let mut lex0 = Token::lexer("y = x + 1//foo").spanned();
assert_eq!(lex0.next(), Some((Token::Variable("y"), 0..1))); assert_eq!(lex0.next(), Some((Token::var("y"), 0..1)));
assert_eq!(lex0.next(), Some((Token::Equals, 2..3))); assert_eq!(lex0.next(), Some((Token::Equals, 2..3)));
assert_eq!(lex0.next(), Some((Token::Variable("x"), 4..5))); assert_eq!(lex0.next(), Some((Token::var("x"), 4..5)));
assert_eq!(lex0.next(), Some((Token::Operator('+'), 6..7))); assert_eq!(lex0.next(), Some((Token::Operator('+'), 6..7)));
assert_eq!(lex0.next(), Some((Token::Number((None, 1)), 8..9))); assert_eq!(lex0.next(), Some((Token::Number((None, 1)), 8..9)));
assert_eq!(lex0.next(), None); assert_eq!(lex0.next(), None);

1
src/util.rs Normal file
View File

@@ -0,0 +1 @@
// Utility modules shared across the compiler.
pub mod istring;

68
src/util/istring.rs Normal file
View File

@@ -0,0 +1,68 @@
use lazy_static::lazy_static;
use std::cmp::{Ordering, max};
use std::collections::HashMap;
use std::fmt;
use std::sync::RwLock;
// Global intern table mapping each stable index to its string.
// Interning takes the write lock; Debug/Display/ordering take the
// read lock.
lazy_static! {
    static ref STRING_TABLE: RwLock<HashMap<u64, String>> = RwLock::new(HashMap::new());
}
/// A cheap, copyable handle to a string stored in the global intern
/// table. Derived equality compares indices, which agrees with
/// string equality because `new` never inserts a duplicate string.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct InternedString {
    // Key into STRING_TABLE; assigned contiguously starting at 1.
    index: u64,
}
impl InternedString {
    /// Return the `InternedString` equivalent of the provided string.
    ///
    /// This is slow — it linearly scans the whole table under the
    /// write lock to deduplicate — and should be used somewhat
    /// sparingly.
    pub fn new(s: &str) -> Self {
        let mut table = STRING_TABLE.write().unwrap();
        // Reuse the existing entry if this string was interned before.
        if let Some((&index, _)) = table.iter().find(|(_, v)| v.as_str() == s) {
            return InternedString { index };
        }
        // Indices are handed out contiguously starting at 1 and this
        // is the only code that inserts, so the next free index is
        // simply the current table size + 1. (The previous version
        // recomputed the max key on every scan iteration.)
        let index = table.len() as u64 + 1;
        table.insert(index, s.to_string());
        InternedString { index }
    }
}
impl fmt::Debug for InternedString {
    /// Debug-print the interned string (quoted, via the String's own
    /// Debug impl). A handle whose index is missing from the table
    /// prints `<BROKEN-INTERN>` instead of panicking.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let table = STRING_TABLE.read().unwrap();
        match table.get(&self.index) {
            Some(s) => write!(f, "{:?}", s),
            None => write!(f, "<BROKEN-INTERN>"),
        }
    }
}
impl fmt::Display for InternedString {
    /// Display the interned string unquoted. A handle whose index is
    /// missing from the table prints `<BROKEN-INTERN>` instead of
    /// panicking.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let table = STRING_TABLE.read().unwrap();
        match table.get(&self.index) {
            Some(s) => write!(f, "{}", s),
            None => write!(f, "<BROKEN-INTERN>"),
        }
    }
}
impl PartialOrd<InternedString> for InternedString {
    /// Order handles by the strings they denote, not by raw index.
    /// Yields `None` if either handle is absent from the table.
    fn partial_cmp(&self, other: &InternedString) -> Option<Ordering> {
        let table = STRING_TABLE.read().unwrap();
        match (table.get(&self.index), table.get(&other.index)) {
            (Some(me), Some(them)) => me.partial_cmp(them),
            _ => None,
        }
    }
}