Lexer integration.

This commit is contained in:
2016-04-30 21:52:37 -07:00
parent edc2a7161a
commit a9dc45d93d
9 changed files with 271 additions and 199 deletions

View File

@@ -1,4 +1,4 @@
module Syntax.CommandLine(
module Bang.CommandLine(
BangCommand(..)
, BangOperation(..)
, LexerOptions(..)
@@ -7,8 +7,6 @@ module Syntax.CommandLine(
)
where
import Data.Version(showVersion)
import Paths_bang(version)
import Options.Applicative
import Options.Applicative.Help
@@ -32,12 +30,14 @@ outputFile = strOption (short 'o' <> long "output-file" <> metavar "FILE"
<> value "/dev/stdout" <> showDefault)
data BangOperation = Help
| Version
| Lex LexerOptions
deriving (Show)
bangOperation :: Parser BangOperation
bangOperation = subparser $
command "help" (pure Help `withInfo` "Describe common commands.") <>
command "version" (pure Version `withInfo` "Display version information.") <>
command "lex" (parseLex `withInfo` "Lex a file into its component tokens.")
withInfo :: Parser a -> String -> ParserInfo a

107
src/Bang/Syntax/Lexer.x Normal file
View File

@@ -0,0 +1,107 @@
{
{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS -w #-}
module Bang.Syntax.Lexer(lexer)
where
import Bang.Syntax.Location
import Bang.Syntax.Name
import Bang.Syntax.Token
import Data.Char(isSpace, isAscii, ord)
import Data.Maybe(fromMaybe)
import Data.Text.Lazy(Text)
import qualified Data.Text.Lazy as T
import Data.Word(Word8)
}
-- Digits
$decdigit = 0-9
-- BUG FIX: this was written `[0-9a-fA-f]`. In an Alex character set the
-- range `A-f` spans every code point from 'A' (0x41) through 'f' (0x66),
-- which also admits 'G'..'Z', '[', '\', ']', '^', '_' and backtick as
-- "hex digits". The intended upper bound is uppercase 'F'.
$hexdigit = [0-9a-fA-F]
$octdigit = 0-7
$bindigit = [01]
-- Identifier Characters
$typestart = [A-Z\_]
$valstart = [a-z\_]
$identrest = [a-zA-Z0-9\_\.]
-- Symbolic characters that may form operator identifiers (includes ':').
$opident = [\~\!\@\#\$\%\^\&\*\+\-\=\.\<\>\?\_\|:]
-- Characters allowed after a backslash in char/string escapes.
$escape_char = [abfnrtv'\"\\]
:-
-- Whitespace
$white+ ;
"/*".*"*/" ;
-- Numbers
$decdigit+ { emitS (IntTok 10) }
"0x"$hexdigit+ { emitS (IntTok 16) }
"0o"$octdigit+ { emitS (IntTok 8) }
"0b"$bindigit+ { emitS (IntTok 2) }
$decdigit+"."$decdigit+ ("e""-"?$decdigit+)? { emitS FloatTok}
$decdigit+"e""-"?$decdigit+ { emitS FloatTok}
-- Identifier
$typestart $identrest* { emitS TypeIdent }
$valstart $identrest* { emitS ValIdent }
$opident+ { emitS OpIdent }
-- Characters and Strings
['].['] { emitS CharTok }
['] [\\] $escape_char ['] { emitS CharTok }
[\"] ([^\"] | [\n] | ([\\] $escape_char))* [\"] { emitS StringTok }
-- Symbols
"(" { emitT "(" }
")" { emitT ")" }
"[" { emitT "[" }
"]" { emitT "]" }
"{" { emitT "{" }
"}" { emitT "}" }
";" { emitT ";" }
"," { emitT "," }
"`" { emitT "`" }
[\\] { emitT "\\" }
{
-- | Lex a stream of source text into located tokens.
--
-- The lexer is total: on a lexical failure it does not abort, but instead
-- takes the run of non-whitespace characters at the failure point, emits it
-- as an 'ErrorTok', and resumes scanning after it.
lexer :: Origin -> Maybe Position -> Text -> [Located Token]
lexer src mbPos txt = go (AlexInput startPos txt)
  where
    -- A caller may supply an explicit starting position; otherwise start at
    -- 'initialPosition' (row 1, column 1, offset 0).
    startPos = fromMaybe initialPosition mbPos
    go input =
      case alexScan input 0 of
        AlexEOF -> []
        -- Error recovery: break the remaining text at the next whitespace,
        -- wrap the bad chunk in an ErrorTok with its span, and continue.
        AlexError input' -> let AlexInput pos text = input'
                                (as, bs) = T.break isSpace text
                                pos' = advanceWith' pos as
                                input'' = AlexInput pos' bs
                                loc = Location src pos pos'
                            in (ErrorTok as `locatedAt` loc) : go input''
        AlexSkip input' _ -> go input'
        -- NOTE: the action receives the *pre-match* input so it can slice
        -- the matched text out of it itself (see 'emitS').
        AlexToken input' len act -> act src len input : go input'
-- | Lexer input state: the position of the next character plus the
-- remaining input text.
data AlexInput = AlexInput !Position Text
-- | Feed Alex one byte of input, advancing the tracked position.
--
-- Non-ASCII characters are collapsed to byte 0, which matches no lexer
-- rule, so they surface as lexical errors rather than being mis-decoded.
alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
alexGetByte (AlexInput pos txt) =
  case T.uncons txt of
    Nothing        -> Nothing
    Just (c, rest) ->
      let pos' = pos `advanceWith` c
      in Just (asciiByte c, AlexInput pos' rest)
  where
    asciiByte ch
      | isAscii ch = fromIntegral (ord ch)
      | otherwise  = 0
-- | An Alex rule action: given the source origin, the match length, and the
-- pre-match input, produce a located token.
type AlexAction = Origin -> Int -> AlexInput -> Located Token
-- | Emit a fixed punctuation token; the matched text is discarded because
-- the token's spelling is supplied by the rule itself.
emitT :: Text -> AlexAction
emitT sym = emitS (\_ -> Special sym)
-- | Emit a token built from the matched text. The match is sliced out of
-- the pre-match input, and the location spans from the match start to the
-- position just past its last character.
emitS :: (Text -> Token) -> AlexAction
emitS build src len (AlexInput start rest) = build lexeme `locatedAt` region
  where
    lexeme = T.take (fromIntegral len) rest
    region = Location src start (start `advanceWith'` lexeme)
}

View File

@@ -0,0 +1,68 @@
{-# LANGUAGE TemplateHaskell #-}
module Bang.Syntax.Location(
Position, posRow, posColumn, posOffset
, initialPosition
, advanceWith, advanceWith'
, showPosition
, Origin(..)
, Location(Location)
, locSource, locStart, locEnd
, Located(..)
, locatedAt
)
where
import Control.Lens.TH(makeLenses)
import Data.Text.Lazy(Text)
import qualified Data.Text.Lazy as T
-- | A point in a source stream. Rows and columns are 1-based and the offset
-- is a 0-based character count (see 'initialPosition' and 'advanceWith').
data Position = Position {
    _posRow    :: Word -- ^ 1-based line number
  , _posColumn :: Word -- ^ 1-based column number
  , _posOffset :: Word -- ^ 0-based count of characters consumed
  }
  deriving (Show)
makeLenses ''Position
-- | The position of the very first character: row 1, column 1, offset 0.
initialPosition :: Position
initialPosition = Position 1 1 0
-- Positions are compared by offset alone; row and column are display data
-- derived from the same character stream, so equal offsets imply the same
-- point in the input.
instance Eq Position where
  a == b = _posOffset a == _posOffset b
-- | Advance a position over a single character: newline starts a new row at
-- column 1, and every character (newline and tab included) bumps the offset.
--
-- NOTE(review): a tab advances the column by a fixed 8 rather than to the
-- next multiple-of-8 tab stop -- confirm this is the intended accounting.
advanceWith :: Position -> Char -> Position
advanceWith (Position r c o) '\t' = Position r (c+8) (o+1)
advanceWith (Position r _ o) '\n' = Position (r+1) 1 (o+1)
advanceWith (Position r c o) _    = Position r (c+1) (o+1)
-- | Advance a position over every character of a text chunk, left to right.
-- This is simply a strict left fold of 'advanceWith' over the text.
advanceWith' :: Position -> Text -> Position
advanceWith' = T.foldl' advanceWith
-- | Render a position as @row:column@ for human-readable messages; the
-- offset is deliberately omitted.
showPosition :: Position -> String
showPosition pos = concat [show (_posRow pos), ":", show (_posColumn pos)]
-- | Where a piece of source text came from.
data Origin = Unknown
            | Interactive
            | File FilePath
  deriving (Eq, Show)
-- | A span of source text: its origin plus start and end positions.
data Location = Location {
    _locSource :: Origin
  , _locStart  :: Position
  , _locEnd    :: Position
  }
  deriving (Eq, Show)
makeLenses ''Location
-- | A value annotated with its source location (the location is strict).
data Located a = Located !Location a
-- Shown as pseudo-source using the 'locatedAt' helper, e.g.
--   IntTok 10 "42" `locatedAt` Location ...
instance Show a => Show (Located a) where
  show (Located l x) = show x ++ " `locatedAt` " ++ show l
-- | Attach a location to a value; argument order is flipped so it reads
-- naturally infix: @tok `locatedAt` loc@.
locatedAt :: a -> Location -> Located a
locatedAt = flip Located

32
src/Bang/Syntax/Name.hs Normal file
View File

@@ -0,0 +1,32 @@
{-# LANGUAGE TemplateHaskell #-}
module Bang.Syntax.Name(
Name
, nameId
, nameString
, nameGenerated
)
where
import Control.Lens.TH(makeLenses)
-- | A name with a unique numeric identity. The Eq/Ord instances compare
-- '_nameId' only; '_nameString' is the printable spelling.
data Name = Name {
    _nameId        :: !Word   -- ^ unique identity used for Eq/Ord
  , _nameString    :: !String -- ^ printable spelling
  , _nameGenerated :: !Bool   -- ^ presumably True for machine-generated names -- confirm
  }
makeLenses ''Name
-- Names are equal exactly when their unique ids are equal; the string and
-- generated flag are ignored.
instance Eq Name where
  a == b = _nameId a == _nameId b
  a /= b = _nameId a /= _nameId b
-- | Names are ordered by their unique id, consistent with the 'Eq' instance.
--
-- Only 'compare' is defined; the remaining Ord methods take their class
-- defaults. The hand-written max/min/(<)/(<=)/(>)/(>=) this replaces were
-- semantically identical to those defaults (all reduce to comparing
-- '_nameId', including the tie-breaking of max/min on equal ids), so this
-- is a pure simplification with no behavior change.
instance Ord Name where
  compare a b = compare (_nameId a) (_nameId b)

18
src/Bang/Syntax/Token.hs Normal file
View File

@@ -0,0 +1,18 @@
module Bang.Syntax.Token(
Token(..)
)
where
import Data.Text.Lazy(Text)
-- | Tokens produced by the lexer. Each constructor carries the exact source
-- text it was lexed from.
data Token = CharTok   Text      -- ^ character literal, quotes included
           | FloatTok  Text      -- ^ floating-point literal
           | IntTok    Word Text -- ^ integer literal tagged with its base (2, 8, 10, or 16)
           | OpIdent   Text      -- ^ operator (symbolic) identifier
           | Special   Text      -- ^ punctuation: parens, brackets, braces, ';', ',', '`', '\'
           | StringTok Text      -- ^ string literal, quotes included
           | TypeIdent Text      -- ^ identifier starting with an upper-case letter or '_'
           | ValIdent  Text      -- ^ identifier starting with a lower-case letter or '_'
           | ErrorTok  Text      -- ^ chunk of input that failed to lex
           | EOFTok              -- ^ end-of-input marker; note the new 'lexer' returns [] at EOF rather than emitting this
  deriving (Show)

View File

@@ -1,66 +1,26 @@
import Syntax.CommandLine
import Bang.CommandLine
import Bang.Syntax.Lexer
import Bang.Syntax.Location
import Control.Exception(tryJust)
import Control.Monad(guard)
import qualified Data.Text.Lazy.IO as T
import Data.Version(showVersion)
import Paths_bang(version)
import System.IO.Error(isDoesNotExistError)
main :: IO ()
main = getCommand >>= \ cmd ->
case cmdCommand cmd of
Lex _lexOpts -> putStrLn ("LEX: " ++ show cmd)
Help -> putStrLn helpString
Lex o -> runLexer cmd o
Help -> putStrLn helpString
Version -> putStrLn ("Bang tool, version " ++ showVersion version)
{-
import Control.Exception
import Control.Monad
import qualified Data.ByteString as S
import System.Environment
import System.Exit
import System.IO.Error
import Syntax.AST
import Syntax.Lexer
import Syntax.Parser
import Syntax.ParserCore
main :: IO ()
main = do
args <- getArgs
case args of
[file] -> do
ast <- loadModule file
putStrLn "Successful parse!"
putStrLn (show ast)
["-lex",path] -> do
mtxt <- tryJust (guard . isDoesNotExistError) $ S.readFile path
case mtxt of
Left _ -> fail $ "Unable to open file: " ++ path
Right txt -> do
case runParser path txt pullTokens of
Left err -> printError err >> exitWith (ExitFailure 1)
Right ress -> do
mapM_ putStrLn ress
putStrLn "Successful lex."
["-parse",path] -> do
ast <- loadModule path
putStrLn "Successful parse!"
putStrLn (show ast)
_ -> fail "Unacceptable arguments."
pullTokens :: Parser [String]
pullTokens = do
tok <- scan
case tok of
Lexeme pos tok' -> do
let res = show pos ++ " " ++ show tok'
if tok' == TokEOF
then return [res]
else return (res :) `ap` pullTokens
loadModule :: FilePath -> IO (Module Position)
loadModule path = do
mtxt <- tryJust (guard . isDoesNotExistError) $ S.readFile path
case mtxt of
Left _ -> fail $ "Unable to open file: " ++ path
Right txt ->
case runParser path txt parseModule of
Left err -> printError err >> exitWith (ExitFailure 1)
Right ast -> return ast
-}
-- | Run the @lex@ subcommand: read the configured input file, lex it from
-- the initial position, and print each located token on its own line.
runLexer :: BangCommand -> LexerOptions -> IO ()
runLexer _cmd opts =
  do let path = lexInputFile opts
     -- Only does-not-exist errors are caught here; any other IO error
     -- propagates as an exception.
     mtxt <- tryJust (guard . isDoesNotExistError) (T.readFile path)
     case mtxt of
       -- BUG FIX: the message previously read "Unable to opten file".
       Left _ -> fail ("Unable to open file: " ++ path)
       Right txt ->
         do let tokens = lexer (File path) (Just initialPosition) txt
            mapM_ print tokens

View File

@@ -1,129 +0,0 @@
{
{-# LANGUAGE DeriveDataTypeable #-}
{-# OPTIONS -w #-}
module Syntax.Lexer where
import qualified Codec.Binary.UTF8.Generic as UTF8
import qualified Data.ByteString as S
import MonadLib
import Syntax.ParserCore
}
-- Digits
$decdigit = 0-9
-- FIX: the range was `A-f`, which in an Alex character set spans code
-- points 0x41..0x66 and so also admits 'G'..'Z', '[', '\', ']', '^', '_'
-- and backtick; the intended upper bound is uppercase 'F'.
$hexdigit = [0-9a-fA-F]
$octdigit = 0-7
$bindigit = [01]
-- Identifier Characters
$typestart = [A-Z\_]
$valstart = [a-z\_]
$identrest = [a-zA-Z0-9\_\.]
-- Symbolic characters that may form operator identifiers (':' handled by
-- a separate rule in this lexer).
$opident = [\~\!\@\#\$\%\^\&\*\+\-\=\.\<\>\?\_\|]
-- Characters allowed after a backslash in char/string escapes.
$escape_char = [abfnrtv'\"\\]
:-
-- Whitespace
$white+ ;
"/*".*"*/" ;
-- Numbers
$decdigit+ { emitS (buildInt 10) }
"0x"$hexdigit+ { emitS (buildInt 16) }
"0o"$octdigit+ { emitS (buildInt 8) }
"0b"$bindigit+ { emitS (buildInt 2) }
$decdigit+"."$decdigit+ ("e""-"?$decdigit+)? { emitS TokFloat}
$decdigit+"e""-"?$decdigit+ { emitS TokFloat}
-- Identifier
$typestart $identrest* { emitS TokTypeIdent }
"prim%" $typestart $identrest* { emitS TokTypeIdent }
$valstart $identrest* { emitS TokValIdent }
"prim%" $valstart $identrest* { emitS TokValIdent }
$opident+ { emitS TokOpIdent }
":"+ { emitS TokOpIdent }
-- Characters and Strings
['].['] { emitS TokChar }
['] [\\] $escape_char ['] { emitS TokChar }
[\"] ([^\"] | [\n] | ([\\] $escape_char))* [\"] { emitS TokString }
-- Symbols
"(" { emitT LParen }
")" { emitT RParen }
"[" { emitT LSquare }
"]" { emitT RSquare }
"{" { emitT LBrace }
"}" { emitT RBrace }
";" { emitT Semi }
"," { emitT Comma }
"`" { emitT BTick }
[\\] { emitT LLambda }
{
-- Alex input state for the ByteString backend: current position, previously
-- consumed character, and the remaining bytes.
type AlexInput = (Position,Char,S.ByteString)
-- | Emit a fixed token at the match position, ignoring the matched text.
emitT :: Token -> AlexInput -> Int -> Parser Lexeme
emitT tok (pos,_,_) _ = return $! Lexeme pos tok
-- | Emit a token built from the matched text, decoded from UTF-8.
emitS :: (String -> Token) -> AlexInput -> Int -> Parser Lexeme
emitS mk (pos,c,bs) len = return $! Lexeme pos (mk input)
  where input = UTF8.toString (S.take len bs)
-- | Produce the next lexeme from the parser state. Whitespace/skipped
-- matches loop back into 'scan'; a lexical failure raises a parser error
-- via 'alexError' rather than recovering.
scan :: Parser Lexeme
scan = do
  inp@(pos,_,_) <- alexGetInput
  sc <- alexGetStartCode
  case alexScan inp sc of
    AlexEOF -> return $! Lexeme pos TokEOF
    AlexError inp' -> do
      let posStr = pprtPosition pos
      alexError $ posStr ++ ": Lexical error."
    AlexSkip inp' len' -> alexSetInput inp' >> scan
    AlexToken inp' len action -> do
      -- Commit the post-match state first, then run the action against the
      -- *pre-match* input so it can slice out the matched bytes.
      alexSetInput inp'
      action inp len
-- | Read the Alex input triple out of the parser state.
alexGetInput :: Parser AlexInput
alexGetInput = do
  s <- get
  return (psPos s, psChar s, psInput s)
-- | Write the Alex input triple back into the parser state (strictly, to
-- avoid accumulating thunks in the state record).
alexSetInput :: AlexInput -> Parser ()
alexSetInput (pos,c,bs) = do
  s <- get
  set $! s {
      psPos = pos
    , psChar = c
    , psInput = bs
    }
-- | Decode the next character from the UTF-8 byte stream and advance the
-- position accordingly.
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
alexGetChar (p,_,bs) = do
  (c,bs') <- UTF8.uncons bs
  return (c, (movePos p c, c, bs'))
-- | The most recently consumed character (required by Alex for rules with
-- left context).
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (_,c,_) = c
-- | Raise a lexical error inside the parser monad.
alexError :: String -> Parser a
alexError = raiseL
-- | Current Alex start code (lexer mode), kept in the parser state.
alexGetStartCode :: Parser Int
alexGetStartCode = psLexCode `fmap` get
-- | Switch to a new Alex start code (lexer mode).
alexSetStartCode :: Int -> Parser ()
alexSetStartCode code = do
  s <- get
  set $! s { psLexCode = code }
-- Standard Alex action shape: switch start code, then rescan for the next
-- real token.
begin code _ _ = alexSetStartCode code >> scan
-- | Tag an integer literal's text with its numeric base.
buildInt :: Int -> String -> Token
buildInt base val = TokInt (base, val)
}