diff --git a/.gitignore b/.gitignore index 0a249ca..74561ce 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ *.bak hsrc/Syntax/Lexer.hs hsrc/Syntax/Parser.hs -bang .cabal-sandbox/ dist/ diff --git a/bang.cabal b/bang.cabal index d22afbd..54d1794 100644 --- a/bang.cabal +++ b/bang.cabal @@ -14,14 +14,31 @@ cabal-version: >=1.10 executable bang main-is: Main.hs - build-depends: base >= 4.8 && < 4.9, + build-depends: + array >= 0.5.1.0 && < 0.7, + base >= 4.8 && < 4.9, bytestring >= 0.10 && < 0.11, - optparse-applicative >= 0.12.1 && < 0.14 + lens >= 4.14 && < 4.18, + monadLib >= 3.7.3 && < 3.9, + optparse-applicative >= 0.12.1 && < 0.14, + text >= 1.2.2.1 && < 1.4 hs-source-dirs: src build-tools: alex, happy + ghc-options: -Wall default-language: Haskell2010 - other-extensions: DeriveDataTypeable + other-extensions: CPP, + DeriveDataTypeable, + DeriveFunctor, + GeneralizedNewtypeDeriving, + MagicHash, + MultiParamTypeClasses, + OverloadedStrings, + TemplateHaskell other-modules: - Paths_bang, - Syntax.CommandLine + Bang.CommandLine, + Bang.Syntax.Lexer, + Bang.Syntax.Location, + Bang.Syntax.Name, + Bang.Syntax.Token, + Paths_bang diff --git a/src/Syntax/CommandLine.hs b/src/Bang/CommandLine.hs similarity index 93% rename from src/Syntax/CommandLine.hs rename to src/Bang/CommandLine.hs index 1731f06..af59ed6 100644 --- a/src/Syntax/CommandLine.hs +++ b/src/Bang/CommandLine.hs @@ -1,4 +1,4 @@ -module Syntax.CommandLine( +module Bang.CommandLine( BangCommand(..) , BangOperation(..) , LexerOptions(..) @@ -7,8 +7,6 @@ module Syntax.CommandLine( ) where -import Data.Version(showVersion) -import Paths_bang(version) import Options.Applicative import Options.Applicative.Help @@ -32,12 +30,14 @@ outputFile = strOption (short 'o' <> long "output-file" <> metavar "FILE" <> value "/dev/stdout" <> showDefault) data BangOperation = Help + | Version | Lex LexerOptions deriving (Show) bangOperation :: Parser BangOperation bangOperation = subparser $ command "help" (pure Help `withInfo` "Describe common commands.") <> + command "version" (pure Version `withInfo` "Display version information.") <> command "lex" (parseLex `withInfo` "Lex a file into its component tokens.") withInfo :: Parser a -> String -> ParserInfo a diff --git a/src/Bang/Syntax/Lexer.x b/src/Bang/Syntax/Lexer.x new file mode 100644 index 0000000..9334ae9 --- /dev/null +++ b/src/Bang/Syntax/Lexer.x @@ -0,0 +1,107 @@ +{ +{-# LANGUAGE OverloadedStrings #-} +{-# OPTIONS -w #-} +module Bang.Syntax.Lexer(lexer) + where + +import Bang.Syntax.Location +import Bang.Syntax.Name +import Bang.Syntax.Token +import Data.Char(isSpace, isAscii, ord) +import Data.Maybe(fromMaybe) +import Data.Text.Lazy(Text) +import qualified Data.Text.Lazy as T +import Data.Word(Word8) + +} + +-- Digits +$decdigit = 0-9 +$hexdigit = [0-9a-fA-f] +$octdigit = 0-7 +$bindigit = [01] + +-- Identifier Characters +$typestart = [A-Z\_] +$valstart = [a-z\_] +$identrest = [a-zA-Z0-9\_\.] +$opident = [\~\!\@\#\$\%\^\&\*\+\-\=\.\<\>\?\_\|:] +$escape_char = [abfnrtv'\"\\] + +:- + +-- Whitespace + $white+ ; + "/*".*"*/" ; + +-- Numbers + $decdigit+ { emitS (IntTok 10) } + "0x"$hexdigit+ { emitS (IntTok 16) } + "0o"$octdigit+ { emitS (IntTok 8) } + "0b"$bindigit+ { emitS (IntTok 2) } + $decdigit+"."$decdigit+ ("e""-"?$decdigit+)? { emitS FloatTok} + $decdigit+"e""-"?$decdigit+ { emitS FloatTok} + +-- Identifier + $typestart $identrest* { emitS TypeIdent } + $valstart $identrest* { emitS ValIdent } + $opident+ { emitS OpIdent } + +-- Characters and Strings + ['].['] { emitS CharTok } + ['] [\\] $escape_char ['] { emitS CharTok } + [\"] ([^\"] | [\n] | ([\\] $escape_char))* [\"] { emitS StringTok } + +-- Symbols + "(" { emitT "(" } + ")" { emitT ")" } + "[" { emitT "[" } + "]" { emitT "]" } + "{" { emitT "{" } + "}" { emitT "}" } + ";" { emitT ";" } + "," { emitT "," } + "`" { emitT "`" } + [\\] { emitT "\\" } + +{ + +lexer :: Origin -> Maybe Position -> Text -> [Located Token] +lexer src mbPos txt = go (AlexInput startPos txt) + where + startPos = fromMaybe initialPosition mbPos + go input = + case alexScan input 0 of + AlexEOF -> [] + AlexError input' -> let AlexInput pos text = input' + (as, bs) = T.break isSpace text + pos' = advanceWith' pos as + input'' = AlexInput pos' bs + loc = Location src pos pos' + in (ErrorTok as `locatedAt` loc) : go input'' + AlexSkip input' _ -> go input' + AlexToken input' len act -> act src len input : go input' + +data AlexInput = AlexInput !Position Text + +alexGetByte :: AlexInput -> Maybe (Word8, AlexInput) +alexGetByte (AlexInput p t) = + do (c, rest) <- T.uncons t + return (byteForChar c, (AlexInput (p `advanceWith` c) rest)) + where + byteForChar c | isAscii c = fromIntegral (ord c) + | otherwise = 0 + +type AlexAction = Origin -> Int -> AlexInput -> Located Token + +emitT :: Text -> AlexAction +emitT str = emitS (const (Special str)) + +emitS :: (Text -> Token) -> AlexAction +emitS mk src len (AlexInput pos t) = token `locatedAt` loc + where + txt = T.take (fromIntegral len) t + token = mk txt + loc = Location src pos (pos `advanceWith'` txt) + +} diff --git a/src/Bang/Syntax/Location.hs b/src/Bang/Syntax/Location.hs new file mode 100644 index 0000000..25b1c74 --- /dev/null +++ b/src/Bang/Syntax/Location.hs @@ -0,0 +1,68 @@ +{-# LANGUAGE TemplateHaskell #-} +module Bang.Syntax.Location( + Position, posRow, posColumn, posOffset + , initialPosition + , advanceWith, advanceWith' + , showPosition + , Origin(..) + , Location(Location) + , locSource, locStart, locEnd + , Located(..) + , locatedAt + ) + where + +import Control.Lens.TH(makeLenses) +import Data.Text.Lazy(Text) +import qualified Data.Text.Lazy as T + +data Position = Position { + _posRow :: Word + , _posColumn :: Word + , _posOffset :: Word + } + deriving (Show) + +makeLenses ''Position + +initialPosition :: Position +initialPosition = Position 1 1 0 + +instance Eq Position where + a == b = _posOffset a == _posOffset b + +advanceWith :: Position -> Char -> Position +advanceWith (Position r c o) '\t' = Position r (c+8) (o+1) +advanceWith (Position r _ o) '\n' = Position (r+1) 1 (o+1) +advanceWith (Position r c o) _ = Position r (c+1) (o+1) + +advanceWith' :: Position -> Text -> Position +advanceWith' pos txt = + case T.uncons txt of + Nothing -> pos + Just (c, rest) -> advanceWith' (pos `advanceWith` c) rest + +showPosition :: Position -> String +showPosition (Position r c _) = show r ++ ":" ++ show c + +data Origin = Unknown + | Interactive + | File FilePath + deriving (Eq, Show) + +data Location = Location { + _locSource :: Origin + , _locStart :: Position + , _locEnd :: Position + } + deriving (Eq, Show) + +makeLenses ''Location + +data Located a = Located !Location a + +instance Show a => Show (Located a) where + show (Located l x) = show x ++ " `locatedAt` " ++ show l + +locatedAt :: a -> Location -> Located a +locatedAt a p = Located p a diff --git a/src/Bang/Syntax/Name.hs b/src/Bang/Syntax/Name.hs new file mode 100644 index 0000000..3f0fc91 --- /dev/null +++ b/src/Bang/Syntax/Name.hs @@ -0,0 +1,32 @@ +{-# LANGUAGE TemplateHaskell #-} +module Bang.Syntax.Name( + Name + , nameId + , nameString + , nameGenerated + ) + where + +import Control.Lens.TH(makeLenses) + +data Name = Name { + _nameId :: !Word + , _nameString :: !String + , _nameGenerated :: !Bool + } + +makeLenses ''Name + +instance Eq Name where + a == b = _nameId a == _nameId b + a /= b = _nameId a /= _nameId b + +instance Ord Name where + compare a b = compare (_nameId a) (_nameId b) + max a b = if a > b then a else b + min a b = if a > b then b else a + a < b = _nameId a < _nameId b + a <= b = _nameId a <= _nameId b + a > b = _nameId a > _nameId b + a >= b = _nameId a >= _nameId b + diff --git a/src/Bang/Syntax/Token.hs b/src/Bang/Syntax/Token.hs new file mode 100644 index 0000000..e02de6a --- /dev/null +++ b/src/Bang/Syntax/Token.hs @@ -0,0 +1,18 @@ +module Bang.Syntax.Token( + Token(..) + ) + where + +import Data.Text.Lazy(Text) + +data Token = CharTok Text + | FloatTok Text + | IntTok Word Text + | OpIdent Text + | Special Text + | StringTok Text + | TypeIdent Text + | ValIdent Text + | ErrorTok Text + | EOFTok + deriving (Show) diff --git a/src/Main.hs b/src/Main.hs index e088424..ac5ac1d 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -1,66 +1,26 @@ -import Syntax.CommandLine +import Bang.CommandLine +import Bang.Syntax.Lexer +import Bang.Syntax.Location +import Control.Exception(tryJust) +import Control.Monad(guard) +import qualified Data.Text.Lazy.IO as T +import Data.Version(showVersion) +import Paths_bang(version) +import System.IO.Error(isDoesNotExistError) main :: IO () main = getCommand >>= \ cmd -> case cmdCommand cmd of - Lex _lexOpts -> putStrLn ("LEX: " ++ show cmd) - Help -> putStrLn helpString + Lex o -> runLexer cmd o + Help -> putStrLn helpString + Version -> putStrLn ("Bang tool, version " ++ showVersion version) - -{- -import Control.Exception -import Control.Monad -import qualified Data.ByteString as S -import System.Environment -import System.Exit -import System.IO.Error - -import Syntax.AST -import Syntax.Lexer -import Syntax.Parser -import Syntax.ParserCore - -main :: IO () -main = do - args <- getArgs - case args of - [file] -> do - ast <- loadModule file - putStrLn "Successful parse!" - putStrLn (show ast) - ["-lex",path] -> do - mtxt <- tryJust (guard . isDoesNotExistError) $ S.readFile path - case mtxt of - Left _ -> fail $ "Unable to open file: " ++ path - Right txt -> do - case runParser path txt pullTokens of - Left err -> printError err >> exitWith (ExitFailure 1) - Right ress -> do - mapM_ putStrLn ress - putStrLn "Successful lex." - ["-parse",path] -> do - ast <- loadModule path - putStrLn "Successful parse!" - putStrLn (show ast) - _ -> fail "Unacceptable arguments." - -pullTokens :: Parser [String] -pullTokens = do - tok <- scan - case tok of - Lexeme pos tok' -> do - let res = show pos ++ " " ++ show tok' - if tok' == TokEOF - then return [res] - else return (res :) `ap` pullTokens - -loadModule :: FilePath -> IO (Module Position) -loadModule path = do - mtxt <- tryJust (guard . isDoesNotExistError) $ S.readFile path - case mtxt of - Left _ -> fail $ "Unable to open file: " ++ path - Right txt -> - case runParser path txt parseModule of - Left err -> printError err >> exitWith (ExitFailure 1) - Right ast -> return ast --} +runLexer :: BangCommand -> LexerOptions -> IO () +runLexer _cmd opts = + do let path = lexInputFile opts + mtxt <- tryJust (guard . isDoesNotExistError) (T.readFile path) + case mtxt of + Left _ -> fail ("Unable to opten file: " ++ path) + Right txt -> + do let tokens = lexer (File path) (Just initialPosition) txt + mapM_ (putStrLn . show) tokens diff --git a/src/Syntax/Lexer.x b/src/Syntax/Lexer.x deleted file mode 100644 index 1c0fe45..0000000 --- a/src/Syntax/Lexer.x +++ /dev/null @@ -1,129 +0,0 @@ -{ -{-# LANGUAGE DeriveDataTypeable #-} -{-# OPTIONS -w #-} -module Syntax.Lexer where - -import qualified Codec.Binary.UTF8.Generic as UTF8 -import qualified Data.ByteString as S -import MonadLib - -import Syntax.ParserCore - -} - --- Digits -$decdigit = 0-9 -$hexdigit = [0-9a-fA-f] -$octdigit = 0-7 -$bindigit = [01] - --- Identifier Characters -$typestart = [A-Z\_] -$valstart = [a-z\_] -$identrest = [a-zA-Z0-9\_\.] -$opident = [\~\!\@\#\$\%\^\&\*\+\-\=\.\<\>\?\_\|] -$escape_char = [abfnrtv'\"\\] - -:- - --- Whitespace - $white+ ; - "/*".*"*/" ; - --- Numbers - $decdigit+ { emitS (buildInt 10) } - "0x"$hexdigit+ { emitS (buildInt 16) } - "0o"$octdigit+ { emitS (buildInt 8) } - "0b"$bindigit+ { emitS (buildInt 2) } - $decdigit+"."$decdigit+ ("e""-"?$decdigit+)? { emitS TokFloat} - $decdigit+"e""-"?$decdigit+ { emitS TokFloat} - --- Identifier - $typestart $identrest* { emitS TokTypeIdent } - "prim%" $typestart $identrest* { emitS TokTypeIdent } - $valstart $identrest* { emitS TokValIdent } - "prim%" $valstart $identrest* { emitS TokValIdent } - $opident+ { emitS TokOpIdent } - ":"+ { emitS TokOpIdent } - --- Characters and Strings - ['].['] { emitS TokChar } - ['] [\\] $escape_char ['] { emitS TokChar } - [\"] ([^\"] | [\n] | ([\\] $escape_char))* [\"] { emitS TokString } - --- Symbols - "(" { emitT LParen } - ")" { emitT RParen } - "[" { emitT LSquare } - "]" { emitT RSquare } - "{" { emitT LBrace } - "}" { emitT RBrace } - ";" { emitT Semi } - "," { emitT Comma } - "`" { emitT BTick } - [\\] { emitT LLambda } - -{ - -type AlexInput = (Position,Char,S.ByteString) - -emitT :: Token -> AlexInput -> Int -> Parser Lexeme -emitT tok (pos,_,_) _ = return $! Lexeme pos tok - -emitS :: (String -> Token) -> AlexInput -> Int -> Parser Lexeme -emitS mk (pos,c,bs) len = return $! Lexeme pos (mk input) - where input = UTF8.toString (S.take len bs) - -scan :: Parser Lexeme -scan = do - inp@(pos,_,_) <- alexGetInput - sc <- alexGetStartCode - case alexScan inp sc of - AlexEOF -> return $! Lexeme pos TokEOF - AlexError inp' -> do - let posStr = pprtPosition pos - alexError $ posStr ++ ": Lexical error." - AlexSkip inp' len' -> alexSetInput inp' >> scan - AlexToken inp' len action -> do - alexSetInput inp' - action inp len - -alexGetInput :: Parser AlexInput -alexGetInput = do - s <- get - return (psPos s, psChar s, psInput s) - -alexSetInput :: AlexInput -> Parser () -alexSetInput (pos,c,bs) = do - s <- get - set $! s { - psPos = pos - , psChar = c - , psInput = bs - } - -alexGetChar :: AlexInput -> Maybe (Char,AlexInput) -alexGetChar (p,_,bs) = do - (c,bs') <- UTF8.uncons bs - return (c, (movePos p c, c, bs')) - -alexInputPrevChar :: AlexInput -> Char -alexInputPrevChar (_,c,_) = c - -alexError :: String -> Parser a -alexError = raiseL - -alexGetStartCode :: Parser Int -alexGetStartCode = psLexCode `fmap` get - -alexSetStartCode :: Int -> Parser () -alexSetStartCode code = do - s <- get - set $! s { psLexCode = code } - -begin code _ _ = alexSetStartCode code >> scan - -buildInt :: Int -> String -> Token -buildInt base val = TokInt (base, val) - -}