dear-imgui.hs/generator/DearImGui/Generator/Tokeniser.hs

{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ViewPatterns #-}

module DearImGui.Generator.Tokeniser where

-- base
import Control.Arrow
  ( first, second )
import Control.Applicative
  ( (<|>), some )
import Data.Char
  ( isAlpha, isAlphaNum, isDigit, isPunctuation, isSpace, isSymbol, toLower )
import Data.Function
  ( (&) )
import Data.Functor
  ( ($>) )
import Data.Monoid
  ( Sum(..) )

-- megaparsec
import Text.Megaparsec
  ( MonadParsec, VisualStream(..)
  , chunk, parseMaybe, satisfy, try
  )
import Text.Megaparsec.Char.Lexer
  ( hexadecimal, scientific )

-- parser-combinators
import Control.Monad.Combinators
  ( optional )

-- scientific
import Data.Scientific
  ( Scientific )

-- text
import Data.Text
  ( Text )
import qualified Data.Text as Text
  ( break, breakOn, cons, drop, dropWhile
  , head, last, length
  , pack, snoc, span, strip, tail, take
  , uncons, unpack
  )

-- unordered-containers
import Data.HashSet
  ( HashSet )
import qualified Data.HashSet as HashSet
  ( fromList, member )

--------------------------------------------------------------------------------

data TokeniserError
  = Couldn'tParseNumber { problem :: !Text }
  | UnhandledCase { unhandled :: !( Char, Text ) }
  deriving stock ( Eq, Ord, Show )

data Tok
  = Keyword        !Text
  | ReservedSymbol !Char
  | Symbolic       !Text
  | Identifier     !Text
  | Comment        !Text
  | Char           !Char
  | String         !Text
  | Number         !Scientific !Text
  | BeginCPP       !Text
  | EndCPPLine
  deriving stock ( Show, Eq, Ord )

showToken :: Tok -> String
showToken = \case
  Keyword        t -> Text.unpack t
  ReservedSymbol c -> [c]
  Symbolic       t -> Text.unpack t
  Identifier     t -> Text.unpack t
  Comment        t -> Text.unpack t
  Char           c -> [c]
  String         t -> Text.unpack t
  Number       s t -> show s <> Text.unpack t
  BeginCPP       t -> "#" <> Text.unpack t
  EndCPPLine       -> "EndCppLine"

tokenLength :: Tok -> Int
tokenLength = \case
  Keyword        t -> Text.length t
  ReservedSymbol _ -> 1
  Symbolic       t -> Text.length t
  Identifier     t -> Text.length t
  Comment        t -> Text.length t
  Char           _ -> 1
  String         t -> Text.length t
  Number       s t -> length ( show s ) + Text.length t
  BeginCPP       t -> 1 + Text.length t
  EndCPPLine       -> length ( "EndCPPLine" :: String )

instance VisualStream [Tok] where
  showTokens   _ = foldMap showToken
  tokensLength _ = getSum . foldMap ( Sum . tokenLength )

keywords :: HashSet Text
keywords = HashSet.fromList
  [ "auto", "break", "case", "char", "const", "continue", "default", "do", "double"
  , "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int", "long"
  , "register", "restrict", "return", "short", "signed", "sizeof", "static", "struct"
  , "switch", "typedef", "union", "unsigned", "void", "volatile", "while"
  ]

reservedSymbols :: HashSet Char
reservedSymbols = HashSet.fromList [ '(', ')', '{', '}', ',', ';', '=', '#' ]

tokenise :: Text -> Either TokeniserError [ Tok ]
tokenise ( Text.uncons -> Just ( c, cs ) )
  | isSpace c
  = tokenise ( Text.dropWhile isSpace cs )
  | isAlpha c || c == '_'
  , let
      this, rest :: Text
      ( this, rest ) = first ( c `Text.cons` ) $ Text.span ( \ x -> isAlphaNum x || x == '_' ) cs
  = if this `HashSet.member` keywords
    then ( Keyword    this : ) <$> tokenise rest
    else ( Identifier this : ) <$> tokenise rest
  | isDigit c
  , let
      this, rest :: Text
      ( this, rest ) = continuePastExponent $ first ( c `Text.cons` ) $ Text.span ( \ x -> isAlphaNum x || x == '.' ) cs
  = case parseMaybe @() parseNumber this of
      Just numTok -> ( numTok : ) <$> tokenise rest
      Nothing     -> Left ( Couldn'tParseNumber { problem = this } )
  | c == '\''
  , Just ( '\'', rest ) <- Text.uncons ( Text.drop 1 cs )
  = ( Char ( Text.head cs ) : ) <$> tokenise rest
  | c == '\"'
  , let
      this, rest :: Text
      ( this, rest ) = second Text.tail $ Text.break ( == '"') cs
  = ( String this : ) <$> tokenise rest
  | c == '#'
  , let
      directive, line, rest :: Text
      ( directive, ( line, rest ) )
        = cs
        & Text.break ( isSpace )
        & second ( Text.break ( `elem` [ '\n', '\r' ] ) )
  = do
      lineTokens <- tokenise line
      restTokens <- tokenise rest
      pure ( ( BeginCPP directive : lineTokens ) <> ( EndCPPLine : restTokens ) )
  | c `HashSet.member` reservedSymbols
  = ( ReservedSymbol c : ) <$> tokenise cs
  | c == '/'
  = case Text.take 1 cs of
    "/" ->
      let
        comm, rest :: Text
        ( comm, rest ) = first Text.strip $ Text.break ( `elem` [ '\n', '\r' ] ) ( Text.drop 1 cs )
      in ( Comment comm : ) <$> tokenise rest
    "*" ->
      let
        comm, rest :: Text
        ( comm, rest ) = Text.breakOn "*/" ( Text.drop 1 cs )
      in ( Comment comm : ) <$> tokenise rest
    _ ->
      let
        this, rest :: Text
        ( this, rest ) = first ( c `Text.cons` ) $ Text.span ( \ x -> x /= '_' && ( isSymbol x || isPunctuation x ) ) cs
      in ( Symbolic this : ) <$> tokenise rest
  | isSymbol c || isPunctuation c
  , let
      this, rest :: Text
      ( this, rest ) = first ( c `Text.cons` ) $ Text.span ( \ x -> x /= '_' && ( isSymbol x || isPunctuation x ) ) cs
  = ( Symbolic this : ) <$> tokenise rest
  | otherwise
  = Left $ UnhandledCase { unhandled = ( c, cs ) }
tokenise _ = Right []

continuePastExponent :: ( Text, Text ) -> ( Text, Text )
continuePastExponent ( this, rest )
  | toLower ( Text.last this ) `elem` [ 'e', 'p' ]
  , Just ( r, rs ) <- Text.uncons rest
  , r `elem` [ '+', '-' ]
  , ( this', rest' ) <- Text.span isAlphaNum rs
  = ( this `Text.snoc` r <> this', rest' )
  | otherwise
  = ( this, rest )

parseNumber :: MonadParsec e Text m => m Tok
parseNumber = try ( chunk "0.f" $> Number 0 "f" ) <|> do
  value <- try ( chunk "0x" *> hexadecimal ) <|> scientific
  mbSuffix <- fmap ( maybe "" Text.pack ) . optional . some $ satisfy ( \ s -> toLower s `elem` ( "uflz" :: String ) )
  pure ( Number value mbSuffix )