------------------------------------------------------------------------------ --- The Parser for Curry with Integrated Code --- ========================================= --- --- @author Jasper Sikorra - jsi@informatik.uni-kiel.de --- @version March 2014 ------------------------------------------------------------------------------ module CPP.ICode.ICodeParser(parse) where import Data.List import Data.Char import CPP.ICode.ParseTypes -- The identificators for Integrated Expressions s_ident :: Char s_ident = '`' e_ident :: Char e_ident = '\'' min_number :: Int min_number = 2 -- Identifiers for block comments and line comments s_block_comment :: String s_block_comment = "{-" e_block_comment :: String e_block_comment = "-}" s_line_comment :: String s_line_comment = "--" -- Error messages err_missing_quote :: String err_missing_quote = "Missing corresponding closing quote!" err_missing_block :: String err_missing_block = "Missing corresponding closing comment block delimiter " ++ e_block_comment ++ " !" err_missing_integ :: String err_missing_integ = "Integrated code not terminated with correct number " ++ "of " ++ [e_ident] ++ " chars" err_no_langtag :: String err_no_langtag = "Missing language tag for integrated code!" err_layout_code :: String err_layout_code = "Bad layout. Check amount of white spaces!" --- The parse function is the main function of the Code Integration Parser. --- The functions partitions the input in normal code and integrated --- expressions, disassembles the integrated code and removes its offset. --- @param filename - The filename of the input file --- @param input - The input string containing language with integrated code --- @return - A list of StandardTokens which contain either the common --- language or the DSL code with some extra information parse :: Filename -> String -> IO (PM [StandardToken]) parse fn input = return $ bindPM (parserL1 (initPos fn) input) parserL2 {- FIRST LEVEL OF THE PARSER The first level of the parser starts here. It recognizes the integrated expressions which are introduced by multiple s_idents and terminated by the same amount of e_idents. -} data L1Token = Normal Pos String | Exp Pos -- Integrated Expression with Postion Int -- number of start identifiers String -- content --- Function for the first level of the parser --- which recognizes integrated expressions and normal Curry expressions --- @param p - The starting position of the expression --- @param s - The input parserL1 :: Pos -> String -> PM [L1Token] parserL1 p s = parserL1Iter "" p p s parserL1Iter :: String -> Pos -> Pos -> String -> PM [L1Token] parserL1Iter acc accP _ "" = cleanPM (ifThenElse (null acc) [] [Normal accP (reverse acc)]) parserL1Iter acc accP p s@(c:cs) -- Parse Quotations | c == '\"' = passThrough parseQuotation (movePosByChar p '\"') cs -- Parse Quotations with single quotes | c == '\'' = passThrough parseSingleQuote (movePosByChar p '\'') cs | otherwise = -- Recognize line comments let (isprefco,rstco) = isPrefixAndDrop s_line_comment s in if isprefco -- Parse line comments then passThrough parseLineComment (movePosByString p s_line_comment) rstco else -- Recognize block comments let (isprefbc,rstbc) = isPrefixAndDrop s_block_comment s in if isprefbc -- Parse block comments then passThrough parseBlockComment (movePosByString p s_block_comment) rstbc else -- Recognize integrated expressions let (n,r) = countAndDrop s_ident s in if (min_number <= n) -- Parse integrated expressions then passThroughInt parseIntegrated (movePosByString p (replicate n s_ident)) n r -- Parse common code else parserL1Iter (c:acc) accP (movePosByChar p c) cs where passThrough f np st = if (null acc) then f p np st else liftPM ((:) (Normal accP (reverse acc))) $ f p np st passThroughInt f np i st = if (null acc) then f p np i st else liftPM ((:) (Normal accP (reverse acc))) $ f p np i st --- The function isPrefixAndDrop checks wether a list is a prefix of --- another and drops the prefix --- @l1 - The possible prefix --- @l2 - The list --- @return (b,l) where b states wether l1 is a prefix of l2 and --- l is the remaining list isPrefixAndDrop :: Eq a => [a] -> [a] -> (Bool,[a]) isPrefixAndDrop [] [] = (True,[]) isPrefixAndDrop (_:_) [] = (False,[]) isPrefixAndDrop [] (c:cs) = (True,(c:cs)) isPrefixAndDrop (c:cs) (d:ds) | c == d = isPrefixAndDrop cs ds | otherwise = (False,(d:ds)) --- The function countAndDrop drops all elements of one kind --- from the beginning of the list, counts how many elements are --- dropped and returns the remaining list --- @param c - The element which should be dropped --- @param l - The list --- @return (i,l) where i states the amount of dropped elements and --- l is the remaining list countAndDrop :: Eq a => a -> [a] -> (Int,[a]) countAndDrop c s = countAndDropIter 0 s where countAndDropIter n [] = (n,[]) countAndDropIter n (c1:cs) | c == c1 = countAndDropIter (n+1) cs | otherwise = (n,(c1:cs)) --- The function parseQuotation parses a quotation if a '\"' is already found --- @param pos1 - The starting position before the already found '\"' --- @param pos2 - The starting position after the already found '\"' --- @param s - The input --- @return tok - The parsed quote and the parsed rest of input parseQuotation :: Pos -> Pos -> String -> PM [L1Token] parseQuotation accP p s = let (qu,rst,fnd,pos) = findUnescapedChar p '\"' '\\' s in if fnd then liftPM ((:) (Normal accP ('\"':qu))) $ parserL1 pos rst else throwPM accP err_missing_quote --- The function findUnescapedChar finds a specific Char, which is not --- introduces by another Char in a String --- @param pos - The starting position of the string --- @param c1 - The Char which should be searched --- @param c2 - The Char which escapes c1 --- @param s - The input String --- @return (s1,s2,b,p) - s1 is the input before c1, s2 is the input after --- c1, b states wether c1 was found, pos points to the start of s2 findUnescapedChar :: Pos -> Char -> Char -> String -> (String,String,Bool,Pos) findUnescapedChar pos ch escaper s = findUnescapedCharHelper pos "" s where findUnescapedCharHelper p acc "" = (reverse acc,"",False,p) findUnescapedCharHelper p acc (c:[]) | c == ch = (reverse (c:acc),[],True,movePosByChar p c) | c /= ch = (reverse (c:acc),[],False,movePosByChar p c) findUnescapedCharHelper p acc (c:d:ds) | c == ch = (reverse (c:acc),(d:ds),True,movePosByChar p c) | c /= ch && c == escaper = findUnescapedCharHelper (movePosByString p [d,c]) (d:c:acc) ds | otherwise = findUnescapedCharHelper (movePosByChar p c) (c:acc) (d:ds) --- The function parseSingleQuote parses a single quote, but only if --- it escapes a quote or starting identifiers for integrated expressions. --- @param p1 - The staring position of the single quote --- @param p2 - The position after the first single quote --- @param s - The input --- @return - The parse result of the complete input parseSingleQuote :: Pos -> Pos -> String -> PM [L1Token] parseSingleQuote accP _ "" = cleanPM [Normal accP "\'"] parseSingleQuote accP p s@(c:_) = case s of ('\\':d:'\'':rst) -> liftPM ((:) (Normal accP ['\'','\\',d,'\''])) $ parserL1 (moveAbs (moveCol p 4) 4) rst (_:'\'':rst) -> liftPM ((:) (Normal accP ['\'',c,'\''])) $ parserL1 (moveAbs (moveCol p 3) 3) rst _ -> liftPM ((:) (Normal accP "\'")) $ parserL1 p s --- The function parseLineComment parses a line comment introduced --- by s_line_comment --- @param p1 - The position before the line comment --- @param p2 - The position after s_line_comment --- @param s - The input --- @return - The parsed input parseLineComment :: Pos -> Pos -> String -> PM [L1Token] parseLineComment accP p s = let (co,rst,fnd,pos) = findSequence p "\n" s in if fnd then liftPM ((:) (Normal accP (s_line_comment ++ co))) $ parserL1 pos rst else cleanPM [Normal accP (s_line_comment ++ co)] --- The function parseBlockComment parses a block comment introduced --- by s_block_comment and terminated by e_block_comment --- @param p1 - The position before the block comment --- @param p2 - The position after s_block_comment --- @param s - The input --- @return - The parsed output parseBlockComment :: Pos -> Pos -> String -> PM [L1Token] parseBlockComment accP p s = let (blc,rst,fnd,pos) = findSequence p e_block_comment s in if fnd then liftPM ((:) (Normal accP (s_block_comment ++ blc))) $ parserL1 pos rst else throwPM accP err_missing_block --- The function findSequence finds a sequence of characters in a string, --- and divides the string after the sequence --- @param p - The position at the start of the input --- @param seq - The sequence which should be found --- @param s - The input string --- @return (s1,s2,b,p) - s1 is the string before and with the seq --- s2 is the string after the seq --- b states wether seq was found --- p is the position of s2 findSequence :: Pos -> String -> String -> (String,String,Bool,Pos) findSequence p1 s1 s2 = findSequenceIter p1 [] s1 s2 where findSequenceIter p acc [] [] = (reverse acc,[],True,p) findSequenceIter p acc [] (c:cs) = (reverse acc,(c:cs),True,p) findSequenceIter p acc (_:_) [] = (reverse acc,[],False,p) findSequenceIter p acc (c:cs) (d:ds) | c == d = findSequenceIter (movePosByChar p d) (d:acc) cs ds | otherwise = findSequenceIter (movePosByChar p d) (d:acc) s1 ds --- The function parseIntegrated parses integrated expressions --- @param p1 - The position at the start of the integrated expression --- @param p2 - The position after all s_ident(s) introduction the exp. --- @param n - The number of s_ident(s) that introduced the exp. --- @param s - The input --- @return The parsed input parseIntegrated :: Pos -> Pos -> Int -> String -> PM [L1Token] parseIntegrated accP p n s = let (exp,rst,fnd,pos) = findReplicate p n e_ident s_ident s in if fnd then liftPM ((:) (Exp accP n exp))$ parserL1 pos rst else throwPM accP err_missing_integ --- The function findReplicate finds sequences of the same character in a --- string. This function is defined to find such sequences for two --- different characters. --- @param pos - The position at the start of the input --- @param n - The amount of same character in a row which should be searched --- @param c1 - The first character of which sequence should be searched --- @param c2 - The second character of which sequence should be searched --- @param s - The input --- @return (s1,s2,b,p) - s1 is the input before and with the sequence --- s2 is the input after the sequence --- b states wether a sequence of c1 was found --- p points to the position of s2 findReplicate :: Pos -> Int -> Char -> Char -> String -> (String,String,Bool,Pos) findReplicate p1 n e_i s_i s = findReplicateIter p1 [] 0 s where findReplicateIter p accS accN [] = if (accN == (-n)) then (s,s,False,p1) else (reverse (drop accN accS),[],(accN == n),p) findReplicateIter p accS accN (c2:cs) = if (accN == n) then (reverse (drop accN accS),(c2:cs),True,p) else if (accN == (-n)) then (s,s,False,p1) else if (e_i == c2) then if (accN < 0) then findReplicateIter (movePosByChar p c2)(c2:accS) 1 cs else findReplicateIter (movePosByChar p c2)(c2:accS) (accN+1) cs else if (s_i == c2) then if (accN < 0) then findReplicateIter (movePosByChar p c2) (c2:accS) (accN-1) cs else findReplicateIter (movePosByChar p c2) (c2:accS) (-1) cs else findReplicateIter (movePosByChar p c2) (c2:accS) 0 cs {- SECOND LEVEL OF THE PARSER The second level of the parser starts here. The integrated expressions are disassembled and converted to StandardTokens -} --- The function parserL2 converts Normals to StandardTokens and --- disassembles the Exps and converts them to StandardTokens --- @param input - A list of tokens from parser level 1 --- @result A list of StandardTokens utilizeable in the Translator parserL2 :: [L1Token] -> PM [StandardToken] parserL2 [] = cleanPM [] parserL2 (t:tks) | isNormal t = liftPM ((:) $ normalToStTk t) $ parserL2 tks | otherwise = bindPM (disassembleIntExp t) (\ptk -> liftPM ((:) ptk) $ parserL2 tks) isNormal :: L1Token -> Bool isNormal t = case t of (Normal _ _) -> True _ -> False --- normalToStTk converts Normal(s) to StandardToken(s) --- @param normal - A Normal L1Token --- @result - An equivalent StandardToken normalToStTk :: L1Token -> StandardToken normalToStTk (Normal p s) = StTk p p Nothing s normalToStTk (Exp _ _ _) = failed --- disassembleIntExp disassembles Exps, removes the offset from the --- DSL code and converts the outcome to StandardToken --- @param exp - The L1Token Exp which should be converted --- @return standardtk - A StandardToken equivalent to the exp but with removed --- offset in the DSL code disassembleIntExp :: L1Token -> PM StandardToken disassembleIntExp (Exp p i s) = let -- Recognize the language tag (langtag,rest1) = break isSpace s -- Recognize the whitespaces and the dsl (spaces,dsl) = span isSpace rest1 in if (null langtag) then throwPM p err_no_langtag else let -- calculate position of the DSL code posBeforeDSL = movePosByString p ((replicate i s_ident) ++ langtag ++ spaces) -- calculate the offset offset = getCol posBeforeDSL -- remove the offset from the DSL cleanDSL = removeOffset posBeforeDSL offset dsl in bindPM cleanDSL (\cDSL -> cleanPM (StTk p posBeforeDSL (Just langtag) cDSL)) disassembleIntExp (Normal _ _) = failed --- removeOffset removes the offset from a string --- @param n - Length of the offset --- @param s - The input --- @return - The input with removed offset removeOffset :: Pos -> Int -> String -> PM String removeOffset p n s = let linesOfDSL = lines s in case linesOfDSL of [] -> cleanPM "" [st] -> cleanPM st (x:xs) -> liftPM ((++) x) $ removeOffsetFromLines (movePosByString p (x ++ "\n")) n xs --- removeOffsetFromLines removes an offset of a certain length from --- each line of a string --- @param p - Position of the start of the input --- @param n - Length of the offset --- @param s - Input --- @return - The input with removed offset wrapped in the ParserMonad removeOffsetFromLines :: Pos -> Int -> [String] -> PM String removeOffsetFromLines _ _ [] = cleanPM "" removeOffsetFromLines p n (l:ls) = let rmoffl = removeOffsetFromLine p n l in bindPM rmoffl (\(np,s) -> liftPM ((++) ('\n':s)) $ removeOffsetFromLines (movePosByChar np '\n') n ls) --- removeOffsetFromLine is a helper function for removeOffsetFromLines --- @param p - Position pointing to the start of the line --- @param n - The length of the offset --- @param s - The input line --- @return - The input with removed offset wrapped in the ParserMonad removeOffsetFromLine :: Pos -> Int -> String -> PM (Pos,String) removeOffsetFromLine pos i st = removeOffsetFromLineIter pos st where removeOffsetFromLineIter p s = if (getCol p >= i) then cleanPM (p,s) else case s of [] -> cleanPM (p,"") -- The line is empty (c:cs) -> if (isWhiteSpace c) then removeOffsetFromLineIter (movePosByChar p c) cs else throwPM pos err_layout_code --- isWhiteSpace is similar to isSpace function but without newlines isWhiteSpace :: Char -> Bool isWhiteSpace c = c == ' ' || c == '\t'