------------------------------------------------------------------------------
--- Library for processing XML data.
---
--- Warning: the structure of this library is not stable and
--- might be changed in the future!
---
--- @author Michael Hanus
--- @version June 2018
------------------------------------------------------------------------------
module XML(XmlExp(..),Encoding(..),XmlDocParams(..),
tagOf,elemsOf,textOf,textOfXml,xtxt,xml,
showXmlDoc,showXmlDocWithParams,
writeXmlFile,writeXmlFileWithParams,parseXmlString,readXmlFile,
readUnsafeXmlFile,readFileWithXmlDocs,updateXmlFile) where
import Char
import Read
import List(intersperse)
------------------------------------------------------------------------------
--- The data type for representing XML expressions.
--- @cons XText - a text string (PCDATA)
--- @cons XElem - an XML element with tag field, attributes, and a list
--- of XML elements as contents
data XmlExp = XText String -- text string (PCDATA)
| XElem String [(String,String)] [XmlExp] -- (tag attrs contents)
deriving (Eq,Ord,Show)
------------------------------------------------------------------------------
--- The data type for encodings used in the XML document.
data Encoding = StandardEnc
| Iso88591Enc
-- Transform an encoding into its XML-attribute form
encoding2Attribute :: Encoding -> String
encoding2Attribute StandardEnc = ""
encoding2Attribute Iso88591Enc = "encoding=\"iso-8859-1\" "
-- Transform an encoding into its encoding-function
encoding2EncFunc :: Encoding -> String -> String
encoding2EncFunc StandardEnc = standardEncoding
encoding2EncFunc Iso88591Enc = iso88591Encoding
------------------------------------------------------------------------------
-- List of encoding maps
------------------------------------------------------------------------------
-- standard encoding map
standardEncoding :: String -> String
standardEncoding [] = []
standardEncoding (c:cs)
| c=='<' = "<" ++ standardEncoding cs
| c=='>' = ">" ++ standardEncoding cs
| c=='&' = "&" ++ standardEncoding cs
| c=='"' = """ ++ standardEncoding cs
| c=='\'' = "'" ++ standardEncoding cs
| ord c < 32 = "" ++ show (ord c) ++ ";" ++ standardEncoding cs
| ord c > 127 = "" ++ show (ord c) ++ ";" ++ standardEncoding cs
| otherwise = c : standardEncoding cs
-- iso-8859-1
iso88591Encoding :: String -> String
iso88591Encoding [] = []
iso88591Encoding (c:cs) =
if ord c `elem` iso88591list
then c : iso88591Encoding cs
else standardEncoding [c] ++ iso88591Encoding cs
-- iso-8859-1-list
-- not yet completed...
iso88591list :: [Int]
iso88591list = [192,193,194,195,196,197,198,199,200,201,202,203,204,205,207,
208,209,210,211,212,214,216,217,218,219,220,221,224,225,228,
229,226,227,230,231,233,232,235,234,236,237,239,240,241,248,
246,242,243,244,245,250,249,252,251,253,255]
------------------------------------------------------------------------------
--- The data type for XML document parameters.
--- @cons Enc - the encoding for a document
--- @cons DtdUrl - the url of the DTD for a document
data XmlDocParams = Enc Encoding
| DtdUrl String
-- get the right encoding (i.e., first or standard encoding if not present)
-- from a list of XmlDocParams
lookupEncoding :: [XmlDocParams] -> Encoding
lookupEncoding (Enc f:_) = f
lookupEncoding (DtdUrl _:l) = lookupEncoding l
lookupEncoding [] = StandardEnc
-- get the first DtdUrl from a list of XmlDocParams
lookupDtdUrl :: [XmlDocParams] -> String
lookupDtdUrl [] = ""
lookupDtdUrl (Enc _ : l) = lookupDtdUrl l
lookupDtdUrl (DtdUrl url : _) = url
-- does a XmlDocParam include a DtdUrl?
hasDtdUrl :: [XmlDocParams] -> Bool
hasDtdUrl [] = False
hasDtdUrl (DtdUrl _:_) = True
hasDtdUrl (Enc _:l) = hasDtdUrl l
------------------------------------------------------------------------------
-- useful selectors:
--- Returns the tag of an XML element (or empty for a textual element).
tagOf :: XmlExp -> String
tagOf (XElem tag _ _) = tag
tagOf (XText _) = ""
--- Returns the child elements an XML element.
elemsOf :: XmlExp -> [XmlExp]
elemsOf (XElem _ _ xexps) = xexps
elemsOf (XText _) = []
--- Extracts the textual contents of a list of XML expressions.
--- Useful auxiliary function when transforming XML expressions into
--- other data structures.
---
--- For instance,
--- textOf [XText "xy", XElem "a" [] [], XText "bc"] == "xy bc"
textOf :: [XmlExp] -> String
textOf = unwords . filter (not . null) . map textOfXmlExp
where
textOfXmlExp (XText s) = s
textOfXmlExp (XElem _ _ xs) = textOf xs
--- Included for backward compatibility, better use textOf
!
textOfXml :: [XmlExp] -> String
textOfXml = textOf
------------------------------------------------------------------------------
-- some useful abbreviations:
--- Basic text (maybe containing special XML chars).
xtxt :: String -> XmlExp
xtxt s = XText s
--- XML element without attributes.
xml :: String -> [XmlExp] -> XmlExp
xml t c = XElem t [] c
------------------------------------------------------------------------------
-- Pretty printer for XML documents
------------------------------------------------------------------------------
--- Writes a file with a given XML document.
writeXmlFile :: String -> XmlExp -> IO ()
writeXmlFile file xexp =
writeXmlFileWithParams file [Enc StandardEnc] xexp
--- Writes a file with a given XML document and XML parameters.
writeXmlFileWithParams :: String -> [XmlDocParams] -> XmlExp -> IO ()
writeXmlFileWithParams file ps xexp =
writeFile file (showXmlDocWithParams ps xexp)
------------------------------------------------------------------------------
--- Show an XML document in indented format as a string.
------------------------------------------------------------------------------
showXmlDoc :: XmlExp -> String
showXmlDoc xexp = showXmlDocWithParams [] xexp
showXmlDocWithParams :: [XmlDocParams] -> XmlExp -> String
showXmlDocWithParams ps (XElem root attrL xmlEL) =
"\n\n" ++
(if hasDtdUrl ps
then "\n\n"
else "") ++
showXmlExp 0 (encoding2EncFunc (lookupEncoding ps))
(XElem root attrL xmlEL)
showXmlDocWithParams _ (XText _) =
error "XML.showXmlDocWithParams: document without tags"
showXmlExp :: Int -> (String -> String) -> XmlExp -> String
showXmlExp i encFun (XText s) = xtab i ++ (encFun s) ++ "\n"
showXmlExp i encFun (XElem tag attrs xexps) =
xtab i ++ showXmlOpenTag tag attrs encFun ++
if null xexps
then " />\n"
else if length xexps == 1 && isXText (head xexps)
then let [XText s] = xexps
in ">" ++ (encFun s) ++ "" ++ tag ++ ">\n"
else ">\n" ++ showXmlExps (i+2) xexps encFun ++
xtab i ++ "" ++ tag ++ ">\n"
xtab :: Int -> String
xtab n = take n (repeat ' ')
showXmlOpenTag :: String -> [(String, a)] -> (a -> String) -> String
showXmlOpenTag tag attrs encFun =
"<" ++ tag ++ concat (map ((" "++) . attr2string) attrs)
where attr2string (attr,value) = attr ++ "=\""
++ (encFun value) ++ "\""
showXmlExps :: Int -> [XmlExp] -> (String -> String) -> String
showXmlExps encFun xexps i = concatMap (showXmlExp encFun i) xexps
isXText :: XmlExp -> Bool
isXText (XText _) = True
isXText (XElem _ _ _) = False
-- unquote special characters (<,>,&,',") in an XML string:
xmlUnquoteSpecials :: String -> String
xmlUnquoteSpecials [] = []
xmlUnquoteSpecials (c:cs)
| c=='&' = let (special,rest) = splitAtChar ';' cs
in xmlUnquoteSpecial special rest
| otherwise = c : xmlUnquoteSpecials cs
xmlUnquoteSpecial :: String -> String -> String
xmlUnquoteSpecial special cs
| special=="lt" = '<' : xmlUnquoteSpecials cs
| special=="gt" = '>' : xmlUnquoteSpecials cs
| special=="amp" = '&' : xmlUnquoteSpecials cs
| special=="quot" = '"' : xmlUnquoteSpecials cs
| special=="apos" = '\'' : xmlUnquoteSpecials cs
| special=="auml" = '\228' : xmlUnquoteSpecials cs
| special=="ouml" = '\246' : xmlUnquoteSpecials cs
| special=="uuml" = '\252' : xmlUnquoteSpecials cs
| special=="Auml" = '\196' : xmlUnquoteSpecials cs
| special=="Ouml" = '\214' : xmlUnquoteSpecials cs
| special=="Uuml" = '\220' : xmlUnquoteSpecials cs
| special=="szlig"= '\223' : xmlUnquoteSpecials cs
| otherwise = unquoteUnicode special ++ xmlUnquoteSpecials cs
unquoteUnicode :: String -> String
unquoteUnicode [] = []
unquoteUnicode (c:cs)
| c=='#' = case cs of
'x':cs' -> [chr (readHex cs')]
_ -> [chr (readInt cs)]
| otherwise = '&':(c:cs) ++ ";"
------------------------------------------------------------------------------
-- Parser for XML documents
------------------------------------------------------------------------------
--- Reads a file with an XML document and returns
--- the corresponding XML expression.
readXmlFile :: String -> IO XmlExp
readXmlFile file =
do xmlstring <- readFile file
let xexps = parseXmlString xmlstring
if null xexps
then error ("File "++file++" contains no XML document!")
else if null (tail xexps)
then return (head xexps)
else error ("File "++file++" contains more than one XML document!")
--- Tries to read a file with an XML document and returns
--- the corresponding XML expression, if possible.
--- If file or parse errors occur, Nothing is returned.
readUnsafeXmlFile :: String -> IO (Maybe XmlExp)
readUnsafeXmlFile file =
catch (readXmlFile file >>= return . Just) (\_ -> return Nothing)
--- Pretty prints the contents of an XML file.
showXmlFile :: String -> IO ()
showXmlFile file = readXmlFile file >>= putStr . showXmlDoc
--- Reads a file with an arbitrary sequence of XML documents and
--- returns the list of corresponding XML expressions.
readFileWithXmlDocs :: String -> IO [XmlExp]
readFileWithXmlDocs file = readFile file >>= return . parseXmlString
------------------------------------------------------------------------------
--- Transforms an XML string into a list of XML expressions.
--- If the XML string is a well structured document, the list
--- of XML expressions should contain exactly one element.
parseXmlString :: String -> [XmlExp]
parseXmlString s = fst (parseXmlTokens (scanXmlString s) Nothing)
-- parse a list of XML tokens into list of XML expressions:
-- parseXmlTokens tokens stoptoken = (xml_expressions, remaining_tokens)
parseXmlTokens :: [XmlExp] -> Maybe String -> ([XmlExp],[XmlExp])
parseXmlTokens [] Nothing = ([],[])
parseXmlTokens [] (Just _) = error "XML.parseXmlTokens: incomplete parse"
parseXmlTokens (XText s : xtokens) stop =
let (xexps, rem_xtokens) = parseXmlTokens xtokens stop
in (XText (xmlUnquoteSpecials s) : xexps, rem_xtokens)
parseXmlTokens (XElem (t:ts) args cont : xtokens) stop
| t == '<' && head ts /= '/'
= let (xexps1, xtokens1) = parseXmlTokens xtokens (Just ts)
(xexps, rem_xtokens) = parseXmlTokens xtokens1 stop
in (XElem ts args xexps1 : xexps, rem_xtokens)
| t == '<' && head ts == '/'
= if maybe False (==(tail ts)) stop
then ([], xtokens) -- stop this parser if appropriate stop token reached
else let (xexps, rem_xtokens) = parseXmlTokens xtokens stop
in (XElem ts args cont : xexps, rem_xtokens)
| otherwise = let (xexps, rem_xtokens) = parseXmlTokens xtokens stop
in (XElem (t:ts) args cont : xexps, rem_xtokens)
parseXmlTokens (XElem [] _ _ : _) _ =
error "XML.parseXmlTokens: incomplete parse"
-- scan an XML string into list of XML tokens:
-- here we reuse XML expressions for representing XML tokens:
-- single open or closing tags are returned by the scanner
-- as an XElem with no contents and first character '<' added to the tag field
scanXmlString :: String -> [XmlExp]
scanXmlString s = scanXml (dropBlanks s)
where
scanXml [] = []
scanXml (c:cs) = if c=='<'
then scanXmlElem cs
else let (initxt,remtag) = scanXmlText (c:cs)
in XText initxt : scanXml remtag
-- scan an XML text until next tag and remove superflous blanks:
scanXmlText :: String -> (String,String)
--original definition:
--scanXmlText s = let (s1,s2) = break (=='<') s
-- in (concat (intersperse " " (words s1)), s2)
--this implementation is more efficient:
scanXmlText [] = ([],[])
scanXmlText (c:cs) | c=='<' = ([],c:cs)
| isSpace c = let (txt,rem) = scanXmlText (dropBlanks cs)
in (if null txt then txt else ' ':txt, rem)
| otherwise = let (txt,rem) = scanXmlText cs
in (c:txt,rem)
-- scan an XML element:
scanXmlElem :: String -> [XmlExp]
scanXmlElem [] = []
scanXmlElem (c:cs)
| c=='!' = if take 2 cs == "--"
then scanXmlComment (drop 2 cs)
else scanXmlCData cs
| c=='?' = scanXmlProcInstr cs
| otherwise = scanXmlElemName [c] cs
scanXmlElemName :: String -> String -> [XmlExp]
scanXmlElemName ct [] = [XElem ('<':ct) [] []]
scanXmlElemName ct (c:cs)
| c=='>' = XElem ('<':ct) [] [] : scanXmlString cs
| isSpace c = let (attrs,rest) = parseAttrs (dropBlanks cs) in
if (head rest)=='/'
then XElem ct attrs [] : scanXmlString (drop 2 rest)
else XElem ('<':ct) attrs [] : scanXmlString (tail rest)
| c=='/' && head cs == '>' = XElem ct [] [] : scanXmlString (tail cs)
| otherwise = scanXmlElemName (ct++[c]) cs
-- scan (and drop) an XML comment:
scanXmlComment :: String -> [XmlExp]
scanXmlComment [] = []
scanXmlComment (c:cs) =
if c=='-' && take 2 cs == "->"
then scanXmlString (drop 2 cs)
else scanXmlComment cs
-- scan (and drop) an XML CDATA element (simplified version):
scanXmlCData :: String -> [XmlExp]
scanXmlCData cs =
let rest = dropCData cs
in if head rest == '>' then scanXmlString (tail rest)
else scanXmlCData rest
dropCData :: String -> String
dropCData [] = []
dropCData (c:cs)
| c=='[' = tail (dropWhile (/=']') cs) -- must be improved
| c=='>' = c:cs
| otherwise = dropCData cs
-- scan (and drop) an XML processing instructions:
scanXmlProcInstr :: String -> [XmlExp]
scanXmlProcInstr [] = []
scanXmlProcInstr (c:cs) =
if c=='?' && head cs == '>'
then scanXmlString (tail cs)
else scanXmlProcInstr cs
-- parse a string as an attribute list:
parseAttrs :: String -> ([(String,String)],String)
parseAttrs [] = ([],[])
parseAttrs (c:cs)
| isAlpha c = let (name,rest1) = splitAtChar '=' (c:cs)
(value,rest2) = splitAtChar '"' (tail rest1)
(rem_attrs,rem_inp) = parseAttrs (dropBlanks rest2)
in ((name,xmlUnquoteSpecials value):rem_attrs, rem_inp)
| otherwise = ([],c:cs)
-- drop blanks in input string:
dropBlanks :: String -> String
dropBlanks = dropWhile isSpace
-- split string at particular character, if possible:
splitAtChar :: Char -> String -> (String, String)
splitAtChar _ [] = ([],[])
splitAtChar char (c:cs) =
if c==char then ([],cs)
else let (first,rest) = splitAtChar char cs in (c:first,rest)
------------------------------------------------------------------------------
--- An action that updates the contents of an XML file by some transformation
--- on the XML document.
--- @param f - the function to transform the XML document in the file
--- @param file - the name of the XML file
updateXmlFile :: (XmlExp -> XmlExp) -> String -> IO ()
updateXmlFile xmltrans filename = do
xdoc <- readXmlFile filename
writeXmlFile filename $!! (xmltrans xdoc)
-- end of XML library