------------------------------------------------------------------------------
--- Library to translate
--- [markdown documents](http://en.wikipedia.org/wiki/Markdown)
--- into HTML or LaTeX.
--- The slightly restricted subset of the markdown syntax recognized by
--- this implementation is
--- [documented in this page](http://www.informatik.uni-kiel.de/~pakcs/markdown_syntax.html).
---
--- @author Michael Hanus
--- @version December 2011
--- @category web
------------------------------------------------------------------------------

module Markdown(MarkdownDoc,MarkdownElem(..),fromMarkdownText,
                removeEscapes, markdownEscapeChars,
                markdownText2HTML,markdownText2CompleteHTML,
                markdownText2LaTeX,markdownText2LaTeXWithFormat,
                markdownText2CompleteLaTeX,
                formatMarkdownFileAsPDF,formatMarkdownInputAsPDF)
 where

import Char
import IO   (getContents)
import HTML
import HTML.Parser
import List
import System

-----------------------------------------------------------------------
--- A markdown document is a list of markdown elements.
type MarkdownDoc = [MarkdownElem]

--- The data type for representing the different elements
--- occurring in a markdown document.
--- @cons Text s - a simple text in a markdown document
--- @cons Emph s - an emphasized text in a markdown document
--- @cons Strong s - a strongly emphaszed text in a markdown document
--- @cons Strong s - a code string in a markdown document
--- @cons Code s - a code string in a markdown document
--- @cons HRef s u - a reference to URL `u` with text `s` in a markdown document
--- @cons Par md - a paragraph in a markdown document
--- @cons CodeBlock s - a code block in a markdown document
--- @cons UList mds - an unordered list in a markdown document
--- @cons OList mds - an ordered list in a markdown document
--- @cons Quote md - a quoted paragraph in a markdown document
--- @cons HRule - a hoirzontal rule in a markdown document
--- @cons Header l s - a level `l` header with title `s`
---                    in a markdown document
data MarkdownElem = Text String
                  | Emph String
                  | Strong String
                  | Code String
                  | HRef String String
                  | Par MarkdownDoc
                  | CodeBlock String
                  | UList [MarkdownDoc]
                  | OList [MarkdownDoc]
                  | Quote MarkdownDoc
                  | HRule
                  | Header Int String

-----------------------------------------------------------------------
--- The data type for representing the different elements
--- of a source markdown document. Basically, it is the same
--- as the final markdown document execept for single list items
--- that occur in the source but are combined into a list in
--- the final document.
data SourceMDElem = SMDText String
                  | SMDEmph String
                  | SMDStrong String
                  | SMDCode String
                  | SMDHRef String String
                  | SMDPar MarkdownDoc
                  | SMDCodeBlock String
                  | SMDUItem String
                  | SMDOItem String
                  | SMDQuote MarkdownDoc
                  | SMDHRule
                  | SMDHeader Int String

isSMDUItem :: SourceMDElem -> Bool
isSMDUItem md = case md of SMDUItem _ -> True
                           _          -> False

isSMDOItem :: SourceMDElem -> Bool
isSMDOItem md = case md of SMDOItem _ -> True
                           _          -> False

textOfItem :: SourceMDElem -> String
textOfItem (SMDUItem txt) = txt
textOfItem (SMDOItem txt) = txt

-----------------------------------------------------------------------
--- Parse markdown document from its textual representation.
fromMarkdownText :: String -> MarkdownDoc
fromMarkdownText = groupMarkDownElems . markdownText

-- Group adjacent item elements together in a markdown list.
groupMarkDownElems :: [SourceMDElem] -> MarkdownDoc
groupMarkDownElems [] = []
groupMarkDownElems (SMDUItem itxt :mds) = joinItems UList isSMDUItem [itxt] mds
groupMarkDownElems (SMDOItem itxt :mds) = joinItems OList isSMDOItem [itxt] mds
groupMarkDownElems (SMDText s : mds) = Text s : groupMarkDownElems mds
groupMarkDownElems (SMDEmph s : mds) = Emph s : groupMarkDownElems mds
groupMarkDownElems (SMDStrong s : mds) = Strong s : groupMarkDownElems mds
groupMarkDownElems (SMDCode s : mds) = Code s : groupMarkDownElems mds
groupMarkDownElems (SMDHRef s u : mds) = HRef s u : groupMarkDownElems mds
groupMarkDownElems (SMDPar md : mds) = Par md : groupMarkDownElems mds
groupMarkDownElems (SMDCodeBlock s : mds) = CodeBlock s : groupMarkDownElems mds
groupMarkDownElems (SMDQuote md : mds) = Quote md : groupMarkDownElems mds
groupMarkDownElems (SMDHRule : mds) = HRule : groupMarkDownElems mds
groupMarkDownElems (SMDHeader l s : mds) = Header l s : groupMarkDownElems mds

joinItems :: ([[MarkdownElem]] -> MarkdownElem) -> (SourceMDElem -> Bool)
          -> [String] -> [SourceMDElem] -> [MarkdownElem]
joinItems mdlcons _ items [] = [mdlcons (reverse (map fromMarkdownText items))]
joinItems mdlcons isitem items (md:mds) =
  if isitem md
  then joinItems mdlcons isitem (textOfItem md : items) mds
  else mdlcons (reverse (map fromMarkdownText items))
        : groupMarkDownElems (md:mds)

-- Basic reader for a markdown text.
markdownText :: String -> [SourceMDElem]
markdownText [] = []
markdownText txt@(_:_) = markdownLine fstline (dropFirst remtxt)
 where (fstline,remtxt) = break (=='\n') txt

-- Analyze the first line of a markdown text:
markdownLine :: String -> String -> [SourceMDElem]
markdownLine fstline remtxt
 | all isSpace fstline   = markdownText remtxt
 | isLevel1Line = SMDHeader 1 fstline : markdownText (dropFirst furtherlines)
 | isLevel2Line = SMDHeader 2 fstline : markdownText (dropFirst furtherlines)
 | take 1 fstline == "#" = tryMDHeader fstline remtxt
 | isHRule fstline       = SMDHRule : markdownText remtxt
 | take 2 fstline == "> " -- start of a quoted text
  = markdownQuote (drop 2 fstline) remtxt
 | blanklen > 0 -- four space indent for code
  = markdownCodeBlock blanklen (removeEscapes (drop blanklen fstline)) remtxt
 | uitemlen > 0 -- start of an unordered item
  = markdownItem SMDUItem uitemlen (drop uitemlen fstline) remtxt
 | nitemlen > 0 -- start of a numbered item
  = markdownItem SMDOItem nitemlen (drop nitemlen fstline) remtxt
 | otherwise = markdownPar fstline remtxt
 where
  (sndline,furtherlines) = break (=='\n') remtxt
  isLevel1Line = not (null sndline) && all (=='=') sndline
  isLevel2Line = not (null sndline) && all (=='-') sndline
  nitemlen = isNumberedItemLine fstline
  uitemlen = isUnorderedItemLine fstline
  blanklen = isCodeLine fstline

dropFirst :: [a] -> [a]
dropFirst s = if null s then [] else tail s

-- translate a header line
tryMDHeader :: String -> String -> [SourceMDElem]
tryMDHeader s rtxt =
  let (sharps,htxt) = break (==' ') s
      level = length sharps
   in if null htxt || level>6
      then markdownPar s rtxt
      else SMDHeader level (dropFirst htxt) : markdownText rtxt

-- is a line a horizontal rule:
isHRule :: String -> Bool
isHRule l =
  (all (\c -> isSpace c || c=='-') l && length (filter (=='-') l) > 3) ||
  (all (\c -> isSpace c || c=='*') l && length (filter (=='*') l) > 3)

-- check whether a line starts with an unordered item indicator ("* ")
-- and return indent:
isUnorderedItemLine :: String -> Int
isUnorderedItemLine s =
  let (blanks,nonblanks) = span (==' ') s
   in if take 2 nonblanks `elem` ["* ","- ","+ "] then length blanks+2 else 0

-- check whether a line starts with an indented number and return indent value:
isNumberedItemLine :: String -> Int
isNumberedItemLine s =
  let (blanks,nonblanks) = span (==' ') s
      numblanks = length blanks
   in checkNumber numblanks nonblanks
 where
  checkNumber indt numtxt =
    let (ns,brt) = break (==' ') numtxt
        (blanks,rtxt) = break (/=' ') brt
        nsl = length ns
     in if nsl>0 && all isDigit (take (nsl-1) ns) && ns!!(nsl-1)=='.' &&
           not (null blanks) && not (null rtxt)
        then indt+nsl+length blanks
        else 0

-- check whether a line starts with at least four blanks and return indent value:
isCodeLine :: String -> Int
isCodeLine s =
  let (blanks,nonblanks) = span (==' ') s
      numblanks = length blanks
   in if not (null nonblanks) && numblanks >= 4 then numblanks else 0

-- parse a paragraph (where the initial part of the paragraph is given
-- as the first argument):
markdownPar :: String -> String -> [SourceMDElem]
markdownPar ptxt txt
 | null txt || head txt `elem` ['\n'] ||
   uitemlen>0 || nitemlen>0
  = SMDPar (groupMarkDownElems (outsideMarkdownElem "" ptxt)) : markdownText txt
 | null remtxt
  = [SMDPar (groupMarkDownElems (outsideMarkdownElem "" (ptxt++'\n':fstline)))]
 | otherwise = markdownPar (ptxt++'\n':fstline) (tail remtxt)
 where
  (fstline,remtxt) = break (=='\n') txt
  nitemlen = isNumberedItemLine fstline
  uitemlen = isUnorderedItemLine fstline

-- parse a quoted section:
markdownQuote :: String -> String -> [SourceMDElem]
markdownQuote qtxt alltxt =
  let txt = if take 2 alltxt == ">\n" -- allow empty quote lines
            then "> " ++ drop 1 alltxt
            else alltxt
  in if take 2 txt == "> "
     then let (fstline,remtxt) = break (=='\n') (drop 2 txt)
           in if null remtxt
              then [SMDQuote (fromMarkdownText (qtxt++'\n':fstline))]
              else markdownQuote (qtxt++'\n':fstline) (tail remtxt)
     else SMDQuote (fromMarkdownText qtxt) : markdownText txt

-- parse a program block (where the indent and the initial code block is given):
markdownCodeBlock :: Int -> String -> String -> [SourceMDElem]
markdownCodeBlock n ctxt txt =
  if take n txt == "    "
  then
   let (fstline,remtxt) = break (=='\n') (drop n txt)
    in if null remtxt
       then [SMDCodeBlock (ctxt++'\n':removeEscapes fstline)]
       else markdownCodeBlock n (ctxt++'\n':removeEscapes fstline)
                                (tail remtxt)
  else SMDCodeBlock ctxt : markdownText txt

-- parse a markdown list item:
markdownItem :: (String -> SourceMDElem) -> Int -> String -> String
             -> [SourceMDElem]
markdownItem icons n itxt txt =
  if take n txt == take n (repeat ' ')
  then let (fstline,remtxt) = break (=='\n') (drop n txt)
        in if null remtxt
           then [icons (itxt++'\n':fstline)]
           else markdownItem icons n (itxt++'\n':fstline) (tail remtxt)
  else let (fstline,remtxt) = break (=='\n') txt
        in if all isSpace fstline
           then if null remtxt
                then [icons itxt]
                else markdownItem icons n (itxt++"\n") (tail remtxt)
           else icons itxt : markdownText txt

--- Remove the backlash of escaped markdown characters in a string.
removeEscapes :: String -> String
removeEscapes s = case s of
  []          -> []
  ('\\':c:cs) -> if c `elem` markdownEscapeChars
                 then c : removeEscapes cs
                 else '\\' : removeEscapes (c:cs)
  (c:cs)      -> c : removeEscapes cs

--- Escape characters supported by markdown.
markdownEscapeChars :: [Char]
markdownEscapeChars =
  ['\\','`','*','_','{','}','[',']','(',')','#','+','-','.',' ','!']

-- Analyze markdown text outside an element like emphasis, code, strong:
outsideMarkdownElem :: String -> String -> [SourceMDElem]
outsideMarkdownElem txt s = case s of
  [] -> addPrevious txt []
  ('\\':c:cs)  -> if c `elem` markdownEscapeChars
                  then outsideMarkdownElem (c:'\\':txt) cs
                  else outsideMarkdownElem ('\\':txt) (c:cs)
  ('*':'*':cs) -> addPrevious txt $ insideMarkdownElem "**" [] cs
  ('_':'_':cs) -> addPrevious txt $ insideMarkdownElem "__" [] cs
  ('*':cs)     -> addPrevious txt $ insideMarkdownElem "*" [] cs
  ('_':cs)     -> addPrevious txt $ insideMarkdownElem "_" [] cs
  ('`':cs)     -> addPrevious txt $ insideMarkdownElem "`" [] cs
  ('[':cs) -> addPrevious txt $ tryParseLink cs
  ('<':cs) -> if take 4 cs == "http"
              then addPrevious txt $ markdownHRef cs
              else outsideMarkdownElem ('<':txt) cs
  (c:cs)   -> outsideMarkdownElem (c:txt) cs

addPrevious :: String -> [SourceMDElem] -> [SourceMDElem]
addPrevious ptxt xs = if null ptxt then xs else SMDText (reverse ptxt) : xs

-- Try to parse a link of the form [link test](url)
tryParseLink :: String -> [SourceMDElem]
tryParseLink txt = let (linktxt,rtxt) = break (==']') txt in
  if null rtxt || null (tail rtxt) || (rtxt!!1 /= '(')
  then outsideMarkdownElem "[" txt
  else let (url,mtxt) = break (==')') (drop 2 rtxt)
        in if null mtxt
           then outsideMarkdownElem "[" txt
           else SMDHRef linktxt url : outsideMarkdownElem "" (tail mtxt)

markdownHRef :: String -> [SourceMDElem]
markdownHRef txt = let (url,rtxt) = break (=='>') txt in
  if null rtxt
  then outsideMarkdownElem "" ('<':txt)
  else SMDHRef url url : outsideMarkdownElem "" (dropFirst rtxt)

insideMarkdownElem :: String -> String -> String -> [SourceMDElem]
insideMarkdownElem marker etext s =
  if marker `isPrefixOf` s
  then text2MDElem marker (reverse etext)
        : outsideMarkdownElem "" (drop (length marker) s)
  else case s of
        []     -> [SMDText (marker ++ reverse etext)] -- end marker missing
        ('\\':c:cs) -> if c `elem` markdownEscapeChars
                       then insideMarkdownElem marker (c:'\\':etext) cs
                       else insideMarkdownElem marker ('\\':etext) (c:cs)
        (c:cs)      -> insideMarkdownElem marker (c:etext) cs

text2MDElem :: String -> String -> SourceMDElem
text2MDElem marker txt = case marker of
  "**" -> SMDStrong txt
  "__" -> SMDStrong txt
  "*"  -> SMDEmph txt
  "_"  -> SMDEmph txt
  "`"  -> SMDCode txt
  _    -> error $ "Markdown.text2MDElem: unknown marker \""++marker++"\""


-----------------------------------------------------------------------
-- Translate markdown document to HTML.

mdDoc2html :: MarkdownDoc -> [HtmlExp]
mdDoc2html = map mdElem2html

-- translate markdown special characters in text to HTML
mdtxt2html :: String -> HtmlExp
mdtxt2html s = HtmlText (removeEscapes s)

mdElem2html :: MarkdownElem -> HtmlExp
mdElem2html (Text s) = mdtxt2html s
mdElem2html (Emph s) = emphasize [mdtxt2html s]
mdElem2html (Strong s) = HtmlStruct "strong" [] [mdtxt2html s]
mdElem2html (HRef s url) = if s==url
                             then href url [code [mdtxt2html s]]
                             else href url [mdtxt2html s]
mdElem2html (Code s) = code [HtmlText (htmlQuote s)]
mdElem2html (CodeBlock s) = verbatim s
mdElem2html (Quote md) = HtmlStruct "blockquote" [] (mdDoc2html md)
mdElem2html (Par md) = par (mdDoc2html md)
mdElem2html (UList mds) = ulist (map mdDoc2htmlWithoutPar mds)
mdElem2html (OList mds) = olist (map mdDoc2htmlWithoutPar mds)
mdElem2html HRule = hrule
mdElem2html (Header l s) = HtmlStruct ('h':show l) [] [mdtxt2html s]

mdDoc2htmlWithoutPar :: MarkdownDoc -> [HtmlExp]
mdDoc2htmlWithoutPar mdoc = case mdoc of
  [] -> []
  [Par md] -> mdDoc2html md
  [md] -> [mdElem2html md]
  (Par md1:md2:mds) -> mdDoc2html md1 ++ breakline : 
                         mdDoc2htmlWithoutPar (md2:mds)
  (md1:md2:mds) -> mdElem2html md1 : mdDoc2htmlWithoutPar (md2:mds)

--- Translate a markdown text into a (partial) HTML document.
markdownText2HTML :: String -> [HtmlExp]
markdownText2HTML = mdDoc2html . fromMarkdownText

--- Translate a markdown text into a complete HTML text
--- that can be viewed as a standalone document by a browser.
--- The first argument is the title of the document.
markdownText2CompleteHTML :: String -> String -> String
markdownText2CompleteHTML title mdtxt =
   showHtmlPage (page title (markdownText2HTML mdtxt))

-----------------------------------------------------------------------
--- Translate markdown document to a LaTeX string where the first
--- argument is a function to translate the basic text occurring
--- in markdown elements to a LaTeX string.
--- Note that the basic text (execept for code blocks)
--- contains escaped markdown characters
--- that needs also to be removed by the translation function.
mdDoc2latex :: (String->String) -> MarkdownDoc -> String
mdDoc2latex txt2latex = concatMap (mdElem2latex txt2latex)

mdElem2latex :: (String->String) -> MarkdownElem -> String
mdElem2latex txt2latex (Text s) = txt2latex s
mdElem2latex txt2latex (Emph s) = "\\emph{"++txt2latex s++"}"
mdElem2latex txt2latex (Strong s) = "\\textbf{"++txt2latex s++"}"
mdElem2latex txt2latex (HRef s url) =
  if s==url then "\\url{"++url++"}"
            else "\\href{"++url++"}{"++txt2latex s++"}"
mdElem2latex txt2latex (Code s) = "\\texttt{"++txt2latex (htmlQuote s)++"}"
mdElem2latex _ (CodeBlock s) =
  "\\begin{verbatim}\n"++s++"\n\\end{verbatim}\n"
mdElem2latex txt2latex (Quote md) =
  "\\begin{quote}\n"++mdDoc2latex txt2latex md++"\\end{quote}\n"
mdElem2latex txt2latex (Par md) = mdDoc2latex txt2latex md++"\n\n"
mdElem2latex txt2latex (UList s) =
  "\\begin{itemize}"++
    concatMap (\i -> "\n\\item\n"++mdDoc2latex txt2latex i) s ++
  "\\end{itemize}\n"
mdElem2latex txt2latex (OList s) =
  "\\begin{enumerate}"++
    concatMap (\i -> "\n\\item\n"++mdDoc2latex txt2latex i) s ++
  "\\end{enumerate}\n"
mdElem2latex _ HRule = "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n"
mdElem2latex txt2latex (Header l s) = case l of
  1 -> "\\section{"++txt2latex s++"}\n\n"
  2 -> "\\subsection{"++txt2latex s++"}\n\n"
  3 -> "\\subsubsection{"++txt2latex s++"}\n\n"
  4 -> "\\paragraph{"++txt2latex s++"}\n\n"
  5 -> "\\textbf{"++txt2latex s++"}\n\n"
  _ -> "\\emph{"++txt2latex s++"}\n\n"


--- Translator for basic text to LaTeX.
--- markdown escapes are removed and possible HTML markups
--- are translated to LaTeX.
html2latex :: String -> String
html2latex = showLatexExps . parseHtmlString . removeEscapes

--- Translate a markdown text into a (partial) LaTeX document.
--- All characters with a special meaning in LaTeX, like dollar
--- or ampersand signs, are quoted.
markdownText2LaTeX :: String -> String
markdownText2LaTeX = mdDoc2latex html2latex . fromMarkdownText

--- Translate a markdown text into a (partial) LaTeX document
--- where the first argument is a function to translate the basic text
--- occurring in markdown elements to a LaTeX string.
--- For instance, one can use a translation operation
--- that supports passing mathematical formulas in LaTeX style
--- instead of quoting all special characters.
markdownText2LaTeXWithFormat :: (String->String) -> String -> String
markdownText2LaTeXWithFormat txt2latex = mdDoc2latex txt2latex . fromMarkdownText

--- Translate a markdown text into a complete LaTeX document
--- that can be formatted as a standalone document.
markdownText2CompleteLaTeX :: String -> String
markdownText2CompleteLaTeX mds =
  latexHeader ++ mdDoc2latex html2latex (fromMarkdownText mds) ++
  "\\end{document}\n"

latexHeader :: String
latexHeader =
 "\\documentclass{article}\n"++
 "\\usepackage[utf8x]{inputenc}\n"++
 "\\usepackage{url}\n"++
 "\\usepackage[breaklinks=true,unicode=true]{hyperref}\n"++
 "\\setlength{\\parindent}{0pt}\n"++
 "\\setlength{\\parskip}{6pt plus 2pt minus 1pt}\n"++
 "\\setcounter{secnumdepth}{0}\n"++
 "\\begin{document}\n"


--- Format the standard input (containing markdown text) as PDF.
formatMarkdownInputAsPDF :: IO ()
formatMarkdownInputAsPDF = getContents >>= formatMarkdownAsPDF

--- Format a file containing markdown text as PDF.
formatMarkdownFileAsPDF :: String -> IO ()
formatMarkdownFileAsPDF fname = readFile fname >>= formatMarkdownAsPDF

--- Format a file containing markdown text as PDF.
formatMarkdownAsPDF :: String -> IO ()
formatMarkdownAsPDF mdstr = do
  pid <- getPID
  let tmp = "tmp_"++show pid
  writeFile (tmp++".tex") (markdownText2CompleteLaTeX mdstr)
  pdflatexFile tmp

-- Format a file tmp.tex with pdflatex and show the result
pdflatexFile :: String -> IO ()
pdflatexFile tmp = do
  system $ "pdflatex \'\\nonstopmode\\input{"++tmp++".tex}\'"
  system $ "/bin/rm -f "++tmp++".tex "++tmp++".aux "++tmp++".log "++tmp++".out"
  system $ "evince "++tmp++".pdf"
  system $ "/bin/rm -f "++tmp++".pdf"
  done

-----------------------------------------------------------------------