docster/src/Docster/Compiler.hs
Willem van den Ende 6b49db5801 refactor(Compiler): eliminate Maybe Text indirection in CompilationStrategy
- csWriter now writes files directly (WriterOptions -> Pandoc -> FilePath -> IO (Either DocsterError ()))
- csPostProcess no longer takes text content (String -> IO (Either DocsterError ()))
- Each strategy owns its complete output logic (PDF/HTML/DOCX)
- Remove generateOutputFile (eliminated unused CompilationStrategy parameter)
- Pipeline: generateOutputM >>= processOutput => writeAndProcessOutput (1 step)
- Pandoc 3.7 compatibility: writeDocx returns ByteString instead of ()

.gitignore: exclude stack/cabal config and generated files
2026-04-30 18:13:00 +01:00

340 lines
13 KiB
Haskell

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE LambdaCase #-}
-- | Document compilation functionality for PDF and HTML output
module Docster.Compiler
( -- * Compilation Functions
compileToPDF
, compileToHTML
, compileToDOCX
) where
import Docster.Types
( DocsterError(..), OutputFormat(..), SourceDir(..), OutputDir(..), OutputPath(..)
, DiagramConfig(..), computeOutputDir, ensureOutputDir
)
import Text.Pandoc.Writers ()
import qualified Data.ByteString.Lazy as BSL
import Docster.Transform (transformDocument)
import Docster.LaTeX (latexTemplate)
import Text.Pandoc
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import System.FilePath (takeDirectory, takeBaseName, replaceExtension, (</>), (<.>))
import System.Process (callProcess, readProcessWithExitCode)
import System.IO.Temp (withSystemTempDirectory)
import System.Directory (copyFile, doesFileExist)
import System.Exit (ExitCode(..))
import Control.Exception (throwIO)
import Control.Monad (void)
import Control.Monad.Trans.Except (ExceptT, runExceptT, throwE)
import Control.Monad.Trans.Reader (ReaderT, runReaderT, asks)
import Control.Monad.Trans.Class (lift)
import Control.Monad.IO.Class (liftIO)
import Data.Maybe (mapMaybe)
import Data.Char (ord)
-- | Success indicator for user feedback
successEmoji :: Text
successEmoji = ""
-- | Compilation context for pipeline operations
data CompilationContext = CompilationContext
{ ccStrategy :: CompilationStrategy
, ccInputPath :: FilePath
, ccOutputPath :: FilePath
, ccDocName :: Text
, ccReaderOptions :: ReaderOptions
, ccConfig :: DiagramConfig
}
-- | Monad stack for compilation pipeline
type CompilationM = ReaderT CompilationContext (ExceptT DocsterError IO)
-- | Strategy pattern: Record of format-specific operations
data CompilationStrategy = CompilationStrategy
{ -- | Format for diagram configuration
csOutputFormat :: OutputFormat
-- | Pandoc writer: writes output directly to the given file path
, csWriter :: WriterOptions -> Pandoc -> FilePath -> IO (Either DocsterError ())
-- | Post-processing after write (PDF→xelatex, HTML→open browser, DOCX→noop)
, csPostProcess :: String -> IO (Either DocsterError ())
-- | Success message formatter
, csSuccessMessage :: String -> Text
}
-- | PDF compilation strategy
pdfStrategy :: CompilationStrategy
pdfStrategy = CompilationStrategy
{ csOutputFormat = PDF
, csWriter = \opts doc path -> do
result <- runIO (writeLaTeX opts doc)
case result of
Left err -> return $ Left $ FileError $ "LaTeX write failed: " <> T.pack (show err)
Right latex -> do
TIO.writeFile path (latexTemplate latex)
return $ Right ()
, csPostProcess = processPDFOutput
, csSuccessMessage = \path -> successEmoji <> " PDF written to " <> T.pack path
}
-- | HTML compilation strategy
htmlStrategy :: CompilationStrategy
htmlStrategy = CompilationStrategy
{ csOutputFormat = HTML
, csWriter = \opts doc path -> do
result <- runIO (writeHtml5String opts doc)
case result of
Left err -> return $ Left $ FileError $ "HTML write failed: " <> T.pack (show err)
Right html -> do
TIO.writeFile path html
return $ Right ()
, csPostProcess = processHTMLOutput
, csSuccessMessage = \path -> successEmoji <> " HTML written to " <> T.pack path
}
-- | DOCX compilation strategy (Pandoc writes file directly)
docxStrategy :: CompilationStrategy
docxStrategy = CompilationStrategy
{ csOutputFormat = DOCX
, csWriter = \opts doc path -> do
result <- runIO (writeDocx opts doc)
case result of
Left err -> return $ Left $ FileError $ "DOCX generation failed: " <> T.pack (show err)
Right docxBS -> do
BSL.writeFile path docxBS
return $ Right ()
, csPostProcess = \_ -> return $ Right () -- no post-processing needed
, csSuccessMessage = \path -> successEmoji <> " DOCX written to " <> T.pack path
}
-- | Parse LaTeX log content to extract meaningful error messages
parseLatexErrors :: Text -> Text
parseLatexErrors logContent =
let logLines = T.lines logContent
missingChars = extractMissingChars logLines
overfullBoxes = extractOverfullBoxes logLines
undefinedCommands = extractUndefinedCommands logLines
fatalErrors = extractFatalErrors logLines
errorCount = length missingChars + length overfullBoxes + length undefinedCommands + length fatalErrors
summary = if errorCount == 0
then "Unknown LaTeX error occurred."
else T.unlines $ filter (not . T.null) [
if not (null fatalErrors) then "Fatal errors:\n" <> T.unlines (map ("" <>) fatalErrors) else "",
if not (null undefinedCommands) then "Undefined commands:\n" <> T.unlines (map ("" <>) undefinedCommands) else "",
if not (null missingChars) then "Missing Unicode characters:\n" <> T.unlines (map ("" <>) missingChars) else "",
if not (null overfullBoxes) then T.pack (show (length overfullBoxes)) <> " overfull boxes (layout warnings)" else ""
]
in summary
-- | Extract missing character warnings from LaTeX log
extractMissingChars :: [Text] -> [Text]
extractMissingChars = mapMaybe extractChar
where
extractChar line
| "Missing character:" `T.isInfixOf` line =
case T.splitOn "(U+" line of
[_, rest] -> case T.splitOn ")" rest of
(unicode:_) -> Just $ "U+" <> unicode <> " " <> extractCharContext line
_ -> Nothing
_ -> Nothing
| otherwise = Nothing
extractCharContext line =
case T.splitOn " in font " line of
[_, rest] -> "in " <> T.takeWhile (/= ':') rest
_ -> ""
-- | Extract overfull box warnings
extractOverfullBoxes :: [Text] -> [Text]
extractOverfullBoxes = mapMaybe extractBox
where
extractBox line
| "Overfull \\hbox" `T.isInfixOf` line = Just $ T.takeWhile (/= '\n') line
| otherwise = Nothing
-- | Extract undefined command errors
extractUndefinedCommands :: [Text] -> [Text]
extractUndefinedCommands = mapMaybe extractUndef
where
extractUndef line
| "Undefined control sequence" `T.isInfixOf` line = Just line
| otherwise = Nothing
-- | Extract fatal LaTeX errors
extractFatalErrors :: [Text] -> [Text]
extractFatalErrors = mapMaybe extractFatal
where
extractFatal line
| "! " `T.isPrefixOf` line && not ("Missing character:" `T.isInfixOf` line) = Just $ T.drop 2 line
| otherwise = Nothing
-- | Process PDF output: direct XeLaTeX compilation (LaTeX already written by csWriter)
processPDFOutput :: String -> IO (Either DocsterError ())
processPDFOutput outputPath = do
let logOutputPath = replaceExtension outputPath "log"
-- Use temporary directory for LaTeX compilation
withSystemTempDirectory "docster-latex" $ \tempDir -> do
let texFile = tempDir </> "document.tex"
pdfFile = tempDir </> "document.pdf"
logFile = tempDir </> "document.log"
-- Run XeLaTeX compilation
(exitCode, _stdout, stderr) <- readProcessWithExitCode "xelatex"
[ "-output-directory=" <> tempDir
, "-interaction=nonstopmode" -- Don't stop on errors
, texFile
] ""
-- Always copy log file to output location for debugging
logExists <- doesFileExist logFile
logContent <- if logExists
then TIO.readFile logFile
else return (T.pack stderr)
TIO.writeFile logOutputPath logContent
case exitCode of
ExitSuccess -> do
-- Check if PDF was actually generated
pdfExists <- doesFileExist pdfFile
if pdfExists
then do
-- Copy the generated PDF to the final location
copyFile pdfFile outputPath
return $ Right ()
else do
return $ Left $ PDFGenerationError $
"PDF file not generated despite successful exit code.\n" <>
"Full LaTeX log written to: " <> T.pack logOutputPath
ExitFailure code -> do
-- LaTeX compilation failed - parse log for meaningful errors
let errorSummary = parseLatexErrors logContent
return $ Left $ PDFGenerationError $
"❌ LaTeX compilation failed (exit code " <> T.pack (show code) <> "):\n" <>
errorSummary <> "\n\n" <>
"Full LaTeX log written to: " <> T.pack logOutputPath
-- | Process HTML output: open browser (HTML already written by csWriter)
processHTMLOutput :: String -> IO (Either DocsterError ())
processHTMLOutput outputPath = do
-- Open the generated HTML file in browser for verification
putStrLn $ "🌐 Opening " <> outputPath <> " in browser for error checking..."
void $ callProcess "open" [outputPath]
return $ Right ()
-- | Helper function to lift IO (Either DocsterError a) into CompilationM
liftEitherM :: IO (Either DocsterError a) -> CompilationM a
liftEitherM action = do
result <- liftIO action
case result of
Left err -> lift $ throwE err
Right value -> return value
-- | Strip ANSI escape sequences (CSI codes like color/style) from text.
-- These appear in copy-pasted terminal output and break LaTeX compilation.
stripAnsiCodes :: Text -> Text
stripAnsiCodes input = case T.break (== '\x1b') input of
(before, rest)
| T.null rest -> before
| otherwise -> before <> stripAnsiCodes (skipEscape (T.tail rest))
where
-- Skip an ESC sequence: ESC [ <params> <final byte>
skipEscape t
| T.null t = t
| T.head t == '[' = skipCSIParams (T.tail t)
| otherwise = T.tail t -- non-CSI escape: skip one char after ESC
-- Skip CSI parameter/intermediate bytes until final byte (0x40-0x7E)
skipCSIParams t
| T.null t = t
| let c = ord (T.head t), c >= 0x40 && c <= 0x7E = T.tail t -- final byte, consume it
| otherwise = skipCSIParams (T.tail t)
-- | Pipeline step: Read content from input file
readContent :: CompilationM Text
readContent = do
inputPath <- asks ccInputPath
liftIO $ stripAnsiCodes <$> TIO.readFile inputPath
-- | Pipeline step: Parse markdown content into Pandoc AST
parseDocument :: Text -> CompilationM Pandoc
parseDocument content = do
readerOptions <- asks ccReaderOptions
liftEitherM $ parseMarkdown readerOptions content
-- | Pipeline step: Transform document (process Mermaid diagrams)
transformDocumentM :: Pandoc -> CompilationM Pandoc
transformDocumentM pandoc = do
config <- asks ccConfig
docName <- asks ccDocName
liftEitherM $ transformDocument config docName pandoc
-- | Pipeline step: Write output and post-process (format-specific)
writeAndProcessOutput :: Pandoc -> CompilationM ()
writeAndProcessOutput pandoc = do
strategy <- asks ccStrategy
outputPath <- asks ccOutputPath
liftEitherM $ csWriter strategy def pandoc outputPath
liftEitherM $ (csPostProcess strategy) outputPath
-- | Pipeline step: Print success message
printSuccess :: CompilationM ()
printSuccess = do
strategy <- asks ccStrategy
outputPath <- asks ccOutputPath
liftIO $ putStrLn $ T.unpack $ csSuccessMessage strategy outputPath
-- | Higher-order compilation function that takes a strategy and executes the pipeline
compileWithStrategy :: CompilationStrategy -> SourceDir -> OutputDir -> Text -> OutputPath -> OutputPath -> IO (Either DocsterError ())
compileWithStrategy strategy sourceDir outputDir docName (OutputPath inputPath) (OutputPath outputPath) = do
let readerOptions = def { readerExtensions = getDefaultExtensions "markdown" }
config = DiagramConfig sourceDir outputDir (csOutputFormat strategy)
context = CompilationContext strategy inputPath outputPath docName readerOptions config
pipeline = readContent >>= parseDocument >>= transformDocumentM >>= writeAndProcessOutput >> printSuccess
runExceptT $ runReaderT pipeline context
-- | Parse markdown with error handling
parseMarkdown :: ReaderOptions -> Text -> IO (Either DocsterError Pandoc)
parseMarkdown readerOptions content = do
pandocResult <- runIO $ readMarkdown readerOptions content
return $ case pandocResult of
Left err -> Left $ FileError $ "Failed to parse markdown: " <> T.pack (show err)
Right pandoc -> Right pandoc
-- | Compile markdown to PDF using XeLaTeX
compileToPDF :: FilePath -> IO ()
compileToPDF = compileWithFormat pdfStrategy "pdf"
-- | Compile markdown to HTML
compileToHTML :: FilePath -> IO ()
compileToHTML = compileWithFormat htmlStrategy "html"
-- | Compile markdown to DOCX
compileToDOCX :: FilePath -> IO ()
compileToDOCX = compileWithFormat docxStrategy "docx"
-- | Higher-order function to compile with any format strategy
compileWithFormat :: CompilationStrategy -> String -> FilePath -> IO ()
compileWithFormat strategy extension path = do
let sourceDir = SourceDir $ takeDirectory path
outputDir = computeOutputDir path
OutputDir outDirPath = outputDir
baseName = takeBaseName path
docName = T.pack baseName
outputPath = OutputPath $ outDirPath </> baseName <.> extension
-- Ensure output directory exists before compilation
ensureOutputDir outputDir
result <- compileWithStrategy strategy sourceDir outputDir docName (OutputPath path) outputPath
case result of
Left err -> throwIO err
Right _ -> return ()