-- Path: blob/main/src/resources/pandoc/datadir/readqmd.lua
-- 12926 views
-- read qmd with quarto syntax extensions and produce quarto's extended AST
-- Copyright (C) 2023-2024 Posit Software, PBC
--
-- Originally by Albert Krewinkel

local md_shortcode = require("lpegshortcode")
local md_fenced_div = require("lpegfenceddiv")

-- Support the same format extensions as pandoc's Markdown reader
Extensions = pandoc.format.extensions 'markdown'

-- Maximum number of random placeholders tried for a single invalid tag
-- before giving up (matches the count quoted in the error message below).
local MAX_REPLACEMENT_ATTEMPTS = 100

--- Build a random string of lowercase ASCII letters.
-- We replace invalid tags with random strings of the same size
-- to safely allow code blocks inside pipe tables.
-- Note that we can't use uppercase letters here
-- because pandoc canonicalizes classes to lowercase.
-- @tparam number size number of characters to generate
-- @treturn string a string of exactly `size` letters from a-z
local function random_string(size)
  local chars = "abcdefghijklmnopqrstuvwxyz"
  local lst = {}
  for i = 1, size do
    local ix = math.random(1, #chars)
    lst[i] = string.sub(chars, ix, ix)
  end
  return table.concat(lst, "")
end

--- Collect the distinct brace-delimited tags that follow a code fence.
-- A tag is text such as `{python}` or `{{python}}` appearing right after
-- three or more backticks at the start of the input or of a line.
-- @tparam string str text to scan
-- @treturn table sequence of unique tag strings, in order of first match
local function find_invalid_tags(str)
  -- [^.=\n]
  --   we disallow "." to avoid catching {.python}
  --   we disallow "=" to avoid catching {foo="bar"}
  --   we disallow "\n" to avoid multiple lines

  -- no | in lua patterns...

  -- (c standard, 7.4.1.10, isspace function)
  -- %s catches \n and \r, so we must use [ \t\f\v] instead

  local patterns = {
    "^[ \t\f\v]*(```+[ \t\f\v]*)(%{+[^.=\n\r]*%}+)",
    "\n[ \t\f\v]*(```+[ \t\f\v]*)(%{+[^.=\n\r]+%}+)"
  }

  -- Try each pattern in turn from position `init`; the first pattern that
  -- matches wins. Returns the end index of the match and the captured tag,
  -- or nil when neither pattern matches.
  local function find_it(init)
    for _, pattern in ipairs(patterns) do
      local range_start, range_end, _, tag = str:find(pattern, init)
      if range_start ~= nil then
        return range_end, tag
      end
    end
    return nil
  end

  local tag_set = {}
  local tags = {}
  local range_end, tag = find_it(1)
  while tag ~= nil do
    if not tag_set[tag] then
      tag_set[tag] = true
      table.insert(tags, tag)
    end
    -- resume scanning right after the previous match
    range_end, tag = find_it(range_end + 1)
  end
  return tags
end

--- Replace every invalid tag after a code fence with a random placeholder.
-- Each placeholder has the same length as the tag it replaces and does not
-- occur anywhere else in the text at the time it is chosen.
-- Exits the process (os.exit(1)) if no unused placeholder can be found.
-- @tparam string str text to process
-- @treturn string the text with tags replaced
-- @treturn table map from placeholder to the original tag
local function escape_invalid_tags(str)
  local tags = find_invalid_tags(str)
  -- we must now replace the tags in a careful order. Specifically,
  -- we can't replace a key that's a substring of a larger key without
  -- first replacing the larger key.
  --
  -- ie. if we replace {python} before {{python}}, Bad Things Happen.
  -- so we sort the tags by descending size, which suffices
  table.sort(tags, function(a, b) return #b < #a end)

  local replacements = {}
  for _, k in ipairs(tags) do
    -- Draw random candidates until one is absent from the text (plain
    -- search). Success is tracked explicitly so that a candidate found
    -- on the final attempt is not mistaken for a failure.
    local replacement
    for _ = 1, MAX_REPLACEMENT_ATTEMPTS do
      local candidate = random_string(#k)
      if str:find(candidate, 1, true) == nil then
        replacement = candidate
        break
      end
    end
    if replacement == nil then
      -- luacov: disable
      print("Internal error, could not find safe replacement for "..k.." after 100 tries")
      print("Please file a bug at https://github.com/quarto-dev/quarto-cli")
      os.exit(1)
      -- luacov: enable
    end
    -- replace all lua special pattern characters with their
    -- escaped versions
    local safe_pattern = k:gsub("([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
    replacements[replacement] = k
    local patterns = {
      "^([ \t\f\v]*```+[ \t\f\v]*)" .. safe_pattern,
      "(\n[ \t\f\v]*```+[ \t\f\v]*)" .. safe_pattern
    }

    -- the placeholder consists of a-z only, so it is safe to splice
    -- directly into the replacement string
    str = str:gsub(patterns[1], "%1" .. replacement):gsub(patterns[2], "%1" .. replacement)
  end
  return str, replacements
end

--- Restore the original tags in place of their placeholders.
-- @tparam string str text possibly containing placeholders
-- @tparam table tags map from placeholder to original tag, as returned
--   by escape_invalid_tags
-- @treturn string the text with every placeholder replaced by its tag
local function unescape_invalid_tags(str, tags)
  for replacement, k in pairs(tags) do
    -- "%" is the only special character in a gsub replacement string, so
    -- it is the only one that must be escaped to insert the tag literally.
    -- Escaping anything else (e.g. "$" as "%$") makes gsub raise
    -- "invalid use of '%' in replacement string" on Lua 5.2 and later.
    local result = k:gsub("%%", "%%%%")
    str = str:gsub(replacement, result)
  end
  return str
end

--- Convert a hexadecimal string back to the original string.
-- Characters are consumed two at a time; each pair is read as a
-- base-16 byte value.
-- @tparam string hex hexadecimal text
-- @treturn string the decoded string
local function hex_to_string(hex)
  return (hex:gsub('..', function(cc)
    return string.char(tonumber(cc, 16))
  end))
end

--- Read qmd text and return the resulting pandoc document.
-- The text is pre-processed (fenced-div fix-up, invalid-tag escaping,
-- shortcode encoding), parsed with pandoc.read, and then walked to undo
-- the escaping/encoding in the places where it must not survive.
-- @param txt qmd source text
-- @param opts reader options; `opts.extensions` is iterated to enable
--   extensions, and `opts` is forwarded to pandoc.read
-- @return the document returned by pandoc.read after the walk
local function readqmd(txt, opts)
  -- marker followed by the hex-encoded shortcode text (captured)
  local uuid_pattern = "b58fc729%-690b%-4000%-b19f%-365a4093b2ff;([A-Fa-f0-9]+);"
  local tags
  txt = md_fenced_div.attempt_to_fix_fenced_div(txt)
  txt, tags = escape_invalid_tags(txt)
  txt = md_shortcode.parse_md_shortcode_2(txt)

  -- Format flavor, i.e., which extensions should be enabled/disabled.
  local flavor = {
    format = "markdown",
    extensions = {},
  }
  if param("user-defined-from") then
    flavor = _quarto.format.parse_format(param("user-defined-from"))
  else
    for _, v in pairs(opts.extensions) do
      flavor.extensions[v] = true
    end
  end

  -- ### Opt-out some extensions that we know we won't support for now ###
  -- https://pandoc.org/MANUAL.html#extension-table_attributes
  -- https://github.com/quarto-dev/quarto-cli/pull/13249#issuecomment-3715267414
  -- Only disable if the extension is actually supported by the format
  local all_exts = pandoc.format.all_extensions(flavor.format)
  if all_exts:includes('table_attributes') then
    flavor.extensions["table_attributes"] = false
  end

  -- Map a placeholder class back to its original tag; anything that is
  -- not a known placeholder is returned unchanged.
  local function restore_invalid_tags(tag)
    return tags[tag] or tag
  end

  -- parse_shortcode overparses shortcodes inside code blocks, link targets, etc.
  -- so we need to undo that damage here

  local unshortcode_text = function (c)
    c.text = c.text:gsub(uuid_pattern, hex_to_string)
    return c
  end

  local function filter_attrs(el)
    for k,v in pairs(el.attributes) do
      if type(v) == "string" then
        local new_str = v:gsub(uuid_pattern, hex_to_string)
        -- we avoid always assigning to slightly workaround
        -- what appears to be a foundational problem with Pandoc's Lua API
        -- while accessing attributes with repeated keys.
        -- Quarto is still going to be broken for the case
        -- where there are shortcodes inside values of attributes with
        -- repeated keys:
        --
        -- []{k='{{< meta k1 >}}' k='{{< meta k2 >}}'}
        --
        -- But I don't know how to work around this.
        if new_str ~= v then
          el.attributes[k] = new_str
        end
      end
    end
    return el
  end

  local doc = pandoc.read(txt or "", flavor, opts):walk {
    CodeBlock = function (cb)
      cb.classes = cb.classes:map(restore_invalid_tags)
      cb.text = cb.text:gsub(uuid_pattern, hex_to_string)
      cb.text = unescape_invalid_tags(cb.text, tags)
      return cb
    end,
    Code = unshortcode_text,
    RawInline = unshortcode_text,
    RawBlock = unshortcode_text,
    Math = unshortcode_text,
    Header = filter_attrs,
    Span = filter_attrs,
    Div = filter_attrs,
    Link = function (l)
      l = filter_attrs(l)
      l.target = l.target:gsub(uuid_pattern, hex_to_string)
      return l
    end,
    Image = function (i)
      i = filter_attrs(i)
      -- Replace UUID-encoded shortcodes in i.src
      i.src = i.src:gsub(uuid_pattern, hex_to_string)
      return i
    end,
    Str = function(str_node)
      local str = str_node.text
      -- Quick check: if UUID not present at all, return as-is
      if not str:find("b58fc729-690b-4000-b19f-365a4093b2ff", 1, true) then
        return nil
      end

      local result = pandoc.Inlines{}
      local pos = 1

      while true do
        local match_start, match_end, hex_content = str:find(uuid_pattern, pos)

        if not match_start then
          -- No more matches; append remaining string if any
          if pos <= #str then
            table.insert(result, pandoc.Str(str:sub(pos)))
          end
          break
        end

        -- Append prefix before the match as a Str node (if non-empty)
        if match_start > pos then
          table.insert(result, pandoc.Str(str:sub(pos, match_start - 1)))
        end

        -- Convert hex to original shortcode string
        local shortcode_text = hex_to_string(hex_content)

        -- Parse the shortcode to markdown span syntax
        local parsed_md = md_shortcode.parse_md_shortcode(shortcode_text) or ""

        -- Convert to Pandoc inlines via pandoc.read
        local parsed_doc = pandoc.read(parsed_md, "markdown")
        local first_block = parsed_doc.blocks[1]
        local inlines = first_block and first_block.content or pandoc.Inlines{}
        -- Append the inlines to result
        for _, inline in ipairs(inlines) do
          table.insert(result, inline)
        end

        -- Move position past the match
        pos = match_end + 1
      end

      return result
    end
  }
  return doc
end

-- Names of the reader option fields copied by options_to_meta and
-- meta_to_options.
local reader_option_keys = {
  "abbreviations",
  "columns",
  "default_image_extension",
  "extensions",
  "indented_code_classes",
  "standalone",
  "strip_comments",
  "tab_stops",
  "track_changes",
}

--- Copy the known reader option fields out of `opts` into a plain table.
-- @param opts object indexable by the names in reader_option_keys
-- @treturn table new table holding only those fields
local function options_to_meta(opts)
  local result = {}
  for _, key in ipairs(reader_option_keys) do
    result[key] = opts[key]
  end
  return result
end

--- Build a pandoc.ReaderOptions value from the known fields of `meta`.
-- @param meta object indexable by the names in reader_option_keys
-- @return the value produced by pandoc.ReaderOptions
local function meta_to_options(meta)
  local result = {}
  for _, key in ipairs(reader_option_keys) do
    result[key] = meta[key]
  end
  return pandoc.ReaderOptions(result)
end

return {
  readqmd = readqmd,
  options_to_meta = options_to_meta,
  meta_to_options = meta_to_options
}