Skip to content

Commit 913292c

Browse files
committed
Improved fix for 5089
Fixes #5089. Improvements include: 1) If the URL has been encoded repeatedly, we continue decoding it until it is completely decoded. 2) If the file extension guessed from the URL doesn’t match the MIME type, we respect the MIME type and use it to form the file name. For example, this URL: https://substackcdn.com/image/fetch/w_1456,c_limit,f_webp,q_auto:good,fl_progressive:steep/https%253A%252F%252Fsubstack-post-media.s3.amazonaws.com%252Fpublic%252Fimages%252Fced5d5ad-a0d6-44aa-b6f5-73c0a937d016_3553x5273.jpeg is double URL-encoded and returns a WebP file (which, FYI, can’t be rendered in PDFs).
1 parent aba7f28 commit 913292c

File tree

3 files changed

+45
-5
lines changed

3 files changed

+45
-5
lines changed

src/resources/filters/common/url.lua

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,13 @@ function urldecode(url)
1212
return url
1313
end
1414

15-
15+
-- Decode a url repeatedly until another decoding pass no longer changes it
-- (a fixed point), so that values which were encoded more than once
-- (e.g. "%252F" -> "%2F" -> "/") come out fully decoded.
--
-- url: the value to decode; it is handed to urldecode() unchanged.
-- returns: the first urldecode() result that is equal to its own input.
function fullyUrlDecode(url)
  -- keep every intermediate value local; a bare `result = ...` would
  -- create (and overwrite) a global named `result` on each call
  local current = url
  while true do
    local decoded = urldecode(current)
    if decoded == current then
      -- this pass changed nothing, so decoding is complete
      return decoded
    end
    current = decoded
  end
end

src/resources/filters/quarto-post/pdf-images.lua

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,19 @@ local function convert_svg(path)
2020
end
2121
end
2222

23+
-- Lookup table from an image mime type to the file extension (without the
-- leading dot) to use for a file of that type. Mime types that are not
-- listed here yield nil on lookup.
local mimeImgExts = {
  ["image/avif"]               = "avif",
  ["image/bmp"]                = "bmp",
  ["image/gif"]                = "gif",
  ["image/jpeg"]               = "jpg",
  ["image/png"]                = "png",
  ["image/svg+xml"]            = "svg",
  ["image/tiff"]               = "tif",
  ["image/vnd.microsoft.icon"] = "ico",
  ["image/webp"]               = "webp",
}
34+
35+
2336
-- A cache of image urls that we've resolved into the mediabag
2437
-- keyed by {url: mediabagpath}
2538
local resolvedUrls = {}
@@ -109,13 +122,31 @@ function pdfImages()
109122
else
110123
local relativePath = image.src:match('https?://[%w%.%:]+/(.+)')
111124
if relativePath then
112-
125+
113126
local imgMt, imgContents = pandoc.mediabag.fetch(image.src)
114-
local decodedSrc = urldecode(image.src)
127+
local decodedSrc = fullyUrlDecode(image.src)
115128
if decodedSrc == nil then
116129
decodedSrc = "unknown"
117130
end
118-
local filename = windows_safe_filename(tex_safe_filename(pandoc.path.filename(decodedSrc)))
131+
132+
-- Pick the file name to use for a fetched image, preferring the extension
-- implied by its mime type over whatever the URL-derived name ends with.
-- This lets pandoc recognize the image type even when the source URI is
-- hard to parse.
--
-- filename: the candidate file name
-- imgMt:    the mime type to look up in mimeImgExts (an unlisted or nil
--           mime type leaves the name untouched)
-- returns:  the part of `filename` before its extension, as reported by
--           pandoc.path.split_extension, followed by '.' and the mapped
--           extension; or `filename` unchanged when there is no mapping
local function filenameFromMimeType(filename, imgMt)
  local extension = mimeImgExts[imgMt]
  if not extension then
    return filename
  end
  -- only the first return value (the name without its extension) is needed
  local base = pandoc.path.split_extension(filename)
  return base .. '.' .. extension
end
144+
145+
-- compute the filename for this file
146+
local basefilename = pandoc.path.filename(decodedSrc)
147+
local safefilename = windows_safe_filename(tex_safe_filename(basefilename))
148+
local filename = filenameFromMimeType(safefilename, imgMt)
149+
119150
if imgMt ~= nil then
120151
local existingMt = pandoc.mediabag.lookup(filename)
121152
local counter = 1
@@ -139,3 +170,4 @@ function pdfImages()
139170
}
140171
end
141172

173+

tests/docs/smoke-all/2023/04/04/5089.qmd

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ format: pdf
77

88
![](https://substackcdn.com/image/fetch/f_auto,q_auto:good,fl_progressive:steep/https%3A%2F%2Fbucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com%2Fpublic%2Fimages%2F9b7345d9-5f62-46dc-8062-d704c2c014a5_289x174.jpeg)
99

10-
1110
## Simple Url
1211

1312
![](https://quarto.org/quarto.png)

0 commit comments

Comments
 (0)