fix embedded images on deviantart

This commit is contained in:
Stefan Hardegger
2025-09-30 16:18:05 +02:00
parent c291559366
commit 48b0087b01
2 changed files with 112 additions and 8 deletions

View File

@@ -137,12 +137,63 @@ public class HtmlSanitizationService {
return config;
}
/**
* Preprocess HTML to extract images from figure tags before sanitization
*/
private String preprocessFigureTags(String html) {
if (html == null || html.trim().isEmpty()) {
return html;
}
try {
org.jsoup.nodes.Document doc = Jsoup.parse(html);
org.jsoup.select.Elements figures = doc.select("figure");
for (org.jsoup.nodes.Element figure : figures) {
// Find img tags within the figure
org.jsoup.select.Elements images = figure.select("img");
if (!images.isEmpty()) {
// Extract the first image and replace the figure with it
org.jsoup.nodes.Element img = images.first();
// Check if there's a figcaption to preserve as alt text
org.jsoup.select.Elements figcaptions = figure.select("figcaption");
if (!figcaptions.isEmpty() && !img.hasAttr("alt")) {
String captionText = figcaptions.first().text();
if (captionText != null && !captionText.trim().isEmpty()) {
img.attr("alt", captionText);
}
}
// Replace the figure element with just the img
figure.replaceWith(img.clone());
logger.debug("Extracted image from figure tag: {}", img.attr("src"));
} else {
// No images in figure, remove it entirely
figure.remove();
logger.debug("Removed figure tag without images");
}
}
return doc.body().html();
} catch (Exception e) {
logger.warn("Failed to preprocess figure tags, returning original HTML: {}", e.getMessage());
return html;
}
}
public String sanitize(String html) {
if (html == null || html.trim().isEmpty()) {
return "";
}
logger.info("Content before sanitization: "+html);
String saniztedHtml = Jsoup.clean(html, allowlist.preserveRelativeLinks(true));
// Preprocess to extract images from figure tags
String preprocessed = preprocessFigureTags(html);
String saniztedHtml = Jsoup.clean(preprocessed, allowlist.preserveRelativeLinks(true));
logger.info("Content after sanitization: "+saniztedHtml);
return saniztedHtml;
}