Fix embedded images on DeviantArt
This commit is contained in:
@@ -137,12 +137,63 @@ public class HtmlSanitizationService {
|
||||
return config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Preprocess HTML to extract images from figure tags before sanitization
|
||||
*/
|
||||
private String preprocessFigureTags(String html) {
|
||||
if (html == null || html.trim().isEmpty()) {
|
||||
return html;
|
||||
}
|
||||
|
||||
try {
|
||||
org.jsoup.nodes.Document doc = Jsoup.parse(html);
|
||||
org.jsoup.select.Elements figures = doc.select("figure");
|
||||
|
||||
for (org.jsoup.nodes.Element figure : figures) {
|
||||
// Find img tags within the figure
|
||||
org.jsoup.select.Elements images = figure.select("img");
|
||||
|
||||
if (!images.isEmpty()) {
|
||||
// Extract the first image and replace the figure with it
|
||||
org.jsoup.nodes.Element img = images.first();
|
||||
|
||||
// Check if there's a figcaption to preserve as alt text
|
||||
org.jsoup.select.Elements figcaptions = figure.select("figcaption");
|
||||
if (!figcaptions.isEmpty() && !img.hasAttr("alt")) {
|
||||
String captionText = figcaptions.first().text();
|
||||
if (captionText != null && !captionText.trim().isEmpty()) {
|
||||
img.attr("alt", captionText);
|
||||
}
|
||||
}
|
||||
|
||||
// Replace the figure element with just the img
|
||||
figure.replaceWith(img.clone());
|
||||
logger.debug("Extracted image from figure tag: {}", img.attr("src"));
|
||||
} else {
|
||||
// No images in figure, remove it entirely
|
||||
figure.remove();
|
||||
logger.debug("Removed figure tag without images");
|
||||
}
|
||||
}
|
||||
|
||||
return doc.body().html();
|
||||
} catch (Exception e) {
|
||||
logger.warn("Failed to preprocess figure tags, returning original HTML: {}", e.getMessage());
|
||||
return html;
|
||||
}
|
||||
}
|
||||
|
||||
public String sanitize(String html) {
|
||||
if (html == null || html.trim().isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
logger.info("Content before sanitization: "+html);
|
||||
String saniztedHtml = Jsoup.clean(html, allowlist.preserveRelativeLinks(true));
|
||||
|
||||
// Preprocess to extract images from figure tags
|
||||
String preprocessed = preprocessFigureTags(html);
|
||||
|
||||
String saniztedHtml = Jsoup.clean(preprocessed, allowlist.preserveRelativeLinks(true));
|
||||
logger.info("Content after sanitization: "+saniztedHtml);
|
||||
return saniztedHtml;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user