fix embedded images on deviantart

This commit is contained in:
Stefan Hardegger
2025-09-30 16:18:05 +02:00
parent c291559366
commit 48b0087b01
2 changed files with 112 additions and 8 deletions

View File

@@ -137,12 +137,63 @@ public class HtmlSanitizationService {
return config;
}
/**
 * Flattens {@code <figure>} elements; {@code sanitize} calls this before cleaning the HTML.
 *
 * <p>Every figure that contains at least one {@code <img>} is replaced by a copy of its first
 * image; any further images and all other figure content are dropped. Figures without an image
 * are removed entirely. When the image has no usable {@code alt} text (attribute missing or
 * blank), the {@code <figcaption>} text is stored as {@code alt}, because the caption element
 * itself is discarded together with the figure.
 *
 * @param html raw HTML, may be {@code null}
 * @return the serialized body of the parsed document; {@code html} unchanged when it is
 *         {@code null} or blank, or when preprocessing throws
 */
private String preprocessFigureTags(String html) {
    if (html == null || html.trim().isEmpty()) {
        return html;
    }

    try {
        org.jsoup.nodes.Document doc = Jsoup.parse(html);

        for (org.jsoup.nodes.Element figure : doc.select("figure")) {
            // first() yields null when the figure holds no image at all
            org.jsoup.nodes.Element img = figure.select("img").first();
            if (img == null) {
                // No images in figure, remove it entirely
                figure.remove();
                logger.debug("Removed figure tag without images");
                continue;
            }

            // attr() returns "" for an absent attribute, so a single blank check covers both
            // "no alt attribute" and alt="" / whitespace-only alt.
            if (img.attr("alt").trim().isEmpty()) {
                org.jsoup.nodes.Element figcaption = figure.select("figcaption").first();
                if (figcaption != null) {
                    String captionText = figcaption.text();
                    if (!captionText.trim().isEmpty()) {
                        img.attr("alt", captionText);
                    }
                }
            }

            // Replace the figure element with just the img
            figure.replaceWith(img.clone());
            logger.debug("Extracted image from figure tag: {}", img.attr("src"));
        }

        return doc.body().html();
    } catch (Exception e) {
        // Best effort: fall back to the untouched input rather than failing sanitization
        logger.warn("Failed to preprocess figure tags, returning original HTML: {}", e.getMessage());
        return html;
    }
}
/**
 * Sanitizes HTML against the configured allowlist.
 *
 * <p>Figure tags are flattened by {@link #preprocessFigureTags(String)} first; the result is then
 * cleaned with relative links preserved. The content is logged at INFO level before and after.
 *
 * @param html raw HTML, may be {@code null}
 * @return the cleaned HTML, or an empty string when {@code html} is {@code null} or blank
 */
public String sanitize(String html) {
    if (html == null || html.trim().isEmpty()) {
        return "";
    }

    logger.info("Content before sanitization: {}", html);

    // Preprocess to extract images from figure tags; only the preprocessed markup is cleaned
    // (a second clean of the raw input would redeclare the result variable and bypass this step).
    String preprocessed = preprocessFigureTags(html);
    String sanitizedHtml = Jsoup.clean(preprocessed, allowlist.preserveRelativeLinks(true));

    logger.info("Content after sanitization: {}", sanitizedHtml);
    return sanitizedHtml;
}

View File

@@ -180,6 +180,53 @@ function createDOMPurifyConfig(config: SanitizationConfig) {
return domPurifyConfig;
}
/**
 * Flattens `<figure>` elements ahead of sanitization.
 *
 * Each figure that contains at least one `<img>` is replaced by a copy of its first image;
 * any further images and all other figure content are dropped. Figures without an image are
 * removed entirely. When the image has no usable `alt` text (attribute missing or blank), the
 * trimmed `<figcaption>` text is stored as `alt`, because the caption element itself is
 * discarded together with the figure.
 *
 * @param html - Raw HTML; empty or whitespace-only input is returned unchanged.
 * @returns The serialized body of the parsed document, or the original `html` when parsing
 * or rewriting throws.
 */
function preprocessFigureTags(html: string): string {
  if (!html || html.trim() === '') {
    return html;
  }

  try {
    const doc = new DOMParser().parseFromString(html, 'text/html');

    doc.querySelectorAll('figure').forEach(figure => {
      // querySelector returns the first image in document order, or null when there is none
      const img = figure.querySelector('img');
      if (!img) {
        // No images in figure, remove it entirely
        figure.remove();
        debug.log('Removed figure tag without images');
        return;
      }

      // A blank alt carries no description, so treat it like a missing attribute;
      // otherwise the caption text would be lost when the figure is replaced.
      const currentAlt = img.getAttribute('alt');
      if (currentAlt === null || currentAlt.trim() === '') {
        const captionText = figure.querySelector('figcaption')?.textContent?.trim();
        if (captionText) {
          img.setAttribute('alt', captionText);
        }
      }

      // Replace the figure element with just the img
      figure.replaceWith(img.cloneNode(true));
      debug.log('Extracted image from figure tag:', img.src);
    });

    return doc.body.innerHTML;
  } catch (error) {
    // Best effort: fall back to the untouched input rather than failing sanitization
    console.warn('Failed to preprocess figure tags, returning original HTML:', error);
    return html;
  }
}
/**
* Sanitize HTML content using shared configuration from backend
*/
@@ -189,11 +236,14 @@ export async function sanitizeHtml(html: string): Promise<string> {
}
try {
// Preprocess to extract images from figure tags
const preprocessed = preprocessFigureTags(html);
const config = await fetchSanitizationConfig();
const domPurifyConfig = createDOMPurifyConfig(config);
// Configure DOMPurify with our settings
const cleanHtml = DOMPurify.sanitize(html, domPurifyConfig as any);
const cleanHtml = DOMPurify.sanitize(preprocessed, domPurifyConfig as any);
return cleanHtml.toString();
} catch (error) {
@@ -212,10 +262,13 @@ export function sanitizeHtmlSync(html: string): string {
return '';
}
// Preprocess to extract images from figure tags
const preprocessed = preprocessFigureTags(html);
// If we have cached config, use it
if (cachedConfig) {
const domPurifyConfig = createDOMPurifyConfig(cachedConfig);
return DOMPurify.sanitize(html, domPurifyConfig as any).toString();
return DOMPurify.sanitize(preprocessed, domPurifyConfig as any).toString();
}
// If we don't have cached config but there's an ongoing request, wait for it
@@ -271,7 +324,7 @@ export function sanitizeHtmlSync(html: string): string {
}
});
return DOMPurify.sanitize(html, fallbackConfig as any).toString();
return DOMPurify.sanitize(preprocessed, fallbackConfig as any).toString();
}
/**