From 48b0087b01ca5860566c5b4114f60dc0dd698870 Mon Sep 17 00:00:00 2001 From: Stefan Hardegger Date: Tue, 30 Sep 2025 16:18:05 +0200 Subject: [PATCH] fix embedded images on deviantart --- .../service/HtmlSanitizationService.java | 53 ++++++++++++++- frontend/src/lib/sanitization.ts | 67 +++++++++++++++++-- 2 files changed, 112 insertions(+), 8 deletions(-) diff --git a/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java b/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java index b984c71..28758a2 100644 --- a/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java +++ b/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java @@ -137,12 +137,63 @@ public class HtmlSanitizationService { return config; } + /** + * Preprocess HTML to extract images from figure tags before sanitization + */ + private String preprocessFigureTags(String html) { + if (html == null || html.trim().isEmpty()) { + return html; + } + + try { + org.jsoup.nodes.Document doc = Jsoup.parse(html); + org.jsoup.select.Elements figures = doc.select("figure"); + + for (org.jsoup.nodes.Element figure : figures) { + // Find img tags within the figure + org.jsoup.select.Elements images = figure.select("img"); + + if (!images.isEmpty()) { + // Extract the first image and replace the figure with it + org.jsoup.nodes.Element img = images.first(); + + // Check if there's a figcaption to preserve as alt text + org.jsoup.select.Elements figcaptions = figure.select("figcaption"); + if (!figcaptions.isEmpty() && !img.hasAttr("alt")) { + String captionText = figcaptions.first().text(); + if (captionText != null && !captionText.trim().isEmpty()) { + img.attr("alt", captionText); + } + } + + // Replace the figure element with just the img + figure.replaceWith(img.clone()); + logger.debug("Extracted image from figure tag: {}", img.attr("src")); + } else { + // No images in figure, remove it entirely + figure.remove(); + logger.debug("Removed figure tag without images"); + } + } + + return doc.body().html(); + } catch (Exception e) { + logger.warn("Failed to preprocess figure tags, returning original HTML: {}", e.getMessage()); + return html; + } + } + public String sanitize(String html) { if (html == null || html.trim().isEmpty()) { return ""; } + logger.info("Content before sanitization: "+html); - String saniztedHtml = Jsoup.clean(html, allowlist.preserveRelativeLinks(true)); + + // Preprocess to extract images from figure tags + String preprocessed = preprocessFigureTags(html); + + String saniztedHtml = Jsoup.clean(preprocessed, allowlist.preserveRelativeLinks(true)); logger.info("Content after sanitization: "+saniztedHtml); return saniztedHtml; } diff --git a/frontend/src/lib/sanitization.ts b/frontend/src/lib/sanitization.ts index 06cd102..00a3afe 100644 --- a/frontend/src/lib/sanitization.ts +++ b/frontend/src/lib/sanitization.ts @@ -180,6 +180,53 @@ function createDOMPurifyConfig(config: SanitizationConfig) { return domPurifyConfig; } +/** + * Preprocess HTML to extract images from figure tags before sanitization + */ +function preprocessFigureTags(html: string): string { + if (!html || html.trim() === '') { + return html; + } + + try { + const parser = new DOMParser(); + const doc = parser.parseFromString(html, 'text/html'); + const figures = doc.querySelectorAll('figure'); + + figures.forEach(figure => { + // Find img tags within the figure + const images = figure.querySelectorAll('img'); + + if (images.length > 0) { + // Extract the first image + const img = images[0]; + + // Check if there's a figcaption to preserve as alt text + const figcaption = figure.querySelector('figcaption'); + if (figcaption && !img.hasAttribute('alt')) { + const captionText = figcaption.textContent?.trim(); + if (captionText) { + img.setAttribute('alt', captionText); + } + } + + // Replace the figure element with just the img + figure.replaceWith(img.cloneNode(true)); + debug.log('Extracted image from figure tag:', img.src); + } else { + // No images in figure, remove it entirely + figure.remove(); + debug.log('Removed figure tag without images'); + } + }); + + return doc.body.innerHTML; + } catch (error) { + console.warn('Failed to preprocess figure tags, returning original HTML:', error); + return html; + } +} + /** * Sanitize HTML content using shared configuration from backend */ @@ -189,12 +236,15 @@ export async function sanitizeHtml(html: string): Promise { } try { + // Preprocess to extract images from figure tags + const preprocessed = preprocessFigureTags(html); + const config = await fetchSanitizationConfig(); const domPurifyConfig = createDOMPurifyConfig(config); - - // Configure DOMPurify with our settings - const cleanHtml = DOMPurify.sanitize(html, domPurifyConfig as any); - + + // Configure DOMPurify with our settings + const cleanHtml = DOMPurify.sanitize(preprocessed, domPurifyConfig as any); + return cleanHtml.toString(); } catch (error) { console.error('Error during HTML sanitization:', error); @@ -212,10 +262,13 @@ export function sanitizeHtmlSync(html: string): string { return ''; } + // Preprocess to extract images from figure tags + const preprocessed = preprocessFigureTags(html); + // If we have cached config, use it if (cachedConfig) { const domPurifyConfig = createDOMPurifyConfig(cachedConfig); - return DOMPurify.sanitize(html, domPurifyConfig as any).toString(); + return DOMPurify.sanitize(preprocessed, domPurifyConfig as any).toString(); } // If we don't have cached config but there's an ongoing request, wait for it @@ -270,8 +323,8 @@ export function sanitizeHtmlSync(html: string): string { } } }); - - return DOMPurify.sanitize(html, fallbackConfig as any).toString(); + + return DOMPurify.sanitize(preprocessed, fallbackConfig as any).toString(); } /**