From 48b0087b01ca5860566c5b4114f60dc0dd698870 Mon Sep 17 00:00:00 2001
From: Stefan Hardegger <dev@hardegger.io>
Date: Tue, 30 Sep 2025 16:18:05 +0200
Subject: [PATCH] Fix embedded images on DeviantArt

---
 .../service/HtmlSanitizationService.java      | 53 ++++++++++++++-
 frontend/src/lib/sanitization.ts              | 67 +++++++++++++++++--
 2 files changed, 112 insertions(+), 8 deletions(-)

diff --git a/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java b/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java
index b984c71..28758a2 100644
--- a/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java
+++ b/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java
@@ -137,12 +137,63 @@ public class HtmlSanitizationService {
         return config;
     }
     
+    /**
+     * Replaces each {@code <figure>} with its first {@code <img>} before sanitization,
+     * using the figcaption text as alt text when the image has no alt attribute. Figures
+     * without an image are removed; on failure the original HTML is returned unchanged.
+     */
+    private String preprocessFigureTags(String html) {
+        // Fast path: skip the extra parse and re-serialization when no figure markup exists
+        if (html == null || !html.toLowerCase(java.util.Locale.ROOT).contains("<figure")) {
+            return html;
+        }
+
+        try {
+            // Parse as a body fragment, the same way Jsoup.clean parses its input
+            org.jsoup.nodes.Document doc = Jsoup.parseBodyFragment(html);
+
+            for (org.jsoup.nodes.Element figure : doc.select("figure")) {
+                org.jsoup.select.Elements images = figure.select("img");
+                if (!images.isEmpty()) {
+                    // Only the first image is kept; further images in the figure are dropped
+                    org.jsoup.nodes.Element img = images.first();
+                    // Check if there's a figcaption to preserve as alt text
+                    org.jsoup.select.Elements figcaptions = figure.select("figcaption");
+                    if (!figcaptions.isEmpty() && !img.hasAttr("alt")) {
+                        String captionText = figcaptions.first().text();
+                        if (captionText != null && !captionText.trim().isEmpty()) {
+                            img.attr("alt", captionText);
+                        }
+                    }
+
+                    // Replace the figure element with just the img
+                    figure.replaceWith(img.clone());
+                    logger.debug("Extracted image from figure tag: {}", img.attr("src"));
+                } else {
+                    // No images in figure, remove it entirely
+                    figure.remove();
+                    logger.debug("Removed figure tag without images");
+                }
+            }
+
+            return doc.body().html();
+        } catch (Exception e) {
+            logger.warn("Failed to preprocess figure tags, returning original HTML: {}", e.getMessage());
+            return html;
+        }
+    }
+
     public String sanitize(String html) {
         if (html == null || html.trim().isEmpty()) {
             return "";
         }
+
         logger.info("Content before sanitization: "+html);
-        String saniztedHtml = Jsoup.clean(html, allowlist.preserveRelativeLinks(true));
+
+        // Reduce each <figure> to its first <img> before the allowlist cleaner runs
+        String preprocessed = preprocessFigureTags(html);
+
+        String saniztedHtml = Jsoup.clean(preprocessed, allowlist.preserveRelativeLinks(true));
         logger.info("Content after sanitization: "+saniztedHtml);
         return saniztedHtml;
     }
diff --git a/frontend/src/lib/sanitization.ts b/frontend/src/lib/sanitization.ts
index 06cd102..00a3afe 100644
--- a/frontend/src/lib/sanitization.ts
+++ b/frontend/src/lib/sanitization.ts
@@ -180,6 +180,53 @@ function createDOMPurifyConfig(config: SanitizationConfig) {
   return domPurifyConfig;
 }
 
+/**
+ * Replaces each <figure> with its first <img> before sanitization, using the
+ * figcaption text as alt text when the image has no alt attribute. Figures
+ * without an image are removed. Returns the input unchanged when it contains no
+ * figure markup, when DOMParser is unavailable, or when preprocessing fails.
+ */
+function preprocessFigureTags(html: string): string {
+  // Fast path: avoid a parse/re-serialize round trip (and a ReferenceError
+  // where DOMParser does not exist) when there is nothing to rewrite.
+  if (!html || !/<figure\b/i.test(html) || typeof DOMParser === 'undefined') {
+    return html;
+  }
+
+  try {
+    const doc = new DOMParser().parseFromString(html, 'text/html');
+
+    doc.querySelectorAll('figure').forEach(figure => {
+      // Only the first image is kept; further images in the figure are dropped
+      const img = figure.querySelector('img');
+
+      if (img) {
+        // Check if there's a figcaption to preserve as alt text
+        const figcaption = figure.querySelector('figcaption');
+        if (figcaption && !img.hasAttribute('alt')) {
+          const captionText = figcaption.textContent?.trim();
+          if (captionText) {
+            img.setAttribute('alt', captionText);
+          }
+        }
+
+        // Replace the figure element with just the img
+        figure.replaceWith(img.cloneNode(true));
+        debug.log('Extracted image from figure tag:', img.getAttribute('src'));
+      } else {
+        // No images in figure, remove it entirely
+        figure.remove();
+        debug.log('Removed figure tag without images');
+      }
+    });
+
+    return doc.body.innerHTML;
+  } catch (error) {
+    console.warn('Failed to preprocess figure tags, returning original HTML:', error);
+    return html;
+  }
+}
+
 /**
  * Sanitize HTML content using shared configuration from backend
  */
@@ -189,12 +236,15 @@ export async function sanitizeHtml(html: string): Promise<string> {
   }
 
   try {
+    // Preprocess to extract images from figure tags
+    const preprocessed = preprocessFigureTags(html);
+
     const config = await fetchSanitizationConfig();
     const domPurifyConfig = createDOMPurifyConfig(config);
-    
-    // Configure DOMPurify with our settings  
-    const cleanHtml = DOMPurify.sanitize(html, domPurifyConfig as any);
-    
+
+    // Configure DOMPurify with our settings
+    const cleanHtml = DOMPurify.sanitize(preprocessed, domPurifyConfig as any);
+
     return cleanHtml.toString();
   } catch (error) {
     console.error('Error during HTML sanitization:', error);
@@ -212,10 +262,13 @@ export function sanitizeHtmlSync(html: string): string {
     return '';
   }
 
+  // Preprocess to extract images from figure tags
+  const preprocessed = preprocessFigureTags(html);
+
   // If we have cached config, use it
   if (cachedConfig) {
     const domPurifyConfig = createDOMPurifyConfig(cachedConfig);
-    return DOMPurify.sanitize(html, domPurifyConfig as any).toString();
+    return DOMPurify.sanitize(preprocessed, domPurifyConfig as any).toString();
   }
 
   // If we don't have cached config but there's an ongoing request, wait for it
@@ -270,8 +323,8 @@ export function sanitizeHtmlSync(html: string): string {
       }
     }
   });
-  
-  return DOMPurify.sanitize(html, fallbackConfig as any).toString();
+
+  return DOMPurify.sanitize(preprocessed, fallbackConfig as any).toString();
 }
 
 /**