phase 1 and 2 of embedded images

2025-09-16 14:58:50 +02:00
parent c92308c24a
commit c7b516be31
14 changed files with 686 additions and 54 deletions
--- a/backend/src/main/java/com/storycove/controller/StoryController.java
+++ b/backend/src/main/java/com/storycove/controller/StoryController.java
@@ -228,6 +228,38 @@ public class StoryController {
        Story story = storyService.updateReadingStatus(id, request.getIsRead());
        return ResponseEntity.ok(convertToDto(story));
    }
+
+    /**
+     * Downloads the images referenced by the submitted HTML and returns the rewritten content.
+     *
+     * <p>The response map always contains {@code processedContent}, {@code downloadedImages} and
+     * {@code hasWarnings}; {@code warnings} is added only when at least one warning was reported,
+     * so the client can decide whether to proceed.
+     *
+     * @param id      story identifier, forwarded to the image service
+     * @param request payload carrying the HTML; validated against the DTO's constraints
+     * @return 200 with the processing result, or 500 with an {@code error} entry if processing throws
+     */
+    @PostMapping("/{id}/process-content-images")
+    public ResponseEntity<Map<String, Object>> processContentImages(
+            @PathVariable UUID id,
+            // @Valid activates the DTO's @NotBlank constraint. Without it a null htmlContent
+            // reached Map.of(...), which rejects null values, and surfaced as a 500.
+            @jakarta.validation.Valid @RequestBody ProcessContentImagesRequest request) {
+        logger.info("Processing content images for story {}", id);
+
+        try {
+            // Process the HTML content to download and replace image URLs
+            ImageService.ContentImageProcessingResult result =
+                    imageService.processContentImages(request.getHtmlContent(), id);
+
+            // If there are warnings, let the client decide whether to proceed
+            if (result.hasWarnings()) {
+                return ResponseEntity.ok(Map.of(
+                    "processedContent", result.getProcessedContent(),
+                    "warnings", result.getWarnings(),
+                    "downloadedImages", result.getDownloadedImages(),
+                    "hasWarnings", true
+                ));
+            }
+
+            // Success - no warnings
+            return ResponseEntity.ok(Map.of(
+                "processedContent", result.getProcessedContent(),
+                "downloadedImages", result.getDownloadedImages(),
+                "hasWarnings", false
+            ));
+
+        } catch (Exception e) {
+            logger.error("Failed to process content images for story {}", id, e);
+            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
+                    .body(Map.of("error", "Failed to process content images: " + e.getMessage()));
+        }
+    }
    
    @PostMapping("/reindex")
    public ResponseEntity<String> manualReindex() {
@@ -458,7 +490,14 @@ public class StoryController {
                story.setDescription(updateReq.getDescription());
            }
            if (updateReq.getContentHtml() != null) {
-                story.setContentHtml(sanitizationService.sanitize(updateReq.getContentHtml()));
+                logger.info("Content before sanitization (length: {}): {}",
+                           updateReq.getContentHtml().length(),
+                           updateReq.getContentHtml().substring(0, Math.min(500, updateReq.getContentHtml().length())));
+                String sanitizedContent = sanitizationService.sanitize(updateReq.getContentHtml());
+                logger.info("Content after sanitization (length: {}): {}",
+                           sanitizedContent.length(),
+                           sanitizedContent.substring(0, Math.min(500, sanitizedContent.length())));
+                story.setContentHtml(sanitizedContent);
            }
            if (updateReq.getSourceUrl() != null) {
                story.setSourceUrl(updateReq.getSourceUrl());
--- a/backend/src/main/java/com/storycove/dto/ProcessContentImagesRequest.java
+++ b/backend/src/main/java/com/storycove/dto/ProcessContentImagesRequest.java
@@ -0,0 +1,23 @@
+package com.storycove.dto;
+
+import jakarta.validation.constraints.NotBlank;
+
+/**
+ * Request body for the content-image processing endpoint: a single HTML string.
+ */
+public class ProcessContentImagesRequest {
+
+    /** HTML to process; constrained to be non-blank. */
+    @NotBlank(message = "HTML content is required")
+    private String htmlContent;
+
+    /**
+     * Creates a request with the given HTML.
+     *
+     * @param htmlContent HTML to process
+     */
+    public ProcessContentImagesRequest(String htmlContent) {
+        this.htmlContent = htmlContent;
+    }
+
+    /** Creates an empty request; {@code htmlContent} stays {@code null} until set. */
+    public ProcessContentImagesRequest() {
+    }
+
+    /** @return the HTML to process, or {@code null} if none was set */
+    public String getHtmlContent() {
+        return this.htmlContent;
+    }
+
+    /** @param htmlContent HTML to process */
+    public void setHtmlContent(String htmlContent) {
+        this.htmlContent = htmlContent;
+    }
+}
--- a/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java
+++ b/backend/src/main/java/com/storycove/service/HtmlSanitizationService.java
@@ -54,7 +54,7 @@ public class HtmlSanitizationService {
            "p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6",
            "b", "strong", "i", "em", "u", "s", "strike", "del", "ins",
            "sup", "sub", "small", "big", "mark", "pre", "code",
-            "ul", "ol", "li", "dl", "dt", "dd", "a",
+            "ul", "ol", "li", "dl", "dt", "dd", "a", "img",
            "table", "thead", "tbody", "tfoot", "tr", "th", "td", "caption",
            "blockquote", "cite", "q", "hr"
        ));
@@ -65,13 +65,13 @@ public class HtmlSanitizationService {
    }
    
    private void createSafelist() {
-        this.allowlist = new Safelist();
-        
+        this.allowlist = Safelist.relaxed();
+
        // Add allowed tags
        if (config.getAllowedTags() != null) {
            config.getAllowedTags().forEach(allowlist::addTags);
        }
-        
+
        // Add allowed attributes
        if (config.getAllowedAttributes() != null) {
            for (Map.Entry<String, List<String>> entry : config.getAllowedAttributes().entrySet()) {
@@ -82,25 +82,33 @@ public class HtmlSanitizationService {
                }
            }
        }
-        
-        // Configure allowed protocols for specific attributes (e.g., href)
+
+        // Special handling for img tags - allow all src attributes and validate later
+        allowlist.removeProtocols("img", "src", "http", "https");
+        // This is the key: preserve relative URLs by not restricting them
+        allowlist.preserveRelativeLinks(true);
+
+        // Configure allowed protocols for other attributes
        if (config.getAllowedProtocols() != null) {
            for (Map.Entry<String, Map<String, List<String>>> tagEntry : config.getAllowedProtocols().entrySet()) {
                String tag = tagEntry.getKey();
                Map<String, List<String>> attributeProtocols = tagEntry.getValue();
-                
+
                if (attributeProtocols != null) {
                    for (Map.Entry<String, List<String>> attrEntry : attributeProtocols.entrySet()) {
                        String attribute = attrEntry.getKey();
                        List<String> protocols = attrEntry.getValue();
-                        
-                        if (protocols != null) {
+
+                        if (protocols != null && !("img".equals(tag) && "src".equals(attribute))) {
+                            // Skip img src since we handled it above
                            allowlist.addProtocols(tag, attribute, protocols.toArray(new String[0]));
                        }
                    }
                }
            }
        }
+
+        logger.info("Configured Jsoup Safelist with preserveRelativeLinks=true for local image URLs");
        
        // Remove specific attributes if needed (deprecated in favor of protocol control)
        if (config.getRemovedAttributes() != null) {
@@ -133,8 +141,10 @@ public class HtmlSanitizationService {
        if (html == null || html.trim().isEmpty()) {
            return "";
        }
-        
-        return Jsoup.clean(html, allowlist);
+        logger.info("Content before sanitization: "+html);
+        String saniztedHtml = Jsoup.clean(html, allowlist.preserveRelativeLinks(true));
+        logger.info("Content after sanitization: "+saniztedHtml);
+        return saniztedHtml;
    }
    
    public String extractPlainText(String html) {
--- a/backend/src/main/java/com/storycove/service/ImageService.java
+++ b/backend/src/main/java/com/storycove/service/ImageService.java
@@ -1,5 +1,7 @@
 package com.storycove.service;

+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.stereotype.Service;
@@ -8,18 +10,22 @@ import org.springframework.web.multipart.MultipartFile;
 import javax.imageio.ImageIO;
 import java.awt.*;
 import java.awt.image.BufferedImage;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
+import java.io.*;
+import java.net.HttpURLConnection;
+import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
-import java.util.Set;
-import java.util.UUID;
+import java.util.*;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

@Service
 public class ImageService {
-    
+
+    private static final Logger logger = LoggerFactory.getLogger(ImageService.class);
+
    private static final Set<String> ALLOWED_CONTENT_TYPES = Set.of(
        "image/jpeg", "image/jpg", "image/png"
    );
@@ -53,14 +59,15 @@ public class ImageService {
    
    public enum ImageType {
        COVER("covers"),
-        AVATAR("avatars");
-        
+        AVATAR("avatars"),
+        CONTENT("content");
+
        private final String directory;
-        
+
        ImageType(String directory) {
            this.directory = directory;
        }
-        
+
        public String getDirectory() {
            return directory;
        }
@@ -182,6 +189,9 @@ public class ImageService {
                maxWidth = avatarMaxSize;
                maxHeight = avatarMaxSize;
                break;
+            case CONTENT:
+                // Content images are not resized
+                return new Dimension(originalWidth, originalHeight);
            default:
                return new Dimension(originalWidth, originalHeight);
        }
@@ -228,4 +238,224 @@ public class ImageService {
        String extension = getFileExtension(filename);
        return ALLOWED_EXTENSIONS.contains(extension);
    }
+
+    // Content image processing methods
+
+    /** Matches an {@code <img>} tag and captures the value of its quoted {@code src} attribute. */
+    private static final Pattern IMG_SRC_PATTERN =
+            Pattern.compile("<img[^>]+src=[\"']([^\"']+)[\"'][^>]*>", Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Process HTML content and download all referenced images, replacing URLs with local paths.
+     *
+     * <p>Image sources starting with {@code /} or {@code data:} are left untouched. Every other
+     * source is downloaded; on success the tag's {@code src} is rewritten to the local URL, on
+     * failure the tag is kept unchanged and a warning is recorded instead of aborting.
+     *
+     * @param htmlContent HTML to scan; {@code null} or blank content is returned as-is
+     * @param storyId     story the downloaded images are stored under
+     * @return the rewritten HTML together with the warnings and the stored image paths
+     */
+    public ContentImageProcessingResult processContentImages(String htmlContent, UUID storyId) {
+        logger.info("Processing content images for story: {}, content length: {}", storyId,
+                   htmlContent != null ? htmlContent.length() : 0);
+
+        List<String> warnings = new ArrayList<>();
+        List<String> downloadedImages = new ArrayList<>();
+
+        if (htmlContent == null || htmlContent.trim().isEmpty()) {
+            logger.info("No content to process for story: {}", storyId);
+            return new ContentImageProcessingResult(htmlContent, warnings, downloadedImages);
+        }
+
+        // NOTE(review): the greedy "[^>]+" makes the pattern capture the LAST "src=" inside a
+        // tag, which may belong to e.g. "data-src"; an HTML parser would be more precise.
+        Matcher matcher = IMG_SRC_PATTERN.matcher(htmlContent);
+        StringBuilder processedContent = new StringBuilder();
+
+        int imageCount = 0;
+        int externalImageCount = 0;
+
+        while (matcher.find()) {
+            String fullImgTag = matcher.group(0);
+            String imageUrl = matcher.group(1);
+            imageCount++;
+
+            logger.info("Found image #{}: {} in tag: {}", imageCount, imageUrl, fullImgTag);
+
+            // The tag is emitted unchanged unless the download below succeeds.
+            String replacementTag = fullImgTag;
+
+            if (imageUrl.startsWith("/") || imageUrl.startsWith("data:")) {
+                // Already a local path or an inline data URL
+                logger.info("Skipping local/data URL: {}", imageUrl);
+            } else {
+                externalImageCount++;
+                logger.info("Processing external image #{}: {}", externalImageCount, imageUrl);
+
+                try {
+                    // Download and store the image
+                    String localPath = downloadImageFromUrl(imageUrl, storyId);
+                    downloadedImages.add(localPath);
+
+                    // Generate local URL
+                    String localUrl = getLocalImageUrl(storyId, localPath);
+                    logger.info("Downloaded image: {} -> {}", imageUrl, localUrl);
+
+                    // Splice by capture offsets rather than a textual search/replace: this
+                    // rewrites exactly the attribute the pattern matched and never treats
+                    // '$' or '\' in the URL as regex replacement syntax. The characters just
+                    // outside group 1 are the original quotes; they are normalised to '"'.
+                    int openingQuote = matcher.start(1) - matcher.start() - 1;
+                    int afterClosingQuote = matcher.end(1) - matcher.start() + 1;
+                    replacementTag = fullImgTag.substring(0, openingQuote)
+                        + "\"" + localUrl + "\""
+                        + fullImgTag.substring(afterClosingQuote);
+
+                    logger.info("Replaced img tag: {} -> {}", fullImgTag, replacementTag);
+
+                } catch (Exception e) {
+                    logger.error("Failed to download image: {} - {}", imageUrl, e.getMessage(), e);
+                    warnings.add("Failed to download image: " + imageUrl + " - " + e.getMessage());
+                    // Keep original URL in case of failure
+                }
+            }
+
+            matcher.appendReplacement(processedContent, Matcher.quoteReplacement(replacementTag));
+        }
+
+        matcher.appendTail(processedContent);
+
+        logger.info("Finished processing images for story: {}. Found {} total images, {} external. Downloaded {} images, {} warnings",
+                   storyId, imageCount, externalImageCount, downloadedImages.size(), warnings.size());
+
+        return new ContentImageProcessingResult(processedContent.toString(), warnings, downloadedImages);
+    }
+
+    /**
+     * Download an image from a URL and store it locally.
+     *
+     * <p>The file is written to {@code <uploadDir>/content/<storyId>/<random-uuid>.<ext>} only
+     * after the payload has been decoded successfully by {@link ImageIO}.
+     *
+     * @param imageUrl absolute {@code http} or {@code https} URL of the image
+     * @param storyId  story whose content directory receives the file
+     * @return the stored file's path relative to the upload directory
+     * @throws IOException              if the URL is malformed, the request fails, the server
+     *                                  answers with a non-2xx status, or the payload is not a
+     *                                  decodable image
+     * @throws IllegalArgumentException if the URL is not HTTP(S) or the image format is not allowed
+     */
+    private String downloadImageFromUrl(String imageUrl, UUID storyId) throws IOException {
+        URL url = new URL(imageUrl);
+
+        // Only HTTP(S) is supported; other schemes (file:, ftp:, jar:) would otherwise fail
+        // with a ClassCastException on the cast below.
+        String protocol = url.getProtocol();
+        if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
+            throw new IllegalArgumentException("Unsupported URL protocol: " + protocol);
+        }
+
+        // NOTE(review): the host is not checked against loopback/private ranges, so this call
+        // can be pointed at internal services (SSRF) - confirm whether that is acceptable.
+        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+
+        try {
+            // Set a reasonable user agent to avoid blocks
+            connection.setRequestProperty("User-Agent", "Mozilla/5.0 (StoryCove Image Processor)");
+            connection.setConnectTimeout(30000); // 30 seconds
+            connection.setReadTimeout(30000);
+
+            // Reject redirects that were not followed and other non-success answers up front
+            // instead of trying to decode their body as an image.
+            int status = connection.getResponseCode();
+            if (status < 200 || status >= 300) {
+                throw new IOException("Unexpected HTTP status " + status + " for image URL");
+            }
+
+            // Get content type to determine file extension
+            String contentType = connection.getContentType();
+            String extension = getExtensionFromContentType(contentType);
+
+            if (extension == null) {
+                // Try to extract from URL
+                extension = getExtensionFromUrl(imageUrl);
+            }
+
+            if (extension == null || !ALLOWED_EXTENSIONS.contains(extension.toLowerCase())) {
+                throw new IllegalArgumentException("Unsupported image format: " + contentType);
+            }
+
+            // Read and validate the image
+            // NOTE(review): the body is read without a size cap - consider enforcing a limit.
+            byte[] imageData;
+            try (InputStream inputStream = connection.getInputStream()) {
+                imageData = inputStream.readAllBytes();
+            }
+
+            BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData));
+            if (image == null) {
+                throw new IOException("Invalid image format");
+            }
+
+            // Create the story's content directory only once the payload is known to be valid,
+            // so failed downloads do not leave empty directories behind.
+            Path contentDir = Paths.get(getUploadDir(), ImageType.CONTENT.getDirectory(), storyId.toString());
+            Files.createDirectories(contentDir);
+
+            // Generate unique filename
+            String filename = UUID.randomUUID().toString() + "." + extension.toLowerCase();
+            Path filePath = contentDir.resolve(filename);
+
+            // Save the image
+            Files.write(filePath, imageData);
+
+            // Return relative path
+            return ImageType.CONTENT.getDirectory() + "/" + storyId.toString() + "/" + filename;
+
+        } finally {
+            connection.disconnect();
+        }
+    }
+
+    /**
+     * Build the URL under which a stored content image is served.
+     *
+     * <p>The current library ID becomes a path segment of the URL; when it is {@code null} or
+     * blank a warning is logged and the literal segment {@code default} is used instead.
+     *
+     * @param storyId   story the image belongs to (used for logging only)
+     * @param imagePath stored image path, appended to the URL
+     * @return the local URL for the image
+     */
+    private String getLocalImageUrl(UUID storyId, String imagePath) {
+        final String libraryId = libraryService.getCurrentLibraryId();
+        final boolean libraryKnown = libraryId != null && !libraryId.trim().isEmpty();
+
+        if (libraryKnown) {
+            final String servedUrl = "/api/files/images/" + libraryId + "/" + imagePath;
+            logger.info("Generated local image URL: {} for story: {}", servedUrl, storyId);
+            return servedUrl;
+        }
+
+        logger.warn("Current library ID is null or empty when generating local image URL for story: {}", storyId);
+        return "/api/files/images/default/" + imagePath;
+    }
+
+    /**
+     * Get file extension from content type.
+     *
+     * <p>Media-type parameters (for example {@code "image/jpeg; charset=UTF-8"}) and surrounding
+     * whitespace are ignored, and matching is case-insensitive.
+     *
+     * @param contentType value of a {@code Content-Type} header, may be {@code null}
+     * @return {@code "jpg"} or {@code "png"}, or {@code null} for a missing or unsupported type
+     */
+    private String getExtensionFromContentType(String contentType) {
+        if (contentType == null) {
+            return null;
+        }
+
+        // Servers may append parameters after ';' - only the media type itself is relevant.
+        int parametersStart = contentType.indexOf(';');
+        String mediaType = parametersStart >= 0 ? contentType.substring(0, parametersStart) : contentType;
+
+        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
+        switch (mediaType.trim().toLowerCase(Locale.ROOT)) {
+            case "image/jpeg":
+            case "image/jpg":
+                return "jpg";
+            case "image/png":
+                return "png";
+            default:
+                return null;
+        }
+    }
+
+    /**
+     * Extract file extension from URL.
+     *
+     * <p>Only the URL's path component is inspected. The extension is the lower-cased text after
+     * the last {@code '.'} in that path.
+     *
+     * @param url URL to inspect
+     * @return the extension without the dot, or {@code null} if the URL cannot be parsed or its
+     *         path has no usable extension
+     */
+    private String getExtensionFromUrl(String url) {
+        final String urlPath;
+        try {
+            urlPath = new URL(url).getPath();
+        } catch (Exception ignored) {
+            // Best effort: an unparsable URL simply yields no extension.
+            return null;
+        }
+
+        final int dotIndex = urlPath.lastIndexOf('.');
+        // The dot must be neither the first nor the last character of the path.
+        final boolean hasExtension = dotIndex > 0 && dotIndex < urlPath.length() - 1;
+        return hasExtension ? urlPath.substring(dotIndex + 1).toLowerCase() : null;
+    }
+
+    /**
+     * Clean up content images for a story.
+     *
+     * <p>Deletes {@code <uploadDir>/content/<storyId>} and everything below it. This is
+     * best-effort cleanup: I/O failures are logged and never propagated to the caller.
+     *
+     * @param storyId story whose content image directory is removed
+     */
+    public void deleteContentImages(UUID storyId) {
+        Path contentDir = Paths.get(getUploadDir(), ImageType.CONTENT.getDirectory(), storyId.toString());
+        if (!Files.exists(contentDir)) {
+            return;
+        }
+
+        // Files.walk keeps directory handles open until the stream is closed, so it must be
+        // managed by try-with-resources.
+        try (java.util.stream.Stream<Path> paths = Files.walk(contentDir)) {
+            // Reverse order visits children before their parent directory.
+            paths.sorted(Comparator.reverseOrder())
+                .map(Path::toFile)
+                .forEach(java.io.File::delete);
+        } catch (IOException | UncheckedIOException e) {
+            // Log but don't throw - this is cleanup
+            logger.warn("Failed to clean up content images for story {}: {}", storyId, e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Outcome of a content image processing run: the resulting HTML, the warnings collected
+     * along the way, and the paths of the images that were downloaded.
+     */
+    public static class ContentImageProcessingResult {
+        private final String processedContent;
+        private final List<String> warnings;
+        private final List<String> downloadedImages;
+
+        /**
+         * @param processedContent HTML after processing
+         * @param warnings         warnings recorded during processing
+         * @param downloadedImages paths of the downloaded images
+         */
+        public ContentImageProcessingResult(String processedContent, List<String> warnings, List<String> downloadedImages) {
+            this.downloadedImages = downloadedImages;
+            this.warnings = warnings;
+            this.processedContent = processedContent;
+        }
+
+        /** @return {@code true} when at least one warning was recorded */
+        public boolean hasWarnings() {
+            return !this.warnings.isEmpty();
+        }
+
+        /** @return HTML after processing */
+        public String getProcessedContent() {
+            return this.processedContent;
+        }
+
+        /** @return the warnings list passed to the constructor */
+        public List<String> getWarnings() {
+            return this.warnings;
+        }
+
+        /** @return the downloaded image list passed to the constructor */
+        public List<String> getDownloadedImages() {
+            return this.downloadedImages;
+        }
+    }
 }
--- a/backend/src/main/resources/html-sanitization-config.json
+++ b/backend/src/main/resources/html-sanitization-config.json
@@ -4,7 +4,7 @@
    "b", "strong", "i", "em", "u", "s", "strike", "del", "ins",
    "sup", "sub", "small", "big", "mark", "pre", "code", "kbd", "samp", "var",
    "ul", "ol", "li", "dl", "dt", "dd",
-    "a", "table", "thead", "tbody", "tfoot", "tr", "th", "td", "caption", "colgroup", "col",
+    "a", "img", "table", "thead", "tbody", "tfoot", "tr", "th", "td", "caption", "colgroup", "col",
    "blockquote", "cite", "q", "hr", "details", "summary"
  ],
  "allowedAttributes": {
@@ -18,6 +18,7 @@
    "h5": ["class", "style"],
    "h6": ["class", "style"],
    "a": ["class", "href", "title"],
+    "img": ["src", "alt", "width", "height", "class", "style"],
    "table": ["class", "style"],
    "th": ["class", "style", "colspan", "rowspan"],
    "td": ["class", "style", "colspan", "rowspan"],
@@ -41,6 +42,9 @@
  "allowedProtocols": {
    "a": {
      "href": ["http", "https", "#", "/"]
+    },
+    "img": {
+      "src": ["http", "https", "data", "/", "cid"]
    }
  },
  "description": "HTML sanitization configuration for StoryCove story content. This configuration is shared between frontend (DOMPurify) and backend (Jsoup) to ensure consistency."