phase 1 and 2 of embedded images

This commit is contained in:
Stefan Hardegger
2025-09-16 14:58:50 +02:00
parent c92308c24a
commit c7b516be31
14 changed files with 686 additions and 54 deletions

View File

@@ -228,6 +228,38 @@ public class StoryController {
Story story = storyService.updateReadingStatus(id, request.getIsRead());
return ResponseEntity.ok(convertToDto(story));
}
/**
 * Downloads the external images referenced by the submitted HTML for the given story and
 * returns the rewritten content.
 *
 * <p>The response body always contains {@code processedContent}, {@code downloadedImages} and
 * {@code hasWarnings}; {@code warnings} is added only when at least one image could not be
 * processed, so the client can decide whether to proceed.
 *
 * @param id      identifier of the story the images belong to
 * @param request payload carrying the HTML content to process
 * @return 200 with the processing result, or 500 with an {@code error} entry if processing throws
 */
@PostMapping("/{id}/process-content-images")
public ResponseEntity<Map<String, Object>> processContentImages(@PathVariable UUID id, @RequestBody ProcessContentImagesRequest request) {
    logger.info("Processing content images for story {}", id);
    try {
        // Process the HTML content to download and replace image URLs
        ImageService.ContentImageProcessingResult result = imageService.processContentImages(request.getHtmlContent(), id);

        // A null-tolerant map is required here: the service hands back the input unchanged when
        // it is null or blank, and Map.of(...) throws NullPointerException on null values, which
        // previously turned an empty request into a 500 response.
        Map<String, Object> body = new java.util.LinkedHashMap<>();
        body.put("processedContent", result.getProcessedContent());
        if (result.hasWarnings()) {
            // If there are warnings, let the client decide whether to proceed
            body.put("warnings", result.getWarnings());
        }
        body.put("downloadedImages", result.getDownloadedImages());
        body.put("hasWarnings", result.hasWarnings());
        return ResponseEntity.ok(body);
    } catch (Exception e) {
        logger.error("Failed to process content images for story {}", id, e);
        return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR)
                .body(Map.of("error", "Failed to process content images: " + e.getMessage()));
    }
}
@PostMapping("/reindex")
public ResponseEntity<String> manualReindex() {
@@ -458,7 +490,14 @@ public class StoryController {
story.setDescription(updateReq.getDescription());
}
if (updateReq.getContentHtml() != null) {
story.setContentHtml(sanitizationService.sanitize(updateReq.getContentHtml()));
logger.info("Content before sanitization (length: {}): {}",
updateReq.getContentHtml().length(),
updateReq.getContentHtml().substring(0, Math.min(500, updateReq.getContentHtml().length())));
String sanitizedContent = sanitizationService.sanitize(updateReq.getContentHtml());
logger.info("Content after sanitization (length: {}): {}",
sanitizedContent.length(),
sanitizedContent.substring(0, Math.min(500, sanitizedContent.length())));
story.setContentHtml(sanitizedContent);
}
if (updateReq.getSourceUrl() != null) {
story.setSourceUrl(updateReq.getSourceUrl());

View File

@@ -0,0 +1,23 @@
package com.storycove.dto;
import jakarta.validation.constraints.NotBlank;
/**
 * Request body carrying the HTML content whose images should be processed.
 *
 * <p>Mutable bean with a no-argument constructor plus getter/setter for its single property.
 */
public class ProcessContentImagesRequest {

    /** Raw HTML content; annotated as required (must not be blank). */
    @NotBlank(message = "HTML content is required")
    private String htmlContent;

    /** Creates an empty request; {@code htmlContent} stays {@code null} until set. */
    public ProcessContentImagesRequest() {
    }

    /**
     * Creates a request for the given HTML.
     *
     * @param htmlContent the HTML content to process
     */
    public ProcessContentImagesRequest(final String htmlContent) {
        this.htmlContent = htmlContent;
    }

    /**
     * @return the HTML content carried by this request
     */
    public String getHtmlContent() {
        return this.htmlContent;
    }

    /**
     * @param htmlContent the HTML content to process
     */
    public void setHtmlContent(final String htmlContent) {
        this.htmlContent = htmlContent;
    }
}

View File

@@ -54,7 +54,7 @@ public class HtmlSanitizationService {
"p", "br", "div", "span", "h1", "h2", "h3", "h4", "h5", "h6",
"b", "strong", "i", "em", "u", "s", "strike", "del", "ins",
"sup", "sub", "small", "big", "mark", "pre", "code",
"ul", "ol", "li", "dl", "dt", "dd", "a",
"ul", "ol", "li", "dl", "dt", "dd", "a", "img",
"table", "thead", "tbody", "tfoot", "tr", "th", "td", "caption",
"blockquote", "cite", "q", "hr"
));
@@ -65,13 +65,13 @@ public class HtmlSanitizationService {
}
private void createSafelist() {
this.allowlist = new Safelist();
this.allowlist = Safelist.relaxed();
// Add allowed tags
if (config.getAllowedTags() != null) {
config.getAllowedTags().forEach(allowlist::addTags);
}
// Add allowed attributes
if (config.getAllowedAttributes() != null) {
for (Map.Entry<String, List<String>> entry : config.getAllowedAttributes().entrySet()) {
@@ -82,25 +82,33 @@ public class HtmlSanitizationService {
}
}
}
// Configure allowed protocols for specific attributes (e.g., href)
// Special handling for img tags - allow all src attributes and validate later
allowlist.removeProtocols("img", "src", "http", "https");
// This is the key: preserve relative URLs by not restricting them
allowlist.preserveRelativeLinks(true);
// Configure allowed protocols for other attributes
if (config.getAllowedProtocols() != null) {
for (Map.Entry<String, Map<String, List<String>>> tagEntry : config.getAllowedProtocols().entrySet()) {
String tag = tagEntry.getKey();
Map<String, List<String>> attributeProtocols = tagEntry.getValue();
if (attributeProtocols != null) {
for (Map.Entry<String, List<String>> attrEntry : attributeProtocols.entrySet()) {
String attribute = attrEntry.getKey();
List<String> protocols = attrEntry.getValue();
if (protocols != null) {
if (protocols != null && !("img".equals(tag) && "src".equals(attribute))) {
// Skip img src since we handled it above
allowlist.addProtocols(tag, attribute, protocols.toArray(new String[0]));
}
}
}
}
}
logger.info("Configured Jsoup Safelist with preserveRelativeLinks=true for local image URLs");
// Remove specific attributes if needed (deprecated in favor of protocol control)
if (config.getRemovedAttributes() != null) {
@@ -133,8 +141,10 @@ public class HtmlSanitizationService {
if (html == null || html.trim().isEmpty()) {
return "";
}
return Jsoup.clean(html, allowlist);
logger.info("Content before sanitization: "+html);
String saniztedHtml = Jsoup.clean(html, allowlist.preserveRelativeLinks(true));
logger.info("Content after sanitization: "+saniztedHtml);
return saniztedHtml;
}
public String extractPlainText(String html) {

View File

@@ -1,5 +1,7 @@
package com.storycove.service;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
@@ -8,18 +10,22 @@ import org.springframework.web.multipart.MultipartFile;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Set;
import java.util.UUID;
import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@Service
public class ImageService {
private static final Logger logger = LoggerFactory.getLogger(ImageService.class);
private static final Set<String> ALLOWED_CONTENT_TYPES = Set.of(
"image/jpeg", "image/jpg", "image/png"
);
@@ -53,14 +59,15 @@ public class ImageService {
public enum ImageType {
COVER("covers"),
AVATAR("avatars");
AVATAR("avatars"),
CONTENT("content");
private final String directory;
ImageType(String directory) {
this.directory = directory;
}
public String getDirectory() {
return directory;
}
@@ -182,6 +189,9 @@ public class ImageService {
maxWidth = avatarMaxSize;
maxHeight = avatarMaxSize;
break;
case CONTENT:
// Content images are not resized
return new Dimension(originalWidth, originalHeight);
default:
return new Dimension(originalWidth, originalHeight);
}
@@ -228,4 +238,224 @@ public class ImageService {
String extension = getFileExtension(filename);
return ALLOWED_EXTENSIONS.contains(extension);
}
// Content image processing methods
/**
 * Matches an {@code <img>} tag and captures the value of its quoted {@code src} attribute.
 *
 * <p>The lazy prefix followed by mandatory whitespace makes the match bind to the real
 * {@code src} attribute instead of another attribute whose name merely ends in "src"
 * (for example {@code data-src}).
 */
private static final Pattern CONTENT_IMG_SRC_PATTERN = Pattern.compile(
        "<img\\b[^>]*?\\ssrc\\s*=\\s*[\"']([^\"']+)[\"'][^>]*>", Pattern.CASE_INSENSITIVE);

/**
 * Process HTML content and download all referenced images, replacing URLs with local paths.
 *
 * <p>Images whose {@code src} starts with "/" or "data:" are left untouched. If downloading an
 * image fails, its tag is kept as-is and a warning is recorded instead of aborting the run.
 *
 * @param htmlContent the HTML to scan; {@code null} or blank content is returned unchanged
 * @param storyId     the story the downloaded images are stored under
 * @return the rewritten HTML together with the collected warnings and the relative paths of
 *         the images that were downloaded
 */
public ContentImageProcessingResult processContentImages(String htmlContent, UUID storyId) {
    logger.info("Processing content images for story: {}, content length: {}", storyId,
            htmlContent != null ? htmlContent.length() : 0);

    List<String> warnings = new ArrayList<>();
    List<String> downloadedImages = new ArrayList<>();

    if (htmlContent == null || htmlContent.trim().isEmpty()) {
        logger.info("No content to process for story: {}", storyId);
        return new ContentImageProcessingResult(htmlContent, warnings, downloadedImages);
    }

    Matcher matcher = CONTENT_IMG_SRC_PATTERN.matcher(htmlContent);
    int imageCount = 0;
    int externalImageCount = 0;
    StringBuilder processedContent = new StringBuilder(htmlContent.length());

    while (matcher.find()) {
        String fullImgTag = matcher.group(0);
        String imageUrl = matcher.group(1);
        imageCount++;
        logger.info("Found image #{}: {} in tag: {}", imageCount, imageUrl, fullImgTag);

        // Unless a download succeeds, the tag is re-emitted exactly as it was found.
        String replacementTag = fullImgTag;

        if (imageUrl.startsWith("/") || imageUrl.startsWith("data:")) {
            // Skip if it's already a local path or data URL
            logger.info("Skipping local/data URL: {}", imageUrl);
        } else {
            externalImageCount++;
            logger.info("Processing external image #{}: {}", externalImageCount, imageUrl);
            try {
                // Download and store the image
                String localPath = downloadImageFromUrl(imageUrl, storyId);
                downloadedImages.add(localPath);

                // Generate local URL
                String localUrl = getLocalImageUrl(storyId, localPath);
                logger.info("Downloaded image: {} -> {}", imageUrl, localUrl);

                // Splice the new URL over the captured group (including its surrounding quotes,
                // which are normalised to double quotes). Working with match offsets avoids
                // building a second regex from the URL and any '$' / '\' replacement pitfalls.
                int openingQuote = matcher.start(1) - matcher.start() - 1;
                int closingQuote = matcher.end(1) - matcher.start();
                replacementTag = fullImgTag.substring(0, openingQuote)
                        + "\"" + localUrl + "\""
                        + fullImgTag.substring(closingQuote + 1);
                logger.info("Replaced img tag: {} -> {}", fullImgTag, replacementTag);
            } catch (Exception e) {
                logger.error("Failed to download image: {} - {}", imageUrl, e.getMessage(), e);
                warnings.add("Failed to download image: " + imageUrl + " - " + e.getMessage());
                // Keep original URL in case of failure
            }
        }

        matcher.appendReplacement(processedContent, Matcher.quoteReplacement(replacementTag));
    }
    matcher.appendTail(processedContent);

    logger.info("Finished processing images for story: {}. Found {} total images, {} external. Downloaded {} images, {} warnings",
            storyId, imageCount, externalImageCount, downloadedImages.size(), warnings.size());
    return new ContentImageProcessingResult(processedContent.toString(), warnings, downloadedImages);
}
/**
 * Download an image from an HTTP(S) URL and store it in the story's content-image directory.
 *
 * <p>The file extension is derived from the response content type, falling back to the URL
 * path, and must be one of {@code ALLOWED_EXTENSIONS}. The payload must also be decodable by
 * {@link ImageIO}. Nothing is written to disk until both checks have passed.
 *
 * @param imageUrl absolute URL of the image to fetch
 * @param storyId  story whose content directory receives the file
 * @return the stored file's path relative to the upload directory
 * @throws IllegalArgumentException if the URL is not http/https or the image format is not allowed
 * @throws IOException              if the download fails or the payload is not a readable image
 */
private String downloadImageFromUrl(String imageUrl, UUID storyId) throws IOException {
    URL url = new URL(imageUrl);

    // Only HTTP(S) connections can be cast to HttpURLConnection below; rejecting other schemes
    // (file:, ftp:, jar:, ...) up front gives a clear error instead of a ClassCastException.
    String protocol = url.getProtocol();
    if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
        throw new IllegalArgumentException("Unsupported image URL protocol: " + protocol);
    }

    // NOTE(review): the URL originates from story HTML; consider restricting target hosts and
    // capping the download size, since the whole response is buffered in memory below.
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();

    // Set a reasonable user agent to avoid blocks
    connection.setRequestProperty("User-Agent", "Mozilla/5.0 (StoryCove Image Processor)");
    connection.setConnectTimeout(30000); // 30 seconds
    connection.setReadTimeout(30000);

    try (InputStream inputStream = connection.getInputStream()) {
        // Get content type to determine file extension
        String contentType = connection.getContentType();
        String extension = getExtensionFromContentType(contentType);
        if (extension == null) {
            // Try to extract from URL
            extension = getExtensionFromUrl(imageUrl);
        }
        if (extension == null || !ALLOWED_EXTENSIONS.contains(extension.toLowerCase())) {
            throw new IllegalArgumentException("Unsupported image format: " + contentType);
        }

        // Read and validate the image before touching the file system, so a rejected download
        // does not leave an empty story directory behind.
        byte[] imageData = inputStream.readAllBytes();
        BufferedImage image = ImageIO.read(new ByteArrayInputStream(imageData));
        if (image == null) {
            throw new IOException("Invalid image format");
        }

        // Create directories for content images
        Path contentDir = Paths.get(getUploadDir(), ImageType.CONTENT.getDirectory(), storyId.toString());
        Files.createDirectories(contentDir);

        // Generate unique filename and save the original bytes
        String filename = UUID.randomUUID().toString() + "." + extension.toLowerCase();
        Files.write(contentDir.resolve(filename), imageData);

        // Return relative path
        return ImageType.CONTENT.getDirectory() + "/" + storyId.toString() + "/" + filename;
    } finally {
        connection.disconnect();
    }
}
/**
 * Builds the URL under which a stored content image is served.
 *
 * <p>The URL is scoped by the current library id; when no library id is available the
 * "default" segment is used instead and a warning is logged.
 *
 * @param storyId   story the image belongs to (used for logging only)
 * @param imagePath image path relative to the upload directory
 * @return the serving URL for the image
 */
private String getLocalImageUrl(UUID storyId, String imagePath) {
    final String libraryId = libraryService.getCurrentLibraryId();
    final boolean libraryKnown = libraryId != null && !libraryId.trim().isEmpty();

    if (libraryKnown) {
        final String servedUrl = "/api/files/images/" + libraryId + "/" + imagePath;
        logger.info("Generated local image URL: {} for story: {}", servedUrl, storyId);
        return servedUrl;
    }

    logger.warn("Current library ID is null or empty when generating local image URL for story: {}", storyId);
    return "/api/files/images/default/" + imagePath;
}
/**
 * Get file extension from content type.
 *
 * <p>Media-type parameters (e.g. {@code "image/jpeg; charset=binary"}) and surrounding
 * whitespace are ignored, and matching is case-insensitive regardless of the default locale.
 *
 * @param contentType value of a Content-Type header, may be {@code null}
 * @return {@code "jpg"} or {@code "png"} for supported image types, otherwise {@code null}
 */
private String getExtensionFromContentType(String contentType) {
    if (contentType == null) {
        return null;
    }

    // Drop any ";param=value" suffix so only the bare media type is compared.
    int parametersStart = contentType.indexOf(';');
    String mediaType = parametersStart >= 0 ? contentType.substring(0, parametersStart) : contentType;

    // Locale.ROOT keeps lowercasing stable (e.g. under a Turkish default locale).
    switch (mediaType.trim().toLowerCase(java.util.Locale.ROOT)) {
        case "image/jpeg":
        case "image/jpg":
            return "jpg";
        case "image/png":
            return "png";
        default:
            return null;
    }
}
/**
 * Extract file extension from URL.
 *
 * <p>Only the last path segment is considered, so a dot inside a directory name
 * (e.g. {@code /v1.2/image}) is not mistaken for an extension. Query string and fragment
 * are ignored because only the URL path is inspected.
 *
 * @param url absolute URL to inspect
 * @return the lower-cased extension without the dot, or {@code null} if the URL cannot be
 *         parsed or its last path segment has no extension
 */
private String getExtensionFromUrl(String url) {
    try {
        String path = new URL(url).getPath();
        int lastSlash = path.lastIndexOf('/');
        int lastDot = path.lastIndexOf('.');
        boolean dotInLastSegment = lastDot > 0 && lastDot > lastSlash;
        if (dotInLastSegment && lastDot < path.length() - 1) {
            return path.substring(lastDot + 1).toLowerCase(java.util.Locale.ROOT);
        }
    } catch (Exception ignored) {
        // An unparsable URL simply has no usable extension; the caller treats null as "unknown".
    }
    return null;
}
/**
 * Clean up content images for a story.
 *
 * <p>Removes the story's content-image directory and everything below it. This is best-effort
 * cleanup: I/O failures are logged and never propagated to the caller.
 *
 * @param storyId story whose content images should be removed
 */
public void deleteContentImages(UUID storyId) {
    try {
        Path contentDir = Paths.get(getUploadDir(), ImageType.CONTENT.getDirectory(), storyId.toString());
        if (Files.exists(contentDir)) {
            // Files.walk holds open directory handles until the stream is closed.
            try (java.util.stream.Stream<Path> entries = Files.walk(contentDir)) {
                // Reverse order visits children before their parent directory.
                entries.sorted(Comparator.reverseOrder())
                        .map(Path::toFile)
                        .forEach(java.io.File::delete);
            }
        }
    } catch (IOException | java.io.UncheckedIOException e) {
        // Log but don't throw - this is cleanup
        logger.warn("Failed to clean up content images for story {}: {}", storyId, e.getMessage(), e);
    }
}
/**
 * Outcome of a content-image processing run: the resulting HTML, the warnings collected
 * along the way, and the images that were downloaded.
 */
public static class ContentImageProcessingResult {

    private final String processedContent;
    private final List<String> warnings;
    private final List<String> downloadedImages;

    /**
     * @param processedContent the HTML after processing
     * @param warnings         warnings collected during processing
     * @param downloadedImages the images that were downloaded
     */
    public ContentImageProcessingResult(String processedContent, List<String> warnings, List<String> downloadedImages) {
        this.downloadedImages = downloadedImages;
        this.warnings = warnings;
        this.processedContent = processedContent;
    }

    /** @return the HTML after processing */
    public String getProcessedContent() {
        return this.processedContent;
    }

    /** @return the warnings list passed at construction (not copied) */
    public List<String> getWarnings() {
        return this.warnings;
    }

    /** @return the downloaded-images list passed at construction (not copied) */
    public List<String> getDownloadedImages() {
        return this.downloadedImages;
    }

    /** @return {@code true} when at least one warning was recorded */
    public boolean hasWarnings() {
        boolean noWarnings = this.warnings.isEmpty();
        return !noWarnings;
    }
}
}

View File

@@ -4,7 +4,7 @@
"b", "strong", "i", "em", "u", "s", "strike", "del", "ins",
"sup", "sub", "small", "big", "mark", "pre", "code", "kbd", "samp", "var",
"ul", "ol", "li", "dl", "dt", "dd",
"a", "table", "thead", "tbody", "tfoot", "tr", "th", "td", "caption", "colgroup", "col",
"a", "img", "table", "thead", "tbody", "tfoot", "tr", "th", "td", "caption", "colgroup", "col",
"blockquote", "cite", "q", "hr", "details", "summary"
],
"allowedAttributes": {
@@ -18,6 +18,7 @@
"h5": ["class", "style"],
"h6": ["class", "style"],
"a": ["class", "href", "title"],
"img": ["src", "alt", "width", "height", "class", "style"],
"table": ["class", "style"],
"th": ["class", "style", "colspan", "rowspan"],
"td": ["class", "style", "colspan", "rowspan"],
@@ -41,6 +42,9 @@
"allowedProtocols": {
"a": {
"href": ["http", "https", "#", "/"]
},
"img": {
"src": ["http", "https", "data", "/", "cid"]
}
},
"description": "HTML sanitization configuration for StoryCove story content. This configuration is shared between frontend (DOMPurify) and backend (Jsoup) to ensure consistency."