Fix embedded images on DeviantArt
This commit is contained in:
@@ -137,12 +137,63 @@ public class HtmlSanitizationService {
|
||||
return config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Preprocess HTML to extract images from figure tags before sanitization
|
||||
*/
|
||||
private String preprocessFigureTags(String html) {
|
||||
if (html == null || html.trim().isEmpty()) {
|
||||
return html;
|
||||
}
|
||||
|
||||
try {
|
||||
org.jsoup.nodes.Document doc = Jsoup.parse(html);
|
||||
org.jsoup.select.Elements figures = doc.select("figure");
|
||||
|
||||
for (org.jsoup.nodes.Element figure : figures) {
|
||||
// Find img tags within the figure
|
||||
org.jsoup.select.Elements images = figure.select("img");
|
||||
|
||||
if (!images.isEmpty()) {
|
||||
// Extract the first image and replace the figure with it
|
||||
org.jsoup.nodes.Element img = images.first();
|
||||
|
||||
// Check if there's a figcaption to preserve as alt text
|
||||
org.jsoup.select.Elements figcaptions = figure.select("figcaption");
|
||||
if (!figcaptions.isEmpty() && !img.hasAttr("alt")) {
|
||||
String captionText = figcaptions.first().text();
|
||||
if (captionText != null && !captionText.trim().isEmpty()) {
|
||||
img.attr("alt", captionText);
|
||||
}
|
||||
}
|
||||
|
||||
// Replace the figure element with just the img
|
||||
figure.replaceWith(img.clone());
|
||||
logger.debug("Extracted image from figure tag: {}", img.attr("src"));
|
||||
} else {
|
||||
// No images in figure, remove it entirely
|
||||
figure.remove();
|
||||
logger.debug("Removed figure tag without images");
|
||||
}
|
||||
}
|
||||
|
||||
return doc.body().html();
|
||||
} catch (Exception e) {
|
||||
logger.warn("Failed to preprocess figure tags, returning original HTML: {}", e.getMessage());
|
||||
return html;
|
||||
}
|
||||
}
|
||||
|
||||
public String sanitize(String html) {
|
||||
if (html == null || html.trim().isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
logger.info("Content before sanitization: "+html);
|
||||
String saniztedHtml = Jsoup.clean(html, allowlist.preserveRelativeLinks(true));
|
||||
|
||||
// Preprocess to extract images from figure tags
|
||||
String preprocessed = preprocessFigureTags(html);
|
||||
|
||||
String saniztedHtml = Jsoup.clean(preprocessed, allowlist.preserveRelativeLinks(true));
|
||||
logger.info("Content after sanitization: "+saniztedHtml);
|
||||
return saniztedHtml;
|
||||
}
|
||||
|
||||
@@ -180,6 +180,53 @@ function createDOMPurifyConfig(config: SanitizationConfig) {
|
||||
return domPurifyConfig;
|
||||
}
|
||||
|
||||
/**
 * Flattens `<figure>` elements in the given markup before it is sanitized.
 *
 * The markup is parsed as `text/html` with `DOMParser`. Each `<figure>` that
 * contains an `<img>` is replaced by a deep clone of its first image; when that
 * image has no `alt` attribute and the figure has a `<figcaption>` with
 * non-empty trimmed text, the caption is copied into `alt` first. A `<figure>`
 * without any image is removed entirely.
 *
 * @param html - Raw markup; empty or whitespace-only input is returned as is.
 * @returns The `innerHTML` of the parsed body after the rewrite, or the
 * original `html` when anything throws during processing.
 */
function preprocessFigureTags(html: string): string {
  const isBlank = !html || html.trim() === '';
  if (isBlank) {
    return html;
  }

  try {
    const parsed = new DOMParser().parseFromString(html, 'text/html');

    for (const figureEl of Array.from(parsed.querySelectorAll('figure'))) {
      // Only the first image in the figure is kept.
      const firstImage = figureEl.querySelector('img');

      // Guard clause: a figure with no image is dropped as a whole.
      if (firstImage === null) {
        figureEl.remove();
        debug.log('Removed figure tag without images');
        continue;
      }

      // Carry the caption over as alt text, but never overwrite an existing alt attribute.
      const captionEl = figureEl.querySelector('figcaption');
      if (captionEl && !firstImage.hasAttribute('alt')) {
        const caption = captionEl.textContent?.trim();
        if (caption) {
          firstImage.setAttribute('alt', caption);
        }
      }

      // A deep clone is inserted in place of the whole figure.
      figureEl.replaceWith(firstImage.cloneNode(true));
      debug.log('Extracted image from figure tag:', firstImage.src);
    }

    return parsed.body.innerHTML;
  } catch (error) {
    // Best effort: on failure the caller still receives the untouched markup.
    console.warn('Failed to preprocess figure tags, returning original HTML:', error);
    return html;
  }
}
|
||||
|
||||
/**
|
||||
* Sanitize HTML content using shared configuration from backend
|
||||
*/
|
||||
@@ -189,11 +236,14 @@ export async function sanitizeHtml(html: string): Promise<string> {
|
||||
}
|
||||
|
||||
try {
|
||||
// Preprocess to extract images from figure tags
|
||||
const preprocessed = preprocessFigureTags(html);
|
||||
|
||||
const config = await fetchSanitizationConfig();
|
||||
const domPurifyConfig = createDOMPurifyConfig(config);
|
||||
|
||||
// Configure DOMPurify with our settings
|
||||
const cleanHtml = DOMPurify.sanitize(html, domPurifyConfig as any);
|
||||
const cleanHtml = DOMPurify.sanitize(preprocessed, domPurifyConfig as any);
|
||||
|
||||
return cleanHtml.toString();
|
||||
} catch (error) {
|
||||
@@ -212,10 +262,13 @@ export function sanitizeHtmlSync(html: string): string {
|
||||
return '';
|
||||
}
|
||||
|
||||
// Preprocess to extract images from figure tags
|
||||
const preprocessed = preprocessFigureTags(html);
|
||||
|
||||
// If we have cached config, use it
|
||||
if (cachedConfig) {
|
||||
const domPurifyConfig = createDOMPurifyConfig(cachedConfig);
|
||||
return DOMPurify.sanitize(html, domPurifyConfig as any).toString();
|
||||
return DOMPurify.sanitize(preprocessed, domPurifyConfig as any).toString();
|
||||
}
|
||||
|
||||
// If we don't have cached config but there's an ongoing request, wait for it
|
||||
@@ -271,7 +324,7 @@ export function sanitizeHtmlSync(html: string): string {
|
||||
}
|
||||
});
|
||||
|
||||
return DOMPurify.sanitize(html, fallbackConfig as any).toString();
|
||||
return DOMPurify.sanitize(preprocessed, fallbackConfig as any).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user