fix embedded images on deviantart
This commit is contained in:
@@ -137,12 +137,63 @@ public class HtmlSanitizationService {
|
|||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Preprocess HTML to extract images from figure tags before sanitization
|
||||||
|
*/
|
||||||
|
private String preprocessFigureTags(String html) {
|
||||||
|
if (html == null || html.trim().isEmpty()) {
|
||||||
|
return html;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
org.jsoup.nodes.Document doc = Jsoup.parse(html);
|
||||||
|
org.jsoup.select.Elements figures = doc.select("figure");
|
||||||
|
|
||||||
|
for (org.jsoup.nodes.Element figure : figures) {
|
||||||
|
// Find img tags within the figure
|
||||||
|
org.jsoup.select.Elements images = figure.select("img");
|
||||||
|
|
||||||
|
if (!images.isEmpty()) {
|
||||||
|
// Extract the first image and replace the figure with it
|
||||||
|
org.jsoup.nodes.Element img = images.first();
|
||||||
|
|
||||||
|
// Check if there's a figcaption to preserve as alt text
|
||||||
|
org.jsoup.select.Elements figcaptions = figure.select("figcaption");
|
||||||
|
if (!figcaptions.isEmpty() && !img.hasAttr("alt")) {
|
||||||
|
String captionText = figcaptions.first().text();
|
||||||
|
if (captionText != null && !captionText.trim().isEmpty()) {
|
||||||
|
img.attr("alt", captionText);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace the figure element with just the img
|
||||||
|
figure.replaceWith(img.clone());
|
||||||
|
logger.debug("Extracted image from figure tag: {}", img.attr("src"));
|
||||||
|
} else {
|
||||||
|
// No images in figure, remove it entirely
|
||||||
|
figure.remove();
|
||||||
|
logger.debug("Removed figure tag without images");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return doc.body().html();
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.warn("Failed to preprocess figure tags, returning original HTML: {}", e.getMessage());
|
||||||
|
return html;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public String sanitize(String html) {
|
public String sanitize(String html) {
|
||||||
if (html == null || html.trim().isEmpty()) {
|
if (html == null || html.trim().isEmpty()) {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Content before sanitization: "+html);
|
logger.info("Content before sanitization: "+html);
|
||||||
String saniztedHtml = Jsoup.clean(html, allowlist.preserveRelativeLinks(true));
|
|
||||||
|
// Preprocess to extract images from figure tags
|
||||||
|
String preprocessed = preprocessFigureTags(html);
|
||||||
|
|
||||||
|
String saniztedHtml = Jsoup.clean(preprocessed, allowlist.preserveRelativeLinks(true));
|
||||||
logger.info("Content after sanitization: "+saniztedHtml);
|
logger.info("Content after sanitization: "+saniztedHtml);
|
||||||
return saniztedHtml;
|
return saniztedHtml;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -180,6 +180,53 @@ function createDOMPurifyConfig(config: SanitizationConfig) {
|
|||||||
return domPurifyConfig;
|
return domPurifyConfig;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Preprocess HTML so that every `<figure>` element is collapsed before sanitization.
 *
 * A figure that contains at least one `<img>` is replaced by a copy of its first image.
 * If that image has no `alt` attribute and the figure has a non-blank `<figcaption>`,
 * the trimmed caption text becomes the image's `alt`. A figure without any image is
 * removed together with its content.
 *
 * @param html - Raw HTML fragment.
 * @returns The input itself when it is empty or blank, contains no `<figure>` tag, no
 *   `DOMParser` is available, or preprocessing fails; otherwise the re-serialized
 *   fragment with the figures replaced or removed.
 */
function preprocessFigureTags(html: string): string {
  if (!html || html.trim() === '') {
    return html;
  }

  // Content without any figure tag is handed back untouched, so it is neither parsed
  // nor re-serialized here. Tag names are case-insensitive in HTML.
  if (!/<figure\b/i.test(html)) {
    return html;
  }

  // Outside a DOM environment there is no DOMParser; return the input as-is instead of
  // throwing a ReferenceError and logging a warning on every call.
  if (typeof DOMParser === 'undefined') {
    return html;
  }

  try {
    const doc = new DOMParser().parseFromString(html, 'text/html');

    doc.querySelectorAll('figure').forEach((figure) => {
      // A nested figure whose outer figure was already replaced or removed is no longer
      // part of the document; touching it would change nothing in the output.
      if (!doc.contains(figure)) {
        return;
      }

      const img = figure.querySelector('img');
      if (!img) {
        // No images in figure, remove it entirely
        figure.remove();
        debug.log('Removed figure tag without images');
        return;
      }

      // Preserve the caption as alt text, but never overwrite an existing alt
      if (!img.hasAttribute('alt')) {
        const captionText = figure.querySelector('figcaption')?.textContent?.trim();
        if (captionText) {
          img.setAttribute('alt', captionText);
        }
      }

      // Replace the figure element with just the img
      figure.replaceWith(img.cloneNode(true));
      debug.log('Extracted image from figure tag:', img.src);
    });

    return doc.body.innerHTML;
  } catch (error: unknown) {
    // Best effort: fall back to the unmodified input
    console.warn('Failed to preprocess figure tags, returning original HTML:', error);
    return html;
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sanitize HTML content using shared configuration from backend
|
* Sanitize HTML content using shared configuration from backend
|
||||||
*/
|
*/
|
||||||
@@ -189,12 +236,15 @@ export async function sanitizeHtml(html: string): Promise<string> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
// Preprocess to extract images from figure tags
|
||||||
|
const preprocessed = preprocessFigureTags(html);
|
||||||
|
|
||||||
const config = await fetchSanitizationConfig();
|
const config = await fetchSanitizationConfig();
|
||||||
const domPurifyConfig = createDOMPurifyConfig(config);
|
const domPurifyConfig = createDOMPurifyConfig(config);
|
||||||
|
|
||||||
// Configure DOMPurify with our settings
|
// Configure DOMPurify with our settings
|
||||||
const cleanHtml = DOMPurify.sanitize(html, domPurifyConfig as any);
|
const cleanHtml = DOMPurify.sanitize(preprocessed, domPurifyConfig as any);
|
||||||
|
|
||||||
return cleanHtml.toString();
|
return cleanHtml.toString();
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error during HTML sanitization:', error);
|
console.error('Error during HTML sanitization:', error);
|
||||||
@@ -212,10 +262,13 @@ export function sanitizeHtmlSync(html: string): string {
|
|||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Preprocess to extract images from figure tags
|
||||||
|
const preprocessed = preprocessFigureTags(html);
|
||||||
|
|
||||||
// If we have cached config, use it
|
// If we have cached config, use it
|
||||||
if (cachedConfig) {
|
if (cachedConfig) {
|
||||||
const domPurifyConfig = createDOMPurifyConfig(cachedConfig);
|
const domPurifyConfig = createDOMPurifyConfig(cachedConfig);
|
||||||
return DOMPurify.sanitize(html, domPurifyConfig as any).toString();
|
return DOMPurify.sanitize(preprocessed, domPurifyConfig as any).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we don't have cached config but there's an ongoing request, wait for it
|
// If we don't have cached config but there's an ongoing request, wait for it
|
||||||
@@ -270,8 +323,8 @@ export function sanitizeHtmlSync(html: string): string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
return DOMPurify.sanitize(html, fallbackConfig as any).toString();
|
return DOMPurify.sanitize(preprocessed, fallbackConfig as any).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user