diff --git a/frontend/src/lib/sanitization.ts b/frontend/src/lib/sanitization.ts
index 00a3afe..33988df 100644
--- a/frontend/src/lib/sanitization.ts
+++ b/frontend/src/lib/sanitization.ts
@@ -153,7 +153,8 @@ function createDOMPurifyConfig(config: SanitizationConfig) {
   const domPurifyConfig: DOMPurify.Config = {
     ALLOWED_TAGS: allowedTags,
     ALLOWED_ATTR: uniqueAttributes,
-    ALLOWED_URI_REGEXP: /^(?:(?:https?|#|\/):?\/?)[\w.\-#/?=&%]+$/i,
+    // Prefix-only check: http(s), fragment and relative URLs pass with any query string/token; javascript: and other schemes are rejected (data: stays limited to media tags by DOMPurify itself)
+    ALLOWED_URI_REGEXP: /^(?:https?:|[^a-z]|[a-z+.\-]+(?:[^a-z+.\-:]|$))/i,
     ALLOW_UNKNOWN_PROTOCOLS: false,
     SANITIZE_DOM: true,
     KEEP_CONTENT: true,
@@ -193,30 +194,52 @@ function preprocessFigureTags(html: string): string {
   const doc = parser.parseFromString(html, 'text/html');
   const figures = doc.querySelectorAll('figure');
 
-  figures.forEach(figure => {
-    // Find img tags within the figure
+  figures.forEach((figure) => {
+    // Find img tags anywhere within the figure (deep search)
     const images = figure.querySelectorAll('img');
 
     if (images.length > 0) {
       // Extract the first image
       const img = images[0];
 
-      // Check if there's a figcaption to preserve as alt text
-      const figcaption = figure.querySelector('figcaption');
-      if (figcaption && !img.hasAttribute('alt')) {
-        const captionText = figcaption.textContent?.trim();
-        if (captionText) {
-          img.setAttribute('alt', captionText);
+      // Get the src attribute - it might be in the src attribute or data-src
+      const imgSrc = img.getAttribute('src') || img.getAttribute('data-src') || img.src || '';
+
+      if (!imgSrc || imgSrc.trim() === '') {
+        figure.remove();
+        return;
+      }
+
+      // Create a clean img element with just the essential attributes
+      const cleanImg = doc.createElement('img');
+      cleanImg.setAttribute('src', imgSrc);
+
+      // Preserve alt text
+      const existingAlt = img.getAttribute('alt') || img.alt;
+      if (existingAlt) {
+        cleanImg.setAttribute('alt', existingAlt);
+      } else {
+        // Check if there's a figcaption to use as alt text
+        const figcaption = figure.querySelector('figcaption');
+        if (figcaption) {
+          const captionText = figcaption.textContent?.trim();
+          if (captionText) {
+            cleanImg.setAttribute('alt', captionText);
+          }
         }
       }
 
-      // Replace the figure element with just the img
-      figure.replaceWith(img.cloneNode(true));
-      debug.log('Extracted image from figure tag:', img.src);
+      // Preserve other useful attributes if they exist
+      const width = img.getAttribute('width') || img.width;
+      const height = img.getAttribute('height') || img.height;
+      if (width) cleanImg.setAttribute('width', width.toString());
+      if (height) cleanImg.setAttribute('height', height.toString());
+
+      // Replace the figure element with just the clean img
+      figure.replaceWith(cleanImg);
     } else {
       // No images in figure, remove it entirely
       figure.remove();
-      debug.log('Removed figure tag without images');
     }
   });
 
@@ -300,8 +323,10 @@ export function sanitizeHtmlSync(html: string): string {
       'blockquote', 'cite', 'q', 'hr', 'details', 'summary'
     ],
     ALLOWED_ATTR: [
-      'class', 'style', 'colspan', 'rowspan', 'src', 'alt', 'width', 'height'
+      'class', 'style', 'colspan', 'rowspan', 'src', 'alt', 'width', 'height', 'href', 'title'
     ],
+    // Prefix-only check: http(s), fragment and relative URLs pass with any query string/token; javascript: and other schemes are rejected (data: stays limited to media tags by DOMPurify itself)
+    ALLOWED_URI_REGEXP: /^(?:https?:|[^a-z]|[a-z+.\-]+(?:[^a-z+.\-:]|$))/i,
     ALLOW_UNKNOWN_PROTOCOLS: false,
     SANITIZE_DOM: true,
     KEEP_CONTENT: true,