scraping and improvements

2025-07-28 13:52:09 +02:00
parent f95d7aa8bb
commit fcad028959
31 changed files with 3788 additions and 118 deletions
--- a/frontend/src/lib/scraper/strategies/contentCleaner.ts
+++ b/frontend/src/lib/scraper/strategies/contentCleaner.ts
@@ -0,0 +1,164 @@
+// Dynamic cheerio import used to avoid client-side bundling issues
+// Using any type for CheerioAPI to prevent bundling issues
+import {
+  ChaptersStrategy,
+  ChapterContentStrategy,
+  MultipleTypesStrategy,
+  SchemaOrgStrategy,
+  ReactContentStrategy
+} from '../types';
+
+/**
+ * Pulls chapter HTML out of a loaded document.
+ *
+ * Resolution order:
+ *   1. When `config.chaptersWrapper` matches and holds more than one
+ *      `config.chapterSelector` element, the inner HTML of every such element
+ *      is concatenated (blank line between them) and trimmed.
+ *   2. Otherwise the inner HTML of `config.singleChapter`, when it matches.
+ *   3. Otherwise the inner HTML of the first `config.chapterSelector` match.
+ *
+ * @param $ cheerio-style selector function for the document
+ * @param config selectors describing where chapters live
+ * @returns the chapter HTML, or '' when nothing usable is found
+ */
+export function extractChapters(
+  $: any,
+  config: ChaptersStrategy
+): string {
+  const { chaptersWrapper, chapterSelector, singleChapter } = config;
+
+  // Several chapters under one wrapper: stitch them together.
+  if (chaptersWrapper) {
+    const wrapper = $(chaptersWrapper);
+    const matches = wrapper.length > 0 ? wrapper.find(chapterSelector) : null;
+    if (matches !== null && matches.length > 1) {
+      const pieces: string[] = [];
+      matches.each((_index: any, node: any) => {
+        pieces.push($(node).html() + '\n\n');
+      });
+      return pieces.join('').trim();
+    }
+  }
+
+  // Dedicated single-chapter selector.
+  if (singleChapter) {
+    const only = $(singleChapter);
+    if (only.length > 0) {
+      return only.html() || '';
+    }
+  }
+
+  // Last resort: first element matching the plain chapter selector.
+  return $(chapterSelector).first().html() || '';
+}
+
+/**
+ * Returns the inner HTML of the element matched by `config.selector` after
+ * stripping unwanted descendants.
+ *
+ * Every selector in `config.cleanupSelectors` (when provided) is looked up
+ * inside the matched content and the hits are removed from the document
+ * itself, so the passed-in DOM is mutated.
+ *
+ * @param $ cheerio-style selector function for the document
+ * @param config content selector plus optional cleanup selectors
+ * @returns the cleaned inner HTML, or '' when there is none
+ */
+export function extractChapterContent(
+  $: any,
+  config: ChapterContentStrategy
+): string {
+  const target = $(config.selector);
+  const unwanted = config.cleanupSelectors;
+
+  if (unwanted) {
+    for (const junkSelector of unwanted) {
+      target.find(junkSelector).remove();
+    }
+  }
+
+  const markup = target.html();
+  return markup ? markup : '';
+}
+
+/**
+ * Collects labelled tags from several selectors.
+ *
+ * `config.selectors` maps a type label to a selector. For each element a
+ * selector matches, its trimmed text is emitted as `"<type>: <text>"`.
+ * Elements whose text is empty after trimming are skipped.
+ *
+ * @param $ cheerio-style selector function for the document
+ * @param config map of type label to selector
+ * @returns the tags in selector order, then document order
+ */
+export function extractMultipleTypes(
+  $: any,
+  config: MultipleTypesStrategy
+): string[] {
+  const collected: string[] = [];
+
+  for (const [label, query] of Object.entries(config.selectors)) {
+    $(query).each((_index: any, node: any) => {
+      const text = $(node).text().trim();
+      if (!text) {
+        return;
+      }
+      collected.push(`${label}: ${text}`);
+    });
+  }
+
+  return collected;
+}
+
+/**
+ * Reads a schema.org property from the page's JSON-LD blocks.
+ *
+ * Each `<script type="application/ld+json">` is parsed in document order.
+ * The payload may be a single object or an array of objects; the first entry
+ * whose `@type` equals `config.schemaType` is inspected, and its
+ * `config.property` is used when it is a non-empty string or a number.
+ * Scripts containing invalid JSON are skipped.
+ *
+ * When no JSON-LD block yields a value, the trimmed text of the first
+ * `config.fallbackSelector` match is returned (if a fallback is configured).
+ *
+ * @param $ cheerio-style selector function for the document
+ * @param config schema type, property name and optional fallback selector
+ * @returns the property value as a string, or '' when nothing is found
+ */
+export function extractSchemaOrg(
+  $: any,
+  config: SchemaOrgStrategy
+): string {
+  // A `return` inside the each() callback only leaves the callback, so the
+  // value has to be carried out through this outer variable.
+  let found = '';
+
+  // Look for JSON-LD first
+  $('script[type="application/ld+json"]').each((_: any, elem: any) => {
+    try {
+      const data = JSON.parse($(elem).html() || '');
+      const candidates: any[] = Array.isArray(data) ? data : [data];
+      const item = candidates.find(
+        candidate => candidate && candidate['@type'] === config.schemaType
+      );
+      const value = item ? item[config.property] : undefined;
+
+      if (typeof value === 'string') {
+        found = value;
+      } else if (typeof value === 'number') {
+        found = String(value);
+      }
+    } catch (e) {
+      // Invalid JSON, continue
+    }
+
+    // Returning false stops the each() iteration once a value is captured.
+    return found === '';
+  });
+
+  if (found !== '') {
+    return found;
+  }
+
+  // Fallback to selector
+  if (config.fallbackSelector) {
+    return $(config.fallbackSelector).first().text().trim();
+  }
+
+  return '';
+}
+
+/**
+ * Gathers HTML from statically rendered paragraphs of a React page.
+ *
+ * Only markup already present in the document is considered; content that
+ * needs JavaScript execution or an API call is out of reach here.
+ *
+ * Elements matching `config.paragraphSelector` are visited in order, and the
+ * inner HTML of those carrying the class `config.contentClass` is joined
+ * with blank lines, then trimmed.
+ *
+ * @param $ cheerio-style selector function for the document
+ * @param config paragraph selector and required content class
+ * @returns the combined HTML, or '' when no paragraph qualifies
+ */
+export function extractReactContent(
+  $: any,
+  config: ReactContentStrategy
+): string {
+  const chunks: string[] = [];
+
+  $(config.paragraphSelector).each((_index: any, node: any) => {
+    const wrapped = $(node);
+    if (!wrapped.hasClass(config.contentClass)) {
+      return;
+    }
+    chunks.push(wrapped.html() + '\n\n');
+  });
+
+  return chunks.join('').trim();
+}
+
+/**
+ * Basic HTML cleaning: strips script, style, iframe, embed and object
+ * elements and drops `<p>`/`<div>` elements that are completely empty.
+ *
+ * Only the listed elements are removed; attributes (for example inline event
+ * handlers) are left as they are. Text nodes are not modified either: a
+ * `$('*')` traversal only yields element nodes, so it cannot reach text, and
+ * trimming individual text nodes would glue words to neighbouring inline
+ * tags.
+ *
+ * cheerio is imported dynamically to avoid client-side bundling issues.
+ *
+ * @param html raw HTML markup to clean
+ * @returns the serialized document after cleaning, or '' if empty
+ */
+export async function cleanHtml(html: string): Promise<string> {
+  const cheerio = await import('cheerio');
+  const $ = cheerio.load(html, {
+    // Preserve self-closing tags like <br>
+    xmlMode: false,
+    decodeEntities: false
+  });
+
+  // Remove dangerous elements
+  $('script, style, iframe, embed, object').remove();
+
+  // Remove empty paragraphs and divs. `:empty` only matches elements without
+  // any child element or text, so a <p> or <div> holding a <br> is never hit.
+  $('p:empty, div:empty').remove();
+
+  return $.html() || '';
+}
+
+/**
+ * Reads one attribute from the first element matching a selector.
+ *
+ * @param $ cheerio-style selector function for the document
+ * @param selector selector locating the element
+ * @param attribute name of the attribute to read
+ * @returns the attribute value, or '' when it is missing or empty
+ */
+export function extractAttribute(
+  $: any,
+  selector: string,
+  attribute: string
+): string {
+  const value = $(selector).first().attr(attribute);
+  return value ? value : '';
+}