// Dynamic cheerio import used to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import type {
  ChaptersStrategy,
  ChapterContentStrategy,
  MultipleTypesStrategy,
  SchemaOrgStrategy,
  ReactContentStrategy
} from '../types';

/**
 * Extracts chapter HTML using, in order of preference:
 *  1. every `chapterSelector` match inside `chaptersWrapper` (only when more
 *     than one chapter is found), joined by blank lines;
 *  2. the `singleChapter` selector;
 *  3. the first `chapterSelector` match in the whole document.
 *
 * @param $ - Loaded cheerio instance (typed as `any`, see header comment).
 * @param config - Selectors describing where chapters live.
 * @returns The inner HTML of the matched chapter(s), or `''` when nothing matches.
 */
export function extractChapters($: any, config: ChaptersStrategy): string {
  // Check for multiple chapters first
  if (config.chaptersWrapper) {
    const chaptersWrapper = $(config.chaptersWrapper);
    if (chaptersWrapper.length > 0) {
      const chapters = chaptersWrapper.find(config.chapterSelector);
      if (chapters.length > 1) {
        // Multiple chapters - combine them. `.html()` may yield null, which
        // must not be concatenated as the literal text "null".
        return chapters
          .toArray()
          .map((elem: any) => $(elem).html() || '')
          .join('\n\n')
          .trim();
      }
    }
  }

  // Single chapter fallback
  if (config.singleChapter) {
    const singleChapter = $(config.singleChapter);
    if (singleChapter.length > 0) {
      return singleChapter.html() || '';
    }
  }

  // Direct chapter selector fallback
  const chapter = $(config.chapterSelector).first();
  return chapter.html() || '';
}

/**
 * Extracts the inner HTML of `config.selector` after removing every
 * descendant that matches one of `config.cleanupSelectors`.
 *
 * Note: the removal mutates the document held by `$`.
 *
 * @param $ - Loaded cheerio instance.
 * @param config - Content selector plus optional cleanup selectors.
 * @returns The cleaned inner HTML, or `''` when the selector matches nothing.
 */
export function extractChapterContent(
  $: any,
  config: ChapterContentStrategy
): string {
  const content = $(config.selector);

  // Remove cleanup selectors
  if (config.cleanupSelectors) {
    for (const selector of config.cleanupSelectors) {
      content.find(selector).remove();
    }
  }

  return content.html() || '';
}

/**
 * Collects tags from several selectors, labelling each with the key it was
 * configured under.
 *
 * @param $ - Loaded cheerio instance.
 * @param config - Map of tag type to selector in `config.selectors`.
 * @returns Entries of the form `"<type>: <trimmed text>"`; elements whose
 *          text is empty after trimming are skipped.
 */
export function extractMultipleTypes(
  $: any,
  config: MultipleTypesStrategy
): string[] {
  const tags: string[] = [];

  for (const [type, selector] of Object.entries(config.selectors)) {
    $(selector).each((_: any, elem: any) => {
      const tag = $(elem).text().trim();
      if (tag) {
        tags.push(`${type}: ${tag}`);
      }
    });
  }

  return tags;
}

/**
 * Reads `config.property` from one JSON-LD payload.
 *
 * The payload may be a single object or an array of objects; the first entry
 * whose `@type` equals `config.schemaType` is used.
 *
 * @returns The property as a string when it is a non-empty string or a finite
 *          number; `undefined` for invalid JSON, no matching entry, or any
 *          other property value.
 */
function readJsonLdProperty(
  raw: string,
  config: SchemaOrgStrategy
): string | undefined {
  let data: any;
  try {
    data = JSON.parse(raw);
  } catch {
    // Invalid JSON, let the caller continue with the next script tag
    return undefined;
  }

  const candidates: any[] = Array.isArray(data) ? data : [data];
  const item = candidates.find(
    (candidate: any) =>
      candidate !== null &&
      typeof candidate === 'object' &&
      candidate['@type'] === config.schemaType
  );
  if (!item) {
    return undefined;
  }

  const value = item[config.property];
  if (typeof value === 'string' && value !== '') {
    return value;
  }
  if (typeof value === 'number' && Number.isFinite(value)) {
    return String(value);
  }
  return undefined;
}

/**
 * Extracts a schema.org property, preferring JSON-LD `<script>` blocks and
 * falling back to the trimmed text of `config.fallbackSelector`.
 *
 * @param $ - Loaded cheerio instance.
 * @param config - Schema type, property name and optional fallback selector.
 * @returns The property value, the fallback text, or `''`.
 */
export function extractSchemaOrg($: any, config: SchemaOrgStrategy): string {
  // Look for JSON-LD first. A plain loop is used on purpose: a `return`
  // inside an `.each()` callback only leaves the callback, so the value
  // would never reach the caller.
  const scripts: any[] = $('script[type="application/ld+json"]').toArray();
  for (const elem of scripts) {
    const value = readJsonLdProperty($(elem).html() || '', config);
    if (value !== undefined) {
      return value;
    }
  }

  // Fallback to selector
  if (config.fallbackSelector) {
    return $(config.fallbackSelector).first().text().trim();
  }

  return '';
}

/**
 * Extracts content from React-rendered markup by concatenating the inner
 * HTML of every `config.paragraphSelector` match that carries
 * `config.contentClass`.
 *
 * This is a simplified version - full React content extraction would require
 * JavaScript execution or API access.
 *
 * @param $ - Loaded cheerio instance.
 * @param config - Paragraph selector and the class marking real content.
 * @returns Matching fragments joined by blank lines, or `''`.
 */
export function extractReactContent(
  $: any,
  config: ReactContentStrategy
): string {
  return $(config.paragraphSelector)
    .toArray()
    .filter((elem: any) => $(elem).hasClass(config.contentClass))
    // `.html()` may yield null; never emit the literal text "null".
    .map((elem: any) => $(elem).html() || '')
    .join('\n\n')
    .trim();
}

/**
 * Basic HTML cleaning: removes scripts, styles and embedded-content elements,
 * then drops empty `<p>` / `<div>` elements.
 *
 * @param html - Raw HTML to clean.
 * @returns The document serialized by `$.html()` after cleaning.
 */
export async function cleanHtml(html: string): Promise<string> {
  const cheerio = await import('cheerio');
  const $ = cheerio.load(html, {
    // Preserve self-closing tags like <br>
    xmlMode: false
  });

  // Remove dangerous elements
  $('script, style, iframe, embed, object').remove();

  // Remove empty paragraphs and divs (but preserve <br> tags)
  $('p:empty, div:empty').not(':has(br)').remove();

  // NOTE: an earlier pass tried to trim whitespace in text nodes by iterating
  // `$('*')`. That selector only yields element nodes, so the pass never
  // changed anything; it was removed rather than "fixed" because trimming
  // text nodes would also eat the spaces between inline elements.

  // Return HTML with proper self-closing tag format
  return $.html() || '';
}

/**
 * Reads an attribute from the first element matching `selector`.
 *
 * @param $ - Loaded cheerio instance.
 * @param selector - CSS selector to look up.
 * @param attribute - Attribute name to read.
 * @returns The attribute value, or `''` when it is missing or empty.
 */
export function extractAttribute(
  $: any,
  selector: string,
  attribute: string
): string {
  const element = $(selector).first();
  return element.attr(attribute) || '';
}