163 lines
4.2 KiB
TypeScript
163 lines
4.2 KiB
TypeScript
// Dynamic cheerio import used to avoid client-side bundling issues
|
|
// Using any type for CheerioAPI to prevent bundling issues
|
|
import {
|
|
ChaptersStrategy,
|
|
ChapterContentStrategy,
|
|
MultipleTypesStrategy,
|
|
SchemaOrgStrategy,
|
|
ReactContentStrategy
|
|
} from '../types';
|
|
|
|
/**
 * Pulls chapter markup out of a parsed page.
 *
 * Resolution order:
 * 1. When `config.chaptersWrapper` matches and contains more than one
 *    `config.chapterSelector` element, the HTML of every chapter is
 *    concatenated, separated by blank lines.
 * 2. Otherwise, when `config.singleChapter` matches, its HTML is returned.
 * 3. Otherwise the first `config.chapterSelector` match in the document is used.
 *
 * @param $ - Cheerio-style query function for the loaded document (typed
 *   `any` so cheerio's types are not pulled into the bundle).
 * @param config - Selectors describing where the chapters live.
 * @returns The chapter HTML, or an empty string when nothing matches.
 */
export function extractChapters(
  $: any,
  config: ChaptersStrategy
): string {
  // Multi-chapter pages: only taken when the wrapper holds 2+ chapters.
  if (config.chaptersWrapper) {
    const wrapper = $(config.chaptersWrapper);
    const nested = wrapper.length > 0 ? wrapper.find(config.chapterSelector) : null;

    if (nested && nested.length > 1) {
      const pieces: string[] = [];
      nested.each((_index: any, node: any) => {
        pieces.push($(node).html() + '\n\n');
      });
      return pieces.join('').trim();
    }
  }

  // Dedicated single-chapter selector, if one is configured and present.
  const single = config.singleChapter ? $(config.singleChapter) : null;
  if (single && single.length > 0) {
    const singleMarkup = single.html();
    return singleMarkup ? singleMarkup : '';
  }

  // Last resort: the first chapter element found anywhere in the document.
  const fallbackMarkup = $(config.chapterSelector).first().html();
  return fallbackMarkup ? fallbackMarkup : '';
}
|
|
|
|
/**
 * Returns the HTML of the element matched by `config.selector` after
 * stripping unwanted descendants.
 *
 * Every selector in `config.cleanupSelectors` (when provided) is looked up
 * inside the matched content and the hits are removed from the document
 * before the HTML is read, so the passed-in document is mutated.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Content selector plus optional cleanup selectors.
 * @returns The cleaned HTML, or an empty string when there is none.
 */
export function extractChapterContent(
  $: any,
  config: ChapterContentStrategy
): string {
  const container = $(config.selector);

  const unwantedSelectors = config.cleanupSelectors;
  if (unwantedSelectors) {
    for (const unwanted of unwantedSelectors) {
      container.find(unwanted).remove();
    }
  }

  const markup = container.html();
  return markup ? markup : '';
}
|
|
|
|
/**
 * Collects labelled tags from several selectors.
 *
 * For each `[type, selector]` entry in `config.selectors`, the trimmed text
 * of every matching element is emitted as `"<type>: <text>"`. Elements whose
 * text is empty after trimming are skipped.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Map from tag type to the selector that finds its elements.
 * @returns The labelled tags, in entry order and then match order.
 */
export function extractMultipleTypes(
  $: any,
  config: MultipleTypesStrategy
): string[] {
  const labelled: string[] = [];

  for (const [type, selector] of Object.entries(config.selectors)) {
    $(selector).each((_index: any, node: any) => {
      const text = $(node).text().trim();
      if (!text) {
        return;
      }
      labelled.push(`${type}: ${text}`);
    });
  }

  return labelled;
}
|
|
|
|
/**
 * Reads a schema.org property from the page's JSON-LD, falling back to a
 * plain selector.
 *
 * Every `<script type="application/ld+json">` block is parsed in document
 * order. A block may be a single object or an array of objects; the first
 * object whose `@type` equals `config.schemaType` is inspected, and if its
 * `config.property` is a non-empty string or non-zero number that value is
 * returned. Blocks with invalid JSON or without a usable value are skipped.
 *
 * When no JSON-LD value is found and `config.fallbackSelector` is set, the
 * trimmed text of the first element matching it is returned.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Schema type, property name and optional fallback selector.
 * @returns The property value as a string, or an empty string when neither
 *   source yields one.
 */
export function extractSchemaOrg(
  $: any,
  config: SchemaOrgStrategy
): string {
  let found = '';

  $('script[type="application/ld+json"]').each((_: any, elem: any) => {
    let data: any;
    try {
      data = JSON.parse($(elem).html() || '');
    } catch {
      // Invalid JSON in this block; keep scanning the remaining scripts.
      return true;
    }

    const candidates: any[] = Array.isArray(data) ? data : [data];
    const match = candidates.find(
      (entry: any) => entry != null && entry['@type'] === config.schemaType
    );
    const value = match ? match[config.property] : undefined;

    // Only primitives can be returned faithfully as a string; nested objects
    // would stringify to "[object Object]", so they are skipped.
    if ((typeof value === 'string' || typeof value === 'number') && value) {
      found = String(value);
      // A `return value` here would only leave this callback, not the
      // enclosing function; returning false stops the `.each()` iteration.
      return false;
    }
    return true;
  });

  if (found) {
    return found;
  }

  // Fallback to selector
  if (config.fallbackSelector) {
    return $(config.fallbackSelector).first().text().trim();
  }

  return '';
}
|
|
|
|
/**
 * Gathers the HTML of content paragraphs from a React-rendered page.
 *
 * This is a simplified approach: it only sees markup already present in the
 * loaded document. Full extraction would need JavaScript execution or API
 * access.
 *
 * Elements matching `config.paragraphSelector` are kept only when they carry
 * the `config.contentClass` class; their HTML is concatenated, separated by
 * blank lines.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Paragraph selector and the class marking real content.
 * @returns The combined HTML, trimmed, or an empty string when nothing matches.
 */
export function extractReactContent(
  $: any,
  config: ReactContentStrategy
): string {
  const pieces: string[] = [];

  $(config.paragraphSelector).each((_index: any, node: any) => {
    const $node = $(node);
    if (!$node.hasClass(config.contentClass)) {
      return;
    }
    pieces.push($node.html() + '\n\n');
  });

  return pieces.join('').trim();
}
|
|
|
|
/**
 * Basic HTML cleaning: strips scripts, styles and embedded-content elements,
 * then drops empty paragraphs and divs.
 *
 * cheerio is imported dynamically so it is only loaded when this function
 * runs, which avoids client-side bundling issues.
 *
 * Text nodes are left untouched. An earlier per-node whitespace trim iterated
 * `$('*')`, which yields element nodes only, so its `type === 'text'` branch
 * could never run; it has been removed without changing the output.
 *
 * @param html - Raw HTML markup to clean.
 * @returns The serialized cleaned document, or an empty string if
 *   serialization yields nothing.
 */
export async function cleanHtml(html: string): Promise<string> {
  const cheerio = await import('cheerio');
  const $ = cheerio.load(html, {
    // HTML (not XML) parsing, so void tags like <br> are handled as HTML.
    xmlMode: false
  });

  // Remove dangerous elements
  $('script, style, iframe, embed, object').remove();

  // Remove empty paragraphs and divs (but preserve <br> tags)
  $('p:empty, div:empty').not(':has(br)').remove();

  return $.html() || '';
}
|
|
|
|
/**
 * Reads an attribute from the first element matching a selector.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param selector - Selector locating the element.
 * @param attribute - Name of the attribute to read.
 * @returns The attribute value, or an empty string when the element or the
 *   attribute is missing (or the value is empty).
 */
export function extractAttribute(
  $: any,
  selector: string,
  attribute: string
): string {
  const value = $(selector).first().attr(attribute);
  return value ? value : '';
}