scraping and improvements

This commit is contained in:
Stefan Hardegger
2025-07-28 13:52:09 +02:00
parent f95d7aa8bb
commit fcad028959
31 changed files with 3788 additions and 118 deletions

View File

@@ -0,0 +1,164 @@
// Dynamic cheerio import used to avoid client-side bundling issues
// Using any type for CheerioAPI to prevent bundling issues
import {
ChaptersStrategy,
ChapterContentStrategy,
MultipleTypesStrategy,
SchemaOrgStrategy,
ReactContentStrategy
} from '../types';
/**
 * Pulls chapter HTML out of a loaded document.
 *
 * Resolution order:
 *  1. If `config.chaptersWrapper` matches and contains more than one
 *     `config.chapterSelector` element, the inner HTML of every chapter is
 *     concatenated (blank-line separated) and trimmed.
 *  2. Otherwise, if `config.singleChapter` matches, its inner HTML is used.
 *  3. Otherwise the first `config.chapterSelector` match in the document.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Selectors describing where chapters live.
 * @returns The extracted HTML, or an empty string when nothing matched.
 */
export function extractChapters(
  $: any,
  config: ChaptersStrategy
): string {
  const { chaptersWrapper, chapterSelector, singleChapter } = config;

  // 1. Several chapters grouped under a common wrapper.
  if (chaptersWrapper) {
    const wrapperNodes = $(chaptersWrapper);
    if (wrapperNodes.length > 0) {
      const chapterNodes = wrapperNodes.find(chapterSelector);
      if (chapterNodes.length > 1) {
        const fragments: string[] = [];
        chapterNodes.each((_index: any, node: any) => {
          fragments.push($(node).html() + '\n\n');
        });
        return fragments.join('').trim();
      }
    }
  }

  // 2. A dedicated single-chapter container.
  if (singleChapter) {
    const loneChapter = $(singleChapter);
    if (loneChapter.length > 0) {
      return loneChapter.html() || '';
    }
  }

  // 3. Last resort: the first element matching the chapter selector.
  return $(chapterSelector).first().html() || '';
}
/**
 * Returns the inner HTML of the element(s) matched by `config.selector`,
 * after removing any descendants matched by `config.cleanupSelectors`.
 *
 * Note that the removal mutates the document held by `$`.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Content selector plus optional cleanup selectors.
 * @returns The cleaned inner HTML, or an empty string when nothing matched.
 */
export function extractChapterContent(
  $: any,
  config: ChapterContentStrategy
): string {
  const container = $(config.selector);
  const unwanted = config.cleanupSelectors;

  // Strip unwanted descendants before serialising.
  if (unwanted) {
    for (const cleanupSelector of unwanted) {
      container.find(cleanupSelector).remove();
    }
  }

  const markup = container.html();
  return markup || '';
}
/**
 * Builds a list of `"<type>: <text>"` tags from several selectors.
 *
 * For every `[type, selector]` entry of `config.selectors`, each matched
 * element contributes one tag made from its trimmed text. Elements whose
 * trimmed text is empty are skipped.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Map of tag type to selector.
 * @returns Tags in entry order, then document order within each selector.
 */
export function extractMultipleTypes(
  $: any,
  config: MultipleTypesStrategy
): string[] {
  const labelled: string[] = [];

  for (const [kind, cssSelector] of Object.entries(config.selectors)) {
    $(cssSelector).each((_index: any, node: any) => {
      const label = $(node).text().trim();
      if (!label) {
        return;
      }
      labelled.push(`${kind}: ${label}`);
    });
  }

  return labelled;
}
/**
 * Flattens a parsed JSON-LD payload into the node objects it contains.
 *
 * Accepts a single node, a top-level array of nodes, and nodes that carry an
 * `@graph` array; non-object entries are ignored.
 */
function collectJsonLdNodes(data: unknown): Record<string, unknown>[] {
  const roots: unknown[] = Array.isArray(data) ? data : [data];
  const nodes: Record<string, unknown>[] = [];
  for (const root of roots) {
    if (typeof root !== 'object' || root === null) {
      continue;
    }
    const node = root as Record<string, unknown>;
    nodes.push(node);
    const graph = node['@graph'];
    if (Array.isArray(graph)) {
      for (const entry of graph) {
        if (typeof entry === 'object' && entry !== null) {
          nodes.push(entry as Record<string, unknown>);
        }
      }
    }
  }
  return nodes;
}

/** True when the node's `@type` equals `schemaType` or is an array containing it. */
function jsonLdNodeHasType(node: Record<string, unknown>, schemaType: unknown): boolean {
  const declared = node['@type'];
  return Array.isArray(declared) ? declared.includes(schemaType) : declared === schemaType;
}

/**
 * Reads a schema.org property from the document's JSON-LD, with a selector
 * fallback.
 *
 * Every `<script type="application/ld+json">` is parsed in document order.
 * The first node whose `@type` matches `config.schemaType` and whose
 * `config.property` is a non-empty string (or a number) wins. Scripts with
 * invalid JSON are skipped. If no JSON-LD value is found and
 * `config.fallbackSelector` is set, the trimmed text of its first match is
 * returned.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Schema type, property name and optional fallback selector.
 * @returns The property value as a string, or an empty string when not found.
 */
export function extractSchemaOrg(
  $: any,
  config: SchemaOrgStrategy
): string {
  // A `return` inside the `.each` callback only leaves the callback, so the
  // hit is captured here and the loop is stopped by returning `false`.
  let found = '';

  $('script[type="application/ld+json"]').each((_: any, elem: any): boolean => {
    let parsed: unknown;
    try {
      parsed = JSON.parse($(elem).html() || '');
    } catch {
      // Invalid JSON in this script tag: keep scanning the remaining ones.
      return true;
    }

    for (const node of collectJsonLdNodes(parsed)) {
      if (!jsonLdNodeHasType(node, config.schemaType)) {
        continue;
      }
      const value = node[config.property];
      if (typeof value === 'string' && value !== '') {
        found = value;
        break;
      }
      if (typeof value === 'number') {
        found = String(value);
        break;
      }
    }

    // Returning false stops the iteration once a value has been found.
    return found === '';
  });

  if (found !== '') {
    return found;
  }

  // Fallback to selector
  if (config.fallbackSelector) {
    return $(config.fallbackSelector).first().text().trim();
  }
  return '';
}
/**
 * Collects content paragraphs from statically rendered markup.
 *
 * This is a simplified approach: it only inspects the HTML that is already
 * present. Full extraction of React-rendered content would require
 * JavaScript execution or API access.
 *
 * Elements matched by `config.paragraphSelector` that also carry the class
 * `config.contentClass` have their inner HTML joined with blank lines.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Paragraph selector and the class marking real content.
 * @returns The concatenated, trimmed HTML (empty string when nothing matched).
 */
export function extractReactContent(
  $: any,
  config: ReactContentStrategy
): string {
  const fragments: string[] = [];

  $(config.paragraphSelector).each((_index: any, node: any) => {
    const paragraph = $(node);
    if (!paragraph.hasClass(config.contentClass)) {
      return;
    }
    fragments.push(paragraph.html() + '\n\n');
  });

  return fragments.join('').trim();
}
/**
 * Basic HTML cleaning: parses `html` with cheerio, removes script/style and
 * embedding elements plus empty `<p>`/`<div>` elements, and re-serialises.
 *
 * cheerio is loaded with a dynamic `import()` at call time rather than via a
 * static top-level import.
 *
 * NOTE(review): this is a basic cleanup, not a full sanitiser — inline event
 * handler attributes and `javascript:` URLs are not touched by the code below.
 *
 * @param html - Raw HTML markup to clean.
 * @returns The serialised document after cleanup, or an empty string if
 *          serialisation yields a falsy value.
 */
export async function cleanHtml(html: string): Promise<string> {
// Load cheerio lazily so it is only pulled in when this function runs
const cheerio = await import('cheerio');
const $ = cheerio.load(html, {
// HTML (not XML) parsing mode
xmlMode: false,
// NOTE(review): presumably keeps entities such as &amp; undecoded; whether
// this option is still honoured depends on the installed cheerio version —
// TODO confirm
decodeEntities: false
});
// Remove dangerous elements
$('script, style, iframe, embed, object').remove();
// Remove empty paragraphs and divs.
// NOTE(review): an element matching :empty should have no children, so the
// `.not(':has(br)')` filter looks redundant — confirm before relying on it.
$('p:empty, div:empty').not(':has(br)').remove();
// Intended to trim leading/trailing whitespace in text nodes.
// NOTE(review): `$('*')` appears to select element nodes only, so the
// `elem.type === 'text'` branch likely never runs and this loop is
// effectively a no-op — confirm against cheerio's selector behaviour.
// Making it work as written would also strip spaces next to inline tags.
$('*').each((_, elem) => {
const $elem = $(elem);
if (elem.type === 'text') {
const text = $elem.text();
if (text && text.trim() !== text) {
$elem.replaceWith(text.trim());
}
}
});
// Serialise the whole parsed document.
// NOTE(review): `cheerio.load` presumably parses as a full document, so the
// output may include <html>/<head>/<body> wrappers — TODO confirm.
return $.html() || '';
}
/**
 * Reads an attribute from the first element matching `selector`.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param selector - CSS selector locating the element.
 * @param attribute - Name of the attribute to read.
 * @returns The attribute value, or an empty string when the element or the
 *          attribute is missing (or the value is empty).
 */
export function extractAttribute(
  $: any,
  selector: string,
  attribute: string
): string {
  const value = $(selector).first().attr(attribute);
  return value ? value : '';
}