scraping and improvements
This commit is contained in:
164
frontend/src/lib/scraper/strategies/contentCleaner.ts
Normal file
164
frontend/src/lib/scraper/strategies/contentCleaner.ts
Normal file
@@ -0,0 +1,164 @@
|
||||
// Dynamic cheerio import used to avoid client-side bundling issues
|
||||
// Using any type for CheerioAPI to prevent bundling issues
|
||||
import {
|
||||
ChaptersStrategy,
|
||||
ChapterContentStrategy,
|
||||
MultipleTypesStrategy,
|
||||
SchemaOrgStrategy,
|
||||
ReactContentStrategy
|
||||
} from '../types';
|
||||
|
||||
/**
 * Extracts chapter HTML from a loaded document using a chapters strategy.
 *
 * Resolution order:
 *  1. If `config.chaptersWrapper` matches and contains more than one
 *     `config.chapterSelector` element, the inner HTML of every chapter is
 *     concatenated (blank-line separated) and returned trimmed.
 *  2. Otherwise, if `config.singleChapter` matches, its inner HTML is returned.
 *  3. Otherwise, the inner HTML of the first document-wide
 *     `config.chapterSelector` match is returned.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Selectors describing where the chapters live.
 * @returns The extracted HTML, or an empty string when nothing matched.
 */
export function extractChapters(
  $: any,
  config: ChaptersStrategy
): string {
  // Step 1: several chapters nested inside a common wrapper.
  if (config.chaptersWrapper) {
    const wrapper = $(config.chaptersWrapper);
    const nested = wrapper.length > 0 ? wrapper.find(config.chapterSelector) : null;

    if (nested !== null && nested.length > 1) {
      const parts: string[] = [];
      nested.each((_index: any, node: any) => {
        parts.push($(node).html() + '\n\n');
      });
      return parts.join('').trim();
    }
  }

  // Step 2: a dedicated single-chapter container.
  if (config.singleChapter) {
    const lone = $(config.singleChapter);
    if (lone.length > 0) {
      return lone.html() || '';
    }
  }

  // Step 3: first match of the chapter selector anywhere in the document.
  const firstMatch = $(config.chapterSelector).first();
  return firstMatch.html() || '';
}
|
||||
|
||||
/**
 * Returns the inner HTML of the element matched by `config.selector`, after
 * removing any descendants matched by `config.cleanupSelectors`.
 *
 * Note that the removal is performed on the document behind `$`, so the
 * stripped nodes are gone for subsequent queries against the same document.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Content selector plus optional list of selectors to strip.
 * @returns The cleaned inner HTML, or an empty string when nothing matched.
 */
export function extractChapterContent(
  $: any,
  config: ChapterContentStrategy
): string {
  const container = $(config.selector);
  const unwanted = config.cleanupSelectors;

  if (unwanted) {
    // Strip each unwanted descendant before serializing.
    for (const pattern of unwanted) {
      container.find(pattern).remove();
    }
  }

  return container.html() || '';
}
|
||||
|
||||
/**
 * Collects labelled tags from several selectors at once.
 *
 * For each `type -> selector` entry in `config.selectors`, the trimmed text
 * of every matching element is emitted as `"<type>: <text>"`. Elements whose
 * text is empty after trimming are skipped.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Strategy holding the map of tag type to CSS selector.
 * @returns The labelled tags, in the order the selectors and matches appear.
 */
export function extractMultipleTypes(
  $: any,
  config: MultipleTypesStrategy
): string[] {
  const labelled: string[] = [];

  for (const [kind, query] of Object.entries(config.selectors)) {
    $(query).each((_index: any, node: any) => {
      const text = $(node).text().trim();
      if (!text) {
        return;
      }
      labelled.push(`${kind}: ${text}`);
    });
  }

  return labelled;
}
|
||||
|
||||
/**
 * Reads one property from schema.org JSON-LD embedded in the page.
 *
 * Each `<script type="application/ld+json">` block is parsed in document
 * order. A block may hold a single object or an array of objects; the first
 * object whose `@type` equals `config.schemaType` is inspected, and if it has
 * a truthy `config.property` that value is returned. Blocks with invalid JSON
 * are skipped.
 *
 * If no JSON-LD value is found and `config.fallbackSelector` is set, the
 * trimmed text of the first element matching that selector is returned.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Schema type, property name and optional fallback selector.
 * @returns The property value as a string, the fallback text, or `''`.
 */
export function extractSchemaOrg(
  $: any,
  config: SchemaOrgStrategy
): string {
  // Iterate over a plain array rather than `.each()`: a `return` inside an
  // `.each()` callback only leaves the callback, so the JSON-LD value would
  // never reach the caller.
  const scripts: any[] = $('script[type="application/ld+json"]').toArray();

  for (const script of scripts) {
    let data: any;
    try {
      data = JSON.parse($(script).html() || '');
    } catch {
      // Invalid (or empty) JSON: try the next script block.
      continue;
    }

    const candidates: any[] = Array.isArray(data) ? data : [data];
    const item = candidates.find(
      (candidate) => candidate != null && candidate['@type'] === config.schemaType
    );
    const value = item?.[config.property];

    if (!value) {
      continue;
    }
    if (typeof value === 'string') {
      return value;
    }
    if (typeof value === 'number' || typeof value === 'boolean') {
      // Keep the declared `string` return type for primitive values.
      return String(value);
    }
    // Nested objects/arrays cannot be represented as a plain string here;
    // keep looking in later blocks, then fall back to the selector.
  }

  // Fallback to selector
  if (config.fallbackSelector) {
    return $(config.fallbackSelector).first().text().trim();
  }

  return '';
}
|
||||
|
||||
/**
 * Simplified extraction of content from React-rendered markup.
 *
 * Only works on markup that is already present in the loaded HTML; full
 * extraction of client-rendered content would require JavaScript execution
 * or API access.
 *
 * Every element matching `config.paragraphSelector` that also carries the
 * class `config.contentClass` contributes its inner HTML; the pieces are
 * joined with blank lines and the result is trimmed.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param config - Paragraph selector and the class marking real content.
 * @returns The combined HTML, or an empty string when nothing qualified.
 */
export function extractReactContent(
  $: any,
  config: ReactContentStrategy
): string {
  const fragments: string[] = [];

  $(config.paragraphSelector).each((_index: any, node: any) => {
    const candidate = $(node);
    if (!candidate.hasClass(config.contentClass)) {
      return;
    }
    fragments.push(candidate.html() + '\n\n');
  });

  return fragments.join('').trim();
}
|
||||
|
||||
/**
 * Performs basic cleanup of an HTML string.
 *
 * Removes `<script>`, `<style>`, `<iframe>`, `<embed>` and `<object>`
 * elements, then removes `<p>` and `<div>` elements that are `:empty`, and
 * returns the re-serialized document.
 *
 * NOTE(review): this is not a complete sanitizer. Inline event-handler
 * attributes (`onclick`, ...) and `javascript:` URLs are left untouched, so
 * the output should not be treated as safe to render without further
 * sanitization.
 *
 * @param html - Raw HTML to clean.
 * @returns The cleaned HTML as serialized by cheerio, or `''`.
 */
export async function cleanHtml(html: string): Promise<string> {
  // Loaded lazily so cheerio is not pulled into client-side bundles.
  const cheerio = await import('cheerio');
  const $ = cheerio.load(html, {
    // Parse as HTML rather than XML.
    xmlMode: false,
    // NOTE(review): intended to leave entities as written; confirm this
    // option is still honored by the installed cheerio version.
    decodeEntities: false
  });

  // Remove dangerous elements
  $('script, style, iframe, embed, object').remove();

  // Remove empty paragraphs and divs. An `:empty` element has no element
  // children, so one wrapping a <br> is never matched and needs no extra
  // exclusion. This is a single pass: a parent left empty by this removal
  // is not removed in turn.
  $('p:empty, div:empty').remove();

  // The previous per-node whitespace trim iterated `$('*')`, which yields
  // element nodes only, so its `type === 'text'` branch never ran. It has
  // been dropped rather than "fixed": trimming individual text nodes would
  // swallow the spaces between inline elements.

  return $.html() || '';
}
|
||||
|
||||
/**
 * Reads an attribute from the first element matching a selector.
 *
 * @param $ - Cheerio-style query function for the loaded document.
 * @param selector - CSS selector locating the element.
 * @param attribute - Name of the attribute to read.
 * @returns The attribute value, or `''` when the element or attribute is
 *   missing (or the attribute value is empty).
 */
export function extractAttribute(
  $: any,
  selector: string,
  attribute: string
): string {
  const value = $(selector).first().attr(attribute);
  return value ? value : '';
}
|
||||
Reference in New Issue
Block a user