Files
storycove/frontend/src/lib/scraper/strategies/linkExtractor.ts
2025-07-28 13:52:09 +02:00

98 lines
2.5 KiB
TypeScript

// cheerio is loaded via dynamic import to avoid client-side bundling issues,
// so the `$` parameter below is typed as `any` instead of CheerioAPI.
import {
LinkWithPathStrategy,
HrefPatternStrategy,
FirstImageStrategy,
ResponsiveImageStrategy,
LazyLoadedStrategy
} from '../types';
/**
 * Returns the trimmed text of the first anchor whose `href` contains
 * `config.pathContains`.
 *
 * Anchors are looked up under `config.searchWithin` when it is set,
 * otherwise under `body`.
 *
 * @param $ - cheerio-style selector function for the loaded document.
 * @param config - strategy options (`pathContains`, optional `searchWithin`).
 * @returns the matching anchor's trimmed text, or `''` when no anchor matches.
 */
export function extractLinkWithPath(
  $: any,
  config: LinkWithPathStrategy
): string {
  const root = $(config.searchWithin || 'body');
  const anchors = root.find('a');

  let index = 0;
  while (index < anchors.length) {
    const anchor = anchors.eq(index);
    index += 1;

    const target = anchor.attr('href');
    // Anchors without an href (or with an empty one) can never match.
    if (!target) {
      continue;
    }
    if (target.includes(config.pathContains)) {
      return anchor.text().trim();
    }
  }

  return '';
}
/**
 * Collects the `href` of every anchor whose `href` matches `config.pattern`.
 *
 * `config.pattern` is compiled with `new RegExp(...)` without flags, so an
 * invalid pattern throws. Anchors are looked up under `config.searchWithin`
 * when it is set, otherwise under `body`.
 *
 * @param $ - cheerio-style selector function for the loaded document.
 * @param config - strategy options (`pattern`, optional `searchWithin`).
 * @returns matching href values in document order; empty when none match.
 */
export function extractHrefPattern(
  $: any,
  config: HrefPatternStrategy
): string[] {
  const root = $(config.searchWithin || 'body');
  const matcher = new RegExp(config.pattern);
  const matches: string[] = [];

  for (const anchor of root.find('a').toArray()) {
    const target = $(anchor).attr('href');
    // Skip anchors with a missing/empty href or one the pattern rejects.
    if (!target || !matcher.test(target)) {
      continue;
    }
    matches.push(target);
  }

  return matches;
}
/**
 * Reads `config.attribute` from the first `<img>` found under
 * `config.searchWithin` (or under `body` when no scope is configured).
 *
 * @param $ - cheerio-style selector function for the loaded document.
 * @param config - strategy options (`attribute`, optional `searchWithin`).
 * @returns the attribute value, or `''` when it is missing or empty.
 */
export function extractFirstImage(
  $: any,
  config: FirstImageStrategy
): string {
  const root = $(config.searchWithin || 'body');
  const value = root.find('img').first().attr(config.attribute);
  if (value) {
    return value;
  }
  return '';
}
/**
 * Resolves an image URL from the first element matching `config.selector`.
 *
 * When `config.selectLargest` and `config.srcsetAttribute` are both set and
 * the element carries that attribute, the srcset is parsed and the URL of the
 * largest candidate is returned. Otherwise (or when the srcset yields no
 * usable URL) the element's `src` is returned.
 *
 * Candidate ranking:
 * - `Nw` width descriptors rank by `N`.
 * - `Nx` density descriptors rank by `N * 100` (fractional densities such as
 *   `1.5x` are honoured), a heuristic that lets both kinds be compared.
 * - A missing or unparsable descriptor is treated as `1x` (rank 100).
 * - On a tie the later candidate wins.
 *
 * @param $ - cheerio-style selector function for the loaded document.
 * @param config - strategy options (`selector`, `selectLargest`,
 *   `srcsetAttribute`).
 * @returns the chosen image URL, or `''` when none is available.
 */
export function extractResponsiveImage(
  $: any,
  config: ResponsiveImageStrategy
): string {
  const img = $(config.selector).first();
  const fallback: string = img.attr('src') || '';

  if (!config.selectLargest || !config.srcsetAttribute) {
    return fallback;
  }

  const srcset: string | undefined = img.attr(config.srcsetAttribute);
  if (!srcset) {
    return fallback;
  }

  let bestUrl = '';
  let bestRank = -Infinity;

  for (const candidate of srcset.split(',')) {
    // Split on any whitespace run so repeated spaces, tabs or newlines
    // between the URL and its descriptor do not hide the descriptor.
    const [url, descriptor = '1x'] = candidate.trim().split(/\s+/);

    // Empty candidates appear with trailing or doubled commas; ignore them
    // so they can never be selected as the "largest" image.
    if (!url) {
      continue;
    }

    let rank = 100; // default: equivalent to a 1x density descriptor
    if (descriptor.endsWith('w')) {
      const pixels = parseInt(descriptor, 10);
      if (Number.isFinite(pixels)) {
        rank = pixels;
      }
    } else if (descriptor.endsWith('x')) {
      // parseFloat keeps fractional densities (1.5x) distinct from 1x.
      const density = parseFloat(descriptor);
      if (Number.isFinite(density)) {
        rank = density * 100;
      }
    }

    // `>=` preserves the original tie-break: the later candidate wins.
    if (rank >= bestRank) {
      bestRank = rank;
      bestUrl = url;
    }
  }

  return bestUrl || fallback;
}
/**
 * Resolves the URL of a lazy-loaded image from the first element matching
 * `config.selector`.
 *
 * `config.attribute` is read first; when it is missing or empty the
 * element's `src` is used instead.
 *
 * @param $ - cheerio-style selector function for the loaded document.
 * @param config - strategy options (`selector`, `attribute`).
 * @returns the resolved URL, or `''` when neither attribute has a value.
 */
export function extractLazyLoadedImage(
  $: any,
  config: LazyLoadedStrategy
): string {
  const img = $(config.selector).first();

  const lazyValue = img.attr(config.attribute);
  if (lazyValue) {
    return lazyValue;
  }

  const eagerValue = img.attr('src');
  if (eagerValue) {
    return eagerValue;
  }

  return '';
}