scraping and improvements
This commit is contained in:
98
frontend/src/lib/scraper/strategies/linkExtractor.ts
Normal file
98
frontend/src/lib/scraper/strategies/linkExtractor.ts
Normal file
@@ -0,0 +1,98 @@
|
||||
// Dynamic cheerio import used to avoid client-side bundling issues
|
||||
// Using any type for CheerioAPI to prevent bundling issues
|
||||
import {
|
||||
LinkWithPathStrategy,
|
||||
HrefPatternStrategy,
|
||||
FirstImageStrategy,
|
||||
ResponsiveImageStrategy,
|
||||
LazyLoadedStrategy
|
||||
} from '../types';
|
||||
|
||||
/**
 * Returns the trimmed text of the first anchor whose `href` contains
 * `config.pathContains`.
 *
 * Anchors are looked up beneath `config.searchWithin` when that selector is
 * set, otherwise beneath `body`. Anchors without an `href` are skipped.
 *
 * @param $ - Cheerio-style query function (typed as `any` on purpose, see the
 *   bundling note at the top of the file).
 * @param config - Strategy settings: `pathContains` and optional `searchWithin`.
 * @returns The trimmed anchor text, or `''` when no anchor matches.
 */
export function extractLinkWithPath(
  $: any,
  config: LinkWithPathStrategy
): string {
  const root = config.searchWithin ? $(config.searchWithin) : $('body');
  const anchors = root.find('a');

  let index = 0;
  while (index < anchors.length) {
    const anchor = anchors.eq(index);
    index += 1;

    const target = anchor.attr('href');
    // Skip anchors with no href or whose href lacks the required fragment.
    if (!target || !target.includes(config.pathContains)) {
      continue;
    }

    return anchor.text().trim();
  }

  return '';
}
|
||||
|
||||
/**
 * Collects the `href` of every anchor whose `href` matches `config.pattern`.
 *
 * Anchors are looked up beneath `config.searchWithin` when that selector is
 * set, otherwise beneath `body`. Results keep the order in which the anchors
 * are visited; anchors without an `href` are ignored.
 *
 * @param $ - Cheerio-style query function (typed as `any` on purpose, see the
 *   bundling note at the top of the file).
 * @param config - Strategy settings: `pattern` (passed to `new RegExp`) and
 *   optional `searchWithin`.
 * @returns The matching `href` values, possibly empty.
 */
export function extractHrefPattern(
  $: any,
  config: HrefPatternStrategy
): string[] {
  const root = config.searchWithin ? $(config.searchWithin) : $('body');
  const matcher = new RegExp(config.pattern);
  const matches: string[] = [];

  root.find('a').each((_index: any, node: any) => {
    const target = $(node).attr('href');
    if (!target) {
      return;
    }
    if (matcher.test(target)) {
      matches.push(target);
    }
  });

  return matches;
}
|
||||
|
||||
/**
 * Reads `config.attribute` from the first `img` found beneath the search
 * scope.
 *
 * The scope is `config.searchWithin` when that selector is set, otherwise
 * `body`.
 *
 * @param $ - Cheerio-style query function (typed as `any` on purpose, see the
 *   bundling note at the top of the file).
 * @param config - Strategy settings: `attribute` and optional `searchWithin`.
 * @returns The attribute value, or `''` when it is missing or empty.
 */
export function extractFirstImage(
  $: any,
  config: FirstImageStrategy
): string {
  const scopeSelector = config.searchWithin ? config.searchWithin : 'body';
  const firstImage = $(scopeSelector).find('img').first();

  const value = firstImage.attr(config.attribute);
  if (value) {
    return value;
  }
  return '';
}
|
||||
|
||||
/**
 * Converts a srcset descriptor into a comparable score.
 *
 * Width descriptors (`800w`) score as their pixel width. Density descriptors
 * (`2x`, `1.5x`) score as density * 100, so a bare URL (implicitly `1x`)
 * scores 100. Missing, unknown, or malformed descriptors also score 100.
 */
function scoreSrcsetDescriptor(descriptor: string): number {
  const defaultScore = 100;
  const match = /^\s*(\d+(?:\.\d+)?|\.\d+)([wx])(?:\s|$)/.exec(descriptor);
  if (!match) {
    return defaultScore;
  }
  const [, digits = '', unit = ''] = match;
  const value = parseFloat(digits);
  if (!Number.isFinite(value)) {
    return defaultScore;
  }
  return unit === 'w' ? value : value * 100;
}

/**
 * Splits a srcset attribute value into `{ url, score }` candidates.
 *
 * Follows the shape of the HTML srcset parsing rules: a URL is a run of
 * non-whitespace characters (so it may contain commas), and its descriptor,
 * if any, runs up to the next comma. Entries with an empty URL are dropped.
 */
function parseSrcsetCandidates(
  srcset: string
): Array<{ url: string; score: number }> {
  const isSpace = (ch: string): boolean =>
    ch === ' ' || ch === '\t' || ch === '\n' || ch === '\r' || ch === '\f';

  const candidates: Array<{ url: string; score: number }> = [];
  const end = srcset.length;
  let pos = 0;

  while (pos < end) {
    // Skip separators (whitespace and stray commas) before the next URL.
    while (pos < end && (isSpace(srcset.charAt(pos)) || srcset.charAt(pos) === ',')) {
      pos += 1;
    }
    if (pos >= end) {
      break;
    }

    // The URL is everything up to the next whitespace character.
    const urlStart = pos;
    while (pos < end && !isSpace(srcset.charAt(pos))) {
      pos += 1;
    }
    let url = srcset.slice(urlStart, pos);
    let descriptor = '';

    if (url.endsWith(',')) {
      // A trailing comma ends the candidate: there is no descriptor.
      url = url.replace(/,+$/, '');
    } else {
      // Otherwise the descriptor runs up to the next comma.
      const descriptorStart = pos;
      while (pos < end && srcset.charAt(pos) !== ',') {
        pos += 1;
      }
      descriptor = srcset.slice(descriptorStart, pos).trim();
    }

    if (url) {
      candidates.push({ url, score: scoreSrcsetDescriptor(descriptor) });
    }
  }

  return candidates;
}

/**
 * Returns the URL of the first element matching `config.selector`.
 *
 * When `config.selectLargest` and `config.srcsetAttribute` are both set and
 * that attribute is non-empty, the highest-scoring srcset candidate is
 * returned (on equal scores the later candidate wins). If the srcset yields
 * no usable candidate, or the srcset path is not enabled, the element's `src`
 * is returned instead.
 *
 * @param $ - Cheerio-style query function (typed as `any` on purpose, see the
 *   bundling note at the top of the file).
 * @param config - Strategy settings: `selector`, `selectLargest`,
 *   `srcsetAttribute`.
 * @returns The chosen image URL, or `''` when none is available.
 */
export function extractResponsiveImage(
  $: any,
  config: ResponsiveImageStrategy
): string {
  const img = $(config.selector).first();

  if (config.selectLargest && config.srcsetAttribute) {
    const srcset = img.attr(config.srcsetAttribute);
    if (srcset) {
      let best: { url: string; score: number } | undefined;
      for (const candidate of parseSrcsetCandidates(srcset)) {
        // `>=` keeps the tie-breaking rule: the later candidate wins.
        if (!best || candidate.score >= best.score) {
          best = candidate;
        }
      }
      if (best) {
        return best.url;
      }
    }
  }

  return img.attr('src') || '';
}
|
||||
|
||||
/**
 * Reads the lazy-load attribute from the first element matching
 * `config.selector`, falling back to `src`.
 *
 * `src` is only consulted when `config.attribute` is missing or empty on the
 * element.
 *
 * @param $ - Cheerio-style query function (typed as `any` on purpose, see the
 *   bundling note at the top of the file).
 * @param config - Strategy settings: `selector` and `attribute`.
 * @returns The attribute value, else the `src` value, else `''`.
 */
export function extractLazyLoadedImage(
  $: any,
  config: LazyLoadedStrategy
): string {
  const element = $(config.selector).first();

  const lazySource = element.attr(config.attribute);
  if (lazySource) {
    return lazySource;
  }

  const eagerSource = element.attr('src');
  if (eagerSource) {
    return eagerSource;
  }

  return '';
}
|
||||
Reference in New Issue
Block a user