scraping and improvements
This commit is contained in:
334
frontend/src/lib/scraper/config/sites.json
Normal file
334
frontend/src/lib/scraper/config/sites.json
Normal file
@@ -0,0 +1,334 @@
|
||||
{
|
||||
"sites": {
|
||||
"deviantart.com": {
|
||||
"story": {
|
||||
"title": "h1",
|
||||
"titleFallback": "meta[property='og:title']",
|
||||
"titleFallbackAttribute": "content",
|
||||
"author": {
|
||||
"strategy": "text-pattern",
|
||||
"pattern": "by ([^\\s]+) on DeviantArt",
|
||||
"searchAfter": "<title>",
|
||||
"searchBefore": "</title>"
|
||||
},
|
||||
"content": {
|
||||
"strategy": "text-blocks",
|
||||
"minLength": 200,
|
||||
"containerHints": ["journal", "literature", "story", "text", "content"],
|
||||
"excludeSelectors": ["script", "style", "nav", "header", "footer", ".dev-page-sidebar"]
|
||||
},
|
||||
"summary": "meta[property='og:description']",
|
||||
"summaryAttribute": "content",
|
||||
"tags": "a[data-tagname]",
|
||||
"tagsAttribute": "data-tagname",
|
||||
"coverImage": "meta[property='og:image']",
|
||||
"coverImageAttribute": "content"
|
||||
},
|
||||
"authorPage": {
|
||||
"storyLinks": "a[data-hook='deviation_link']",
|
||||
"filterStrategy": "dom-check",
|
||||
"requiresChildElement": "div[class*='journal']"
|
||||
}
|
||||
},
|
||||
|
||||
"literotica.com": {
|
||||
"story": {
|
||||
"title": "h1",
|
||||
"titleFallback": "meta[property='og:title']",
|
||||
"titleFallbackAttribute": "content",
|
||||
"author": {
|
||||
"strategy": "link-with-path",
|
||||
"pathContains": "/authors/",
|
||||
"searchWithin": "header, .story-info, #story-meta"
|
||||
},
|
||||
"content": {
|
||||
"strategy": "text-blocks",
|
||||
"minLength": 500,
|
||||
"containerHints": ["story", "content", "text"],
|
||||
"excludeSelectors": ["script", "style", "nav", "header", "footer"]
|
||||
},
|
||||
"summary": "meta[name='description']",
|
||||
"summaryAttribute": "content",
|
||||
"multiPage": {
|
||||
"enabled": true,
|
||||
"strategy": "url-pattern",
|
||||
"pageParam": "page",
|
||||
"maxPages": 20
|
||||
}
|
||||
},
|
||||
"authorPage": {
|
||||
"storyLinks": {
|
||||
"strategy": "href-pattern",
|
||||
"pattern": "/s/[^/]+$",
|
||||
"searchWithin": "main, #content, .stories-list"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"mcstories.com": {
|
||||
"story": {
|
||||
"title": "title",
|
||||
"titleTransform": "remove-suffix: - MCStories.com",
|
||||
"author": "meta[name='dcterms.creator']",
|
||||
"authorAttribute": "content",
|
||||
"content": "article#mcstories",
|
||||
"summary": "meta[name='dcterms.description']",
|
||||
"summaryAttribute": "content"
|
||||
},
|
||||
"authorPage": {
|
||||
"storyLinks": "a[href$='.html']:not([href*='Authors'])",
|
||||
"linkPrefix": "https://mcstories.com/"
|
||||
}
|
||||
},
|
||||
|
||||
"docs-lab.com": {
|
||||
"story": {
|
||||
"title": "title",
|
||||
"titleTransform": "remove-suffix: - Doc's Lab",
|
||||
"author": "a[href*='/profiles/'] strong",
|
||||
"content": {
|
||||
"strategy": "html-between",
|
||||
"startMarker": "<h2>Story</h2>",
|
||||
"endMarker": "</div>",
|
||||
"includeStart": false
|
||||
},
|
||||
"tags": "span.label"
|
||||
},
|
||||
"authorPage": {
|
||||
"storyLinks": "a[href*='/submissions/']",
|
||||
"linkPrefix": "https://docs-lab.com"
|
||||
}
|
||||
},
|
||||
|
||||
"archiveofourown.org": {
|
||||
"story": {
|
||||
"title": "h2.title",
|
||||
"author": "a[rel='author']",
|
||||
"content": {
|
||||
"strategy": "chapters",
|
||||
"chapterSelector": "div.userstuff[role='article']",
|
||||
"chaptersWrapper": "#chapters",
|
||||
"singleChapter": "#workskin"
|
||||
},
|
||||
"summary": "div.summary blockquote.userstuff",
|
||||
"tags": {
|
||||
"strategy": "multiple-types",
|
||||
"selectors": {
|
||||
"fandom": "dd.fandom a.tag",
|
||||
"warning": "dd.warning a.tag",
|
||||
"category": "dd.category a.tag",
|
||||
"relationship": "dd.relationship a.tag",
|
||||
"character": "dd.character a.tag",
|
||||
"freeform": "dd.freeform a.tag"
|
||||
}
|
||||
},
|
||||
"multiPage": {
|
||||
"enabled": true,
|
||||
"strategy": "chapter-navigation",
|
||||
"chapterListSelector": "#chapter_index option",
|
||||
"urlPattern": "/chapters/{chapterId}"
|
||||
}
|
||||
},
|
||||
"authorPage": {
|
||||
"storyLinks": "h4.heading a[href*='/works/']",
|
||||
"pagination": {
|
||||
"enabled": true,
|
||||
"nextPageSelector": "li.next a[rel='next']"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"fanfiction.net": {
|
||||
"story": {
|
||||
"title": "#profile_top b.xcontrast_txt",
|
||||
"author": "#profile_top a[href*='/u/']",
|
||||
"content": "#storytext",
|
||||
"summary": "#profile_top div.xcontrast_txt",
|
||||
"coverImage": {
|
||||
"strategy": "lazy-loaded",
|
||||
"selector": "img.cimage",
|
||||
"attribute": "data-original"
|
||||
},
|
||||
"multiPage": {
|
||||
"enabled": true,
|
||||
"strategy": "chapter-dropdown",
|
||||
"chapterSelector": "select#chap_select option",
|
||||
"urlPattern": "{baseUrl}/{chapterNumber}"
|
||||
}
|
||||
},
|
||||
"authorPage": {
|
||||
"storyLinks": "div.z-list a.stitle",
|
||||
"metadata": {
|
||||
"strategy": "sibling-text",
|
||||
"metadataSelector": "div.z-padtop2",
|
||||
"parsePattern": "Rated: ([^-]+) - .+ - Chapters: (\\d+)"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"royalroad.com": {
|
||||
"story": {
|
||||
"title": "h1[property='name']",
|
||||
"author": "h4[property='author'] a",
|
||||
"content": {
|
||||
"strategy": "chapter-content",
|
||||
"selector": "div.chapter-content",
|
||||
"cleanupSelectors": [".portlet", ".ads-holder", "div[style*='display:none']"]
|
||||
},
|
||||
"summary": "div.description div.hidden-content",
|
||||
"tags": "span.tags a.fiction-tag",
|
||||
"coverImage": "img.thumbnail",
|
||||
"coverImageAttribute": "src",
|
||||
"multiPage": {
|
||||
"enabled": true,
|
||||
"strategy": "table-of-contents",
|
||||
"tocSelector": "table#chapters tbody tr a[href*='/chapter/']",
|
||||
"requiresAuth": false
|
||||
}
|
||||
},
|
||||
"authorPage": {
|
||||
"storyLinks": "div.fiction-list-item h2.fiction-title a",
|
||||
"additionalInfo": {
|
||||
"strategy": "data-attributes",
|
||||
"statsSelector": "div.stats",
|
||||
"extractStats": ["pages", "followers", "views"]
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"wattpad.com": {
|
||||
"story": {
|
||||
"title": "h1",
|
||||
"author": {
|
||||
"strategy": "schema-org",
|
||||
"schemaType": "Person",
|
||||
"property": "name",
|
||||
"fallbackSelector": "a[href*='/user/']"
|
||||
},
|
||||
"content": {
|
||||
"strategy": "react-content",
|
||||
"contentClass": "pre-wrap",
|
||||
"paragraphSelector": "p[data-p-id]",
|
||||
"requiresJavaScript": true
|
||||
},
|
||||
"summary": "h2.description",
|
||||
"tags": "div.tag-items a.tag",
|
||||
"coverImage": {
|
||||
"strategy": "responsive-image",
|
||||
"selector": "img[alt*='cover']",
|
||||
"srcsetAttribute": "srcset",
|
||||
"selectLargest": true
|
||||
},
|
||||
"multiPage": {
|
||||
"enabled": true,
|
||||
"strategy": "api-based",
|
||||
"apiPattern": "/v4/parts/{partId}/text",
|
||||
"tocApiPattern": "/v5/stories/{storyId}/parts",
|
||||
"requiresAuth": true
|
||||
}
|
||||
},
|
||||
"authorPage": {
|
||||
"storyLinks": {
|
||||
"strategy": "infinite-scroll",
|
||||
"initialSelector": "a[href*='/story/']",
|
||||
"apiEndpoint": "/v4/users/{userId}/stories",
|
||||
"requiresJavaScript": true
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
"strategies": {
|
||||
"text-blocks": {
|
||||
"description": "Find content by looking for large text blocks",
|
||||
"implementation": "Find all text nodes, group by parent, select parent with most text"
|
||||
},
|
||||
"link-with-path": {
|
||||
"description": "Find links containing specific path patterns",
|
||||
"implementation": "querySelector with href*= or iterate and check .href property"
|
||||
},
|
||||
"href-pattern": {
|
||||
"description": "Match links by regex pattern",
|
||||
"implementation": "Array.from(links).filter(a => pattern.test(a.href))"
|
||||
},
|
||||
"text-pattern": {
|
||||
"description": "Extract text using regex from raw HTML",
|
||||
"implementation": "Use regex on .html() with proper groups"
|
||||
},
|
||||
"html-between": {
|
||||
"description": "Extract HTML between markers",
|
||||
"implementation": "indexOf() to find positions, substring to extract"
|
||||
},
|
||||
"chapters": {
|
||||
"description": "Extract story content that may be in chapters",
|
||||
"implementation": "Check for multiple chapters or single chapter format"
|
||||
},
|
||||
"multiple-types": {
|
||||
"description": "Extract different categories of tags",
|
||||
"implementation": "Map over selector types and extract each category"
|
||||
},
|
||||
"chapter-navigation": {
|
||||
"description": "Navigate through chapters using chapter index",
|
||||
"implementation": "Extract chapter IDs and construct URLs"
|
||||
},
|
||||
"lazy-loaded": {
|
||||
"description": "Extract images that are lazy-loaded",
|
||||
"implementation": "Check data-* attributes for actual image source"
|
||||
},
|
||||
"chapter-dropdown": {
|
||||
"description": "Handle stories with chapter selection dropdown",
|
||||
"implementation": "Parse dropdown options and construct chapter URLs"
|
||||
},
|
||||
"table-of-contents": {
|
||||
"description": "Extract chapters from a table of contents",
|
||||
"implementation": "Find all chapter links in TOC structure"
|
||||
},
|
||||
"schema-org": {
|
||||
"description": "Extract data from schema.org structured data",
|
||||
"implementation": "Parse JSON-LD or microdata for specific properties"
|
||||
},
|
||||
"react-content": {
|
||||
"description": "Extract content from React-rendered pages",
|
||||
"implementation": "May require JavaScript execution or API access"
|
||||
},
|
||||
"responsive-image": {
|
||||
"description": "Select best quality from responsive images",
|
||||
"implementation": "Parse srcset and select highest resolution"
|
||||
},
|
||||
"api-based": {
|
||||
"description": "Use API endpoints instead of HTML scraping",
|
||||
"implementation": "Detect API patterns and make direct API calls"
|
||||
},
|
||||
"infinite-scroll": {
|
||||
"description": "Handle pages with infinite scroll",
|
||||
"implementation": "Detect scroll API endpoints or pagination"
|
||||
}
|
||||
},
|
||||
|
||||
"globalOptions": {
|
||||
"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
||||
"timeout": 30000,
|
||||
"retryAttempts": 3,
|
||||
"rateLimitMs": 1000,
|
||||
"cacheDuration": 300000,
|
||||
"javascriptTimeout": 10000
|
||||
},
|
||||
|
||||
"siteNotes": {
|
||||
"wattpad.com": {
|
||||
"warning": "Wattpad has aggressive anti-scraping measures. Consider using their API if available.",
|
||||
"requiresAuth": "Some stories may require login to access full content"
|
||||
},
|
||||
"royalroad.com": {
|
||||
"note": "Very scraper-friendly with good HTML structure"
|
||||
},
|
||||
"archiveofourown.org": {
|
||||
"note": "Respects robots.txt, has good semantic HTML",
|
||||
"rateLimit": "Be extra respectful of rate limits"
|
||||
},
|
||||
"fanfiction.net": {
|
||||
"note": "Older site with simpler HTML structure",
|
||||
"warning": "Known to block IPs for aggressive scraping"
|
||||
}
|
||||
}
|
||||
}
|
||||
379
frontend/src/lib/scraper/scraper.ts
Normal file
379
frontend/src/lib/scraper/scraper.ts
Normal file
@@ -0,0 +1,379 @@
|
||||
import 'server-only';
|
||||
|
||||
// Note: cheerio import is done dynamically to avoid client-side bundling issues
|
||||
// Using any type for CheerioAPI to prevent bundling issues
|
||||
import {
|
||||
SitesConfig,
|
||||
SiteConfig,
|
||||
ScrapedStory,
|
||||
ScrapedAuthorStory,
|
||||
SelectorStrategy,
|
||||
MultiPageConfig,
|
||||
ScraperError
|
||||
} from './types';
|
||||
import { RateLimiter } from './utils/rateLimit';
|
||||
import { ScraperCache } from './utils/cache';
|
||||
import { UrlParser } from './utils/urlParser';
|
||||
import {
|
||||
extractByTextPattern,
|
||||
extractTextBlocks,
|
||||
extractHtmlBetween,
|
||||
extractLinkText,
|
||||
extractLinkWithPath,
|
||||
extractHrefPattern,
|
||||
extractFirstImage,
|
||||
extractResponsiveImage,
|
||||
extractLazyLoadedImage,
|
||||
extractChapters,
|
||||
extractChapterContent,
|
||||
extractMultipleTypes,
|
||||
extractSchemaOrg,
|
||||
extractReactContent,
|
||||
cleanHtml,
|
||||
extractAttribute
|
||||
} from './strategies';
|
||||
import sitesConfig from './config/sites.json';
|
||||
|
||||
export class StoryScraper {
|
||||
private config: SitesConfig;
|
||||
private cache: ScraperCache;
|
||||
private rateLimiter: RateLimiter;
|
||||
|
||||
constructor() {
|
||||
this.config = sitesConfig as SitesConfig;
|
||||
this.cache = new ScraperCache(this.config.globalOptions.cacheDuration);
|
||||
this.rateLimiter = new RateLimiter(this.config.globalOptions.rateLimitMs);
|
||||
}
|
||||
|
||||
async scrapeStory(url: string): Promise<ScrapedStory> {
|
||||
try {
|
||||
if (!UrlParser.validateUrl(url)) {
|
||||
throw new Error(`Invalid URL: ${url}`);
|
||||
}
|
||||
|
||||
const domain = UrlParser.getDomain(url);
|
||||
const siteConfig = this.config.sites[domain];
|
||||
|
||||
if (!siteConfig) {
|
||||
throw new Error(`Unsupported site: ${domain}`);
|
||||
}
|
||||
|
||||
const html = await this.fetchWithCache(url);
|
||||
const cheerio = await import('cheerio');
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const story: ScrapedStory = {
|
||||
title: await this.extractFieldWithFallback($, siteConfig.story, 'title', html),
|
||||
author: await this.extractFieldWithFallback($, siteConfig.story, 'author', html),
|
||||
content: await this.extractContent($, siteConfig.story, url, html),
|
||||
sourceUrl: url
|
||||
};
|
||||
|
||||
// Extract optional fields
|
||||
if (siteConfig.story.summary) {
|
||||
story.summary = await this.extractField($, siteConfig.story.summary, html, siteConfig.story.summaryAttribute);
|
||||
}
|
||||
|
||||
if (siteConfig.story.coverImage) {
|
||||
story.coverImage = await this.extractField($, siteConfig.story.coverImage, html, siteConfig.story.coverImageAttribute);
|
||||
}
|
||||
|
||||
if (siteConfig.story.tags) {
|
||||
const tagsResult = await this.extractTags($, siteConfig.story.tags, html, siteConfig.story.tagsAttribute);
|
||||
if (Array.isArray(tagsResult)) {
|
||||
story.tags = tagsResult;
|
||||
} else if (typeof tagsResult === 'string' && tagsResult) {
|
||||
story.tags = [tagsResult];
|
||||
}
|
||||
}
|
||||
|
||||
// Apply post-processing
|
||||
story.title = this.applyTransforms(story.title, siteConfig.story.titleTransform);
|
||||
story.content = await cleanHtml(story.content);
|
||||
|
||||
return story;
|
||||
} catch (error) {
|
||||
if (error instanceof Error) {
|
||||
throw new ScraperError(
|
||||
`Failed to scrape ${url}: ${error.message}`,
|
||||
url,
|
||||
error
|
||||
);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
async scrapeAuthorPage(url: string): Promise<ScrapedAuthorStory[]> {
|
||||
try {
|
||||
if (!UrlParser.validateUrl(url)) {
|
||||
throw new Error(`Invalid URL: ${url}`);
|
||||
}
|
||||
|
||||
const domain = UrlParser.getDomain(url);
|
||||
const siteConfig = this.config.sites[domain];
|
||||
|
||||
if (!siteConfig || !siteConfig.authorPage) {
|
||||
throw new Error(`Author page scraping not supported for: ${domain}`);
|
||||
}
|
||||
|
||||
const html = await this.fetchWithCache(url);
|
||||
const cheerio = await import('cheerio');
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const storyLinks = await this.extractField($, siteConfig.authorPage.storyLinks, html);
|
||||
const stories: ScrapedAuthorStory[] = [];
|
||||
|
||||
if (Array.isArray(storyLinks)) {
|
||||
for (const link of storyLinks) {
|
||||
const storyUrl = UrlParser.normalizeUrl(link, url);
|
||||
try {
|
||||
const scrapedStory = await this.scrapeStory(storyUrl);
|
||||
stories.push({
|
||||
url: storyUrl,
|
||||
title: scrapedStory.title,
|
||||
author: scrapedStory.author,
|
||||
summary: scrapedStory.summary
|
||||
});
|
||||
} catch (error) {
|
||||
console.warn(`Failed to scrape story ${storyUrl}:`, error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return stories;
|
||||
} catch (error) {
|
||||
if (error instanceof Error) {
|
||||
throw new ScraperError(
|
||||
`Failed to scrape author page ${url}: ${error.message}`,
|
||||
url,
|
||||
error
|
||||
);
|
||||
}
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async extractFieldWithFallback(
|
||||
$: any,
|
||||
config: any,
|
||||
fieldName: string,
|
||||
html: string
|
||||
): Promise<string> {
|
||||
const primarySelector = config[fieldName];
|
||||
const fallbackSelector = config[`${fieldName}Fallback`];
|
||||
const attribute = config[`${fieldName}Attribute`];
|
||||
const fallbackAttribute = config[`${fieldName}FallbackAttribute`];
|
||||
|
||||
// Try primary selector first
|
||||
if (primarySelector) {
|
||||
const result = await this.extractField($, primarySelector, html, attribute);
|
||||
if (result && result.trim()) {
|
||||
return result.trim();
|
||||
}
|
||||
}
|
||||
|
||||
// Try fallback selector if primary failed
|
||||
if (fallbackSelector) {
|
||||
const result = await this.extractField($, fallbackSelector, html, fallbackAttribute);
|
||||
if (result && result.trim()) {
|
||||
return result.trim();
|
||||
}
|
||||
}
|
||||
|
||||
return '';
|
||||
}
|
||||
|
||||
private async extractField(
|
||||
$: any,
|
||||
selector: string | SelectorStrategy,
|
||||
html: string,
|
||||
attribute?: string
|
||||
): Promise<any> {
|
||||
if (typeof selector === 'string') {
|
||||
// Simple CSS selector - always return single value (first element)
|
||||
const element = $(selector).first();
|
||||
if (attribute) {
|
||||
// Extract specific attribute instead of text
|
||||
return element.attr(attribute) || '';
|
||||
}
|
||||
return element.text().trim();
|
||||
}
|
||||
|
||||
// Strategy-based extraction
|
||||
return await this.executeStrategy($, selector, html);
|
||||
}
|
||||
|
||||
private async extractTags(
|
||||
$: any,
|
||||
selector: string | SelectorStrategy,
|
||||
html: string,
|
||||
attribute?: string
|
||||
): Promise<any> {
|
||||
if (typeof selector === 'string') {
|
||||
// Simple CSS selector - collect ALL matching elements for tags
|
||||
const elements = $(selector);
|
||||
|
||||
if (elements.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const results: string[] = [];
|
||||
elements.each((_: any, elem: any) => {
|
||||
const $elem = $(elem);
|
||||
const value = attribute ? $elem.attr(attribute) : $elem.text().trim();
|
||||
if (value) {
|
||||
results.push(value);
|
||||
}
|
||||
});
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
// Strategy-based extraction for tags
|
||||
return await this.executeStrategy($, selector, html);
|
||||
}
|
||||
|
||||
private async executeStrategy(
|
||||
$: any,
|
||||
strategy: SelectorStrategy,
|
||||
html: string
|
||||
): Promise<any> {
|
||||
switch (strategy.strategy) {
|
||||
case 'text-pattern':
|
||||
return extractByTextPattern(html, strategy as any);
|
||||
case 'link-with-path':
|
||||
return extractLinkWithPath($, strategy as any);
|
||||
case 'text-blocks':
|
||||
return extractTextBlocks($, strategy as any);
|
||||
case 'href-pattern':
|
||||
return extractHrefPattern($, strategy as any);
|
||||
case 'html-between':
|
||||
return extractHtmlBetween(html, strategy as any);
|
||||
case 'link-text':
|
||||
return extractLinkText($, strategy as any);
|
||||
case 'first-image':
|
||||
return extractFirstImage($, strategy as any);
|
||||
case 'responsive-image':
|
||||
return extractResponsiveImage($, strategy as any);
|
||||
case 'lazy-loaded':
|
||||
return extractLazyLoadedImage($, strategy as any);
|
||||
case 'chapters':
|
||||
return extractChapters($, strategy as any);
|
||||
case 'chapter-content':
|
||||
return extractChapterContent($, strategy as any);
|
||||
case 'multiple-types':
|
||||
return extractMultipleTypes($, strategy as any);
|
||||
case 'schema-org':
|
||||
return extractSchemaOrg($, strategy as any);
|
||||
case 'react-content':
|
||||
return extractReactContent($, strategy as any);
|
||||
default:
|
||||
throw new Error(`Unknown strategy: ${strategy.strategy}`);
|
||||
}
|
||||
}
|
||||
|
||||
private async extractContent(
|
||||
$: any,
|
||||
storyConfig: any,
|
||||
url: string,
|
||||
html: string
|
||||
): Promise<string> {
|
||||
let content = await this.extractField($, storyConfig.content, html);
|
||||
|
||||
if (storyConfig.multiPage?.enabled) {
|
||||
const additionalPages = await this.fetchAdditionalPages(
|
||||
$,
|
||||
url,
|
||||
storyConfig.multiPage
|
||||
);
|
||||
|
||||
for (const pageHtml of additionalPages) {
|
||||
const cheerioPage = await import('cheerio');
|
||||
const $page = cheerioPage.load(pageHtml);
|
||||
const pageContent = await this.extractField(
|
||||
$page,
|
||||
storyConfig.content,
|
||||
pageHtml
|
||||
);
|
||||
content += '\n\n' + pageContent;
|
||||
}
|
||||
}
|
||||
|
||||
return content;
|
||||
}
|
||||
|
||||
private async fetchAdditionalPages(
|
||||
$: any,
|
||||
baseUrl: string,
|
||||
config: MultiPageConfig
|
||||
): Promise<string[]> {
|
||||
const pages: string[] = [];
|
||||
let currentUrl = baseUrl;
|
||||
let pageNum = 2;
|
||||
|
||||
while (pageNum <= (config.maxPages || 20)) {
|
||||
let nextUrl: string | null = null;
|
||||
|
||||
if (config.strategy === 'url-pattern') {
|
||||
nextUrl = UrlParser.buildPageUrl(baseUrl, pageNum, config);
|
||||
} else if (config.nextPageSelector) {
|
||||
const nextLink = $(config.nextPageSelector).attr('href');
|
||||
if (nextLink) {
|
||||
nextUrl = UrlParser.normalizeUrl(nextLink, currentUrl);
|
||||
}
|
||||
}
|
||||
|
||||
if (!nextUrl) break;
|
||||
|
||||
try {
|
||||
await this.rateLimiter.throttle();
|
||||
const html = await this.fetchWithCache(nextUrl);
|
||||
pages.push(html);
|
||||
currentUrl = nextUrl;
|
||||
pageNum++;
|
||||
} catch (error) {
|
||||
console.error(`Failed to fetch page ${pageNum}:`, error);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return pages;
|
||||
}
|
||||
|
||||
private async fetchWithCache(url: string): Promise<string> {
|
||||
const cached = this.cache.get(url);
|
||||
if (cached) {
|
||||
return cached;
|
||||
}
|
||||
|
||||
await this.rateLimiter.throttle();
|
||||
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
'User-Agent': this.config.globalOptions.userAgent,
|
||||
},
|
||||
signal: AbortSignal.timeout(this.config.globalOptions.timeout)
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
this.cache.set(url, html);
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
private applyTransforms(text: string, transform?: string): string {
|
||||
if (!transform) return text;
|
||||
|
||||
if (transform.startsWith('remove-suffix:')) {
|
||||
const suffix = transform.substring('remove-suffix:'.length).trim();
|
||||
return text.replace(new RegExp(`${suffix}$`, 'i'), '').trim();
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
}
|
||||
164
frontend/src/lib/scraper/strategies/contentCleaner.ts
Normal file
164
frontend/src/lib/scraper/strategies/contentCleaner.ts
Normal file
@@ -0,0 +1,164 @@
|
||||
// Dynamic cheerio import used to avoid client-side bundling issues
|
||||
// Using any type for CheerioAPI to prevent bundling issues
|
||||
import {
|
||||
ChaptersStrategy,
|
||||
ChapterContentStrategy,
|
||||
MultipleTypesStrategy,
|
||||
SchemaOrgStrategy,
|
||||
ReactContentStrategy
|
||||
} from '../types';
|
||||
|
||||
export function extractChapters(
|
||||
$: any,
|
||||
config: ChaptersStrategy
|
||||
): string {
|
||||
// Check for multiple chapters first
|
||||
if (config.chaptersWrapper) {
|
||||
const chaptersWrapper = $(config.chaptersWrapper);
|
||||
if (chaptersWrapper.length > 0) {
|
||||
const chapters = chaptersWrapper.find(config.chapterSelector);
|
||||
if (chapters.length > 1) {
|
||||
// Multiple chapters - combine them
|
||||
let content = '';
|
||||
chapters.each((_: any, elem: any) => {
|
||||
content += $(elem).html() + '\n\n';
|
||||
});
|
||||
return content.trim();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Single chapter fallback
|
||||
if (config.singleChapter) {
|
||||
const singleChapter = $(config.singleChapter);
|
||||
if (singleChapter.length > 0) {
|
||||
return singleChapter.html() || '';
|
||||
}
|
||||
}
|
||||
|
||||
// Direct chapter selector fallback
|
||||
const chapter = $(config.chapterSelector).first();
|
||||
return chapter.html() || '';
|
||||
}
|
||||
|
||||
export function extractChapterContent(
|
||||
$: any,
|
||||
config: ChapterContentStrategy
|
||||
): string {
|
||||
const content = $(config.selector);
|
||||
|
||||
// Remove cleanup selectors
|
||||
if (config.cleanupSelectors) {
|
||||
config.cleanupSelectors.forEach(selector => {
|
||||
content.find(selector).remove();
|
||||
});
|
||||
}
|
||||
|
||||
return content.html() || '';
|
||||
}
|
||||
|
||||
export function extractMultipleTypes(
|
||||
$: any,
|
||||
config: MultipleTypesStrategy
|
||||
): string[] {
|
||||
const tags: string[] = [];
|
||||
|
||||
Object.entries(config.selectors).forEach(([type, selector]) => {
|
||||
$(selector).each((_: any, elem: any) => {
|
||||
const tag = $(elem).text().trim();
|
||||
if (tag) {
|
||||
tags.push(`${type}: ${tag}`);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
return tags;
|
||||
}
|
||||
|
||||
export function extractSchemaOrg(
|
||||
$: any,
|
||||
config: SchemaOrgStrategy
|
||||
): string {
|
||||
// Look for JSON-LD first
|
||||
$('script[type="application/ld+json"]').each((_: any, elem: any) => {
|
||||
try {
|
||||
const data = JSON.parse($(elem).html() || '');
|
||||
if (data['@type'] === config.schemaType ||
|
||||
(Array.isArray(data) && data.some(item => item['@type'] === config.schemaType))) {
|
||||
const item = Array.isArray(data) ?
|
||||
data.find(item => item['@type'] === config.schemaType) : data;
|
||||
if (item && item[config.property]) {
|
||||
return item[config.property];
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
// Invalid JSON, continue
|
||||
}
|
||||
});
|
||||
|
||||
// Fallback to selector
|
||||
if (config.fallbackSelector) {
|
||||
return $(config.fallbackSelector).first().text().trim();
|
||||
}
|
||||
|
||||
return '';
|
||||
}
|
||||
|
||||
export function extractReactContent(
|
||||
$: any,
|
||||
config: ReactContentStrategy
|
||||
): string {
|
||||
// This is a simplified version - full React content extraction
|
||||
// would require JavaScript execution or API access
|
||||
|
||||
const contentElements = $(config.paragraphSelector);
|
||||
let content = '';
|
||||
|
||||
contentElements.each((_: any, elem: any) => {
|
||||
const $elem = $(elem);
|
||||
if ($elem.hasClass(config.contentClass)) {
|
||||
content += $elem.html() + '\n\n';
|
||||
}
|
||||
});
|
||||
|
||||
return content.trim();
|
||||
}
|
||||
|
||||
export async function cleanHtml(html: string): Promise<string> {
|
||||
// Basic HTML cleaning - remove scripts, styles, and dangerous elements
|
||||
const cheerio = await import('cheerio');
|
||||
const $ = cheerio.load(html, {
|
||||
// Preserve self-closing tags like <br>
|
||||
xmlMode: false,
|
||||
decodeEntities: false
|
||||
});
|
||||
|
||||
// Remove dangerous elements
|
||||
$('script, style, iframe, embed, object').remove();
|
||||
|
||||
// Remove empty paragraphs and divs (but preserve <br> tags)
|
||||
$('p:empty, div:empty').not(':has(br)').remove();
|
||||
|
||||
// Clean up excessive whitespace in text nodes only, preserve <br> tags
|
||||
$('*').each((_, elem) => {
|
||||
const $elem = $(elem);
|
||||
if (elem.type === 'text') {
|
||||
const text = $elem.text();
|
||||
if (text && text.trim() !== text) {
|
||||
$elem.replaceWith(text.trim());
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Return HTML with proper self-closing tag format
|
||||
return $.html() || '';
|
||||
}
|
||||
|
||||
export function extractAttribute(
|
||||
$: any,
|
||||
selector: string,
|
||||
attribute: string
|
||||
): string {
|
||||
const element = $(selector).first();
|
||||
return element.attr(attribute) || '';
|
||||
}
|
||||
3
frontend/src/lib/scraper/strategies/index.ts
Normal file
3
frontend/src/lib/scraper/strategies/index.ts
Normal file
@@ -0,0 +1,3 @@
|
||||
export * from './textExtractor';
|
||||
export * from './linkExtractor';
|
||||
export * from './contentCleaner';
|
||||
98
frontend/src/lib/scraper/strategies/linkExtractor.ts
Normal file
98
frontend/src/lib/scraper/strategies/linkExtractor.ts
Normal file
@@ -0,0 +1,98 @@
|
||||
// Dynamic cheerio import used to avoid client-side bundling issues
|
||||
// Using any type for CheerioAPI to prevent bundling issues
|
||||
import {
|
||||
LinkWithPathStrategy,
|
||||
HrefPatternStrategy,
|
||||
FirstImageStrategy,
|
||||
ResponsiveImageStrategy,
|
||||
LazyLoadedStrategy
|
||||
} from '../types';
|
||||
|
||||
export function extractLinkWithPath(
|
||||
$: any,
|
||||
config: LinkWithPathStrategy
|
||||
): string {
|
||||
let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');
|
||||
|
||||
const links = searchScope.find('a');
|
||||
|
||||
for (let i = 0; i < links.length; i++) {
|
||||
const link = links.eq(i);
|
||||
const href = link.attr('href');
|
||||
|
||||
if (href && href.includes(config.pathContains)) {
|
||||
return link.text().trim();
|
||||
}
|
||||
}
|
||||
|
||||
return '';
|
||||
}
|
||||
|
||||
export function extractHrefPattern(
|
||||
$: any,
|
||||
config: HrefPatternStrategy
|
||||
): string[] {
|
||||
let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');
|
||||
|
||||
const pattern = new RegExp(config.pattern);
|
||||
const links: string[] = [];
|
||||
|
||||
searchScope.find('a').each((_: any, elem: any) => {
|
||||
const href = $(elem).attr('href');
|
||||
if (href && pattern.test(href)) {
|
||||
links.push(href);
|
||||
}
|
||||
});
|
||||
|
||||
return links;
|
||||
}
|
||||
|
||||
export function extractFirstImage(
|
||||
$: any,
|
||||
config: FirstImageStrategy
|
||||
): string {
|
||||
let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');
|
||||
|
||||
const img = searchScope.find('img').first();
|
||||
return img.attr(config.attribute) || '';
|
||||
}
|
||||
|
||||
export function extractResponsiveImage(
|
||||
$: any,
|
||||
config: ResponsiveImageStrategy
|
||||
): string {
|
||||
const img = $(config.selector).first();
|
||||
|
||||
if (config.selectLargest && config.srcsetAttribute) {
|
||||
const srcset = img.attr(config.srcsetAttribute);
|
||||
if (srcset) {
|
||||
// Parse srcset and return the largest image
|
||||
const sources = srcset.split(',').map((src: string) => {
|
||||
const parts = src.trim().split(' ');
|
||||
const url = parts[0];
|
||||
const descriptor = parts[1] || '1x';
|
||||
const width = descriptor.includes('w') ?
|
||||
parseInt(descriptor.replace('w', '')) :
|
||||
descriptor.includes('x') ?
|
||||
parseInt(descriptor.replace('x', '')) * 100 : 100;
|
||||
return { url, width };
|
||||
});
|
||||
|
||||
const largest = sources.reduce((prev: any, current: any) =>
|
||||
prev.width > current.width ? prev : current
|
||||
);
|
||||
|
||||
return largest.url;
|
||||
}
|
||||
}
|
||||
|
||||
return img.attr('src') || '';
|
||||
}
|
||||
|
||||
export function extractLazyLoadedImage(
|
||||
$: any,
|
||||
config: LazyLoadedStrategy
|
||||
): string {
|
||||
const img = $(config.selector).first();
|
||||
return img.attr(config.attribute) || img.attr('src') || '';
|
||||
}
|
||||
144
frontend/src/lib/scraper/strategies/textExtractor.ts
Normal file
144
frontend/src/lib/scraper/strategies/textExtractor.ts
Normal file
@@ -0,0 +1,144 @@
|
||||
import 'server-only';
|
||||
|
||||
// Dynamic cheerio import used to avoid client-side bundling issues
|
||||
// Using any type for CheerioAPI to prevent bundling issues
|
||||
import {
|
||||
TextPatternStrategy,
|
||||
TextBlockStrategy,
|
||||
HtmlBetweenStrategy,
|
||||
LinkTextStrategy
|
||||
} from '../types';
|
||||
|
||||
export function extractByTextPattern(
|
||||
html: string,
|
||||
config: TextPatternStrategy
|
||||
): string {
|
||||
let searchContent = html;
|
||||
|
||||
// Limit search scope if specified
|
||||
if (config.searchAfter) {
|
||||
const afterIndex = html.indexOf(config.searchAfter);
|
||||
if (afterIndex !== -1) {
|
||||
searchContent = html.substring(afterIndex);
|
||||
}
|
||||
}
|
||||
|
||||
if (config.searchBefore) {
|
||||
const beforeIndex = searchContent.indexOf(config.searchBefore);
|
||||
if (beforeIndex !== -1) {
|
||||
searchContent = searchContent.substring(0, beforeIndex);
|
||||
}
|
||||
}
|
||||
|
||||
const regex = new RegExp(config.pattern, 'i');
|
||||
const match = searchContent.match(regex);
|
||||
return match ? match[config.group || 1].trim() : '';
|
||||
}
|
||||
|
||||
export function extractTextBlocks(
|
||||
$: any,
|
||||
config: TextBlockStrategy
|
||||
): string {
|
||||
const blocks: Array<{element: any, text: string}> = [];
|
||||
|
||||
// Remove excluded elements first
|
||||
if (config.excludeSelectors) {
|
||||
config.excludeSelectors.forEach(selector => {
|
||||
$(selector).remove();
|
||||
});
|
||||
}
|
||||
|
||||
$('*').each((_: any, elem: any) => {
|
||||
const $elem = $(elem);
|
||||
const text = $elem.clone().children().remove().end().text().trim();
|
||||
|
||||
if (text.length >= (config.minLength || 500)) {
|
||||
blocks.push({ element: elem, text });
|
||||
}
|
||||
});
|
||||
|
||||
// Find the block that likely contains story content
|
||||
const storyBlock = blocks.find(block => {
|
||||
if (config.containerHints && config.containerHints.length > 0) {
|
||||
const hasHints = config.containerHints.some(hint =>
|
||||
$(block.element).attr('class')?.includes(hint) ||
|
||||
$(block.element).attr('id')?.includes(hint)
|
||||
);
|
||||
return hasHints;
|
||||
}
|
||||
return blocks.length === 1;
|
||||
});
|
||||
|
||||
if (storyBlock) {
|
||||
return $(storyBlock.element).html() || '';
|
||||
}
|
||||
|
||||
// Fallback to largest block
|
||||
const largestBlock = blocks.reduce((prev, current) =>
|
||||
prev.text.length > current.text.length ? prev : current
|
||||
);
|
||||
|
||||
return largestBlock ? $(largestBlock.element).html() || '' : '';
|
||||
}
|
||||
|
||||
export function extractHtmlBetween(
|
||||
html: string,
|
||||
config: HtmlBetweenStrategy
|
||||
): string {
|
||||
const startIndex = html.indexOf(config.startMarker);
|
||||
if (startIndex === -1) return '';
|
||||
|
||||
const contentStart = config.includeStart ?
|
||||
startIndex :
|
||||
startIndex + config.startMarker.length;
|
||||
|
||||
const endIndex = html.indexOf(config.endMarker, contentStart);
|
||||
if (endIndex === -1) {
|
||||
return html.substring(contentStart);
|
||||
}
|
||||
|
||||
return html.substring(contentStart, endIndex).trim();
|
||||
}
|
||||
|
||||
/**
 * Finds the text of a link located "near" any of the phrases in
 * `config.nearText`, searching within `config.searchWithin` (or <body>).
 *
 * For each phrase (in order) it scans every element in scope; on the first
 * element whose text contains the phrase it tries, in order:
 *   1. a link inside the element,
 *   2. the element itself being a link,
 *   3. a link within the next three siblings.
 *
 * Returns the first link text found, or '' when none matches.
 *
 * NOTE(review): `return false` breaks cheerio's `.each` loops, but the outer
 * `forEach` cannot be broken — it relies on the `if (foundText) return` guard
 * to skip remaining phrases. The sibling lookup uses `.first().each(...)`,
 * which visits at most one element.
 */
export function extractLinkText(
  $: any,
  config: LinkTextStrategy
): string {
  let searchScope = config.searchWithin ? $(config.searchWithin) : $('body');

  // Look for links near the specified text patterns
  let foundText = '';

  config.nearText.forEach(text => {
    if (foundText) return; // Already found

    searchScope.find('*').each((_: any, elem: any) => {
      const $elem = $(elem);
      // Case-insensitive containment check against the element's full text
      // (includes descendant text, so ancestors of the phrase also match).
      const elemText = $elem.text().toLowerCase();

      if (elemText.includes(text.toLowerCase())) {
        // Look for nearby links
        const $link = $elem.find('a').first();
        if ($link.length) {
          foundText = $link.text().trim();
          return false; // Break out of each
        }

        // Check if the element itself is a link
        if ($elem.is('a')) {
          foundText = $elem.text().trim();
          return false;
        }

        // Look for links in the next few siblings
        const $siblings = $elem.nextAll().slice(0, 3);
        $siblings.find('a').first().each((_: any, link: any) => {
          foundText = $(link).text().trim();
          return false;
        });
      }
    });
  });

  return foundText;
}
|
||||
248
frontend/src/lib/scraper/types.ts
Normal file
248
frontend/src/lib/scraper/types.ts
Normal file
@@ -0,0 +1,248 @@
|
||||
/** Per-site scraping configuration: selectors for story pages and author listing pages. */
export interface SiteConfig {
  story: StorySelectors;
  authorPage: AuthorPageSelectors;
}

/**
 * Selectors for extracting a single story page. Each main field is either a
 * CSS selector string or a named extraction strategy (see SelectorStrategy).
 */
export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  // Present when stories span multiple pages/chapters.
  multiPage?: MultiPageConfig;
  // Fallback selector tried when `title` matches nothing, and the attribute
  // to read from it (e.g. meta[property='og:title'] + "content").
  titleFallback?: string;
  titleFallbackAttribute?: string;
  // Named transform applied to the extracted title — TODO confirm accepted values.
  titleTransform?: string;
  // Attribute names to read instead of element text for the selectors above.
  summaryAttribute?: string;
  coverImageAttribute?: string;
  tagsAttribute?: string;
}

/** Selectors for harvesting story links (and optional metadata) from an author's page. */
export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: PaginationConfig;
  // Prefix prepended to scraped hrefs — presumably for relative links; verify against caller.
  linkPrefix?: string;
  // Named filtering strategy (e.g. "dom-check" in sites.json) for pruning non-story links.
  filterStrategy?: string;
  // With filterStrategy "dom-check": only keep links containing this child element.
  requiresChildElement?: string;
  requiresNavigation?: NavigationConfig;
  metadata?: MetadataConfig;
  additionalInfo?: AdditionalInfoConfig;
}

/**
 * Discriminated extraction strategy: `strategy` names the algorithm and the
 * remaining keys are strategy-specific options (see the *Strategy interfaces
 * later in this file for the known shapes).
 */
export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}
|
||||
|
||||
/** Controls fetching stories that span multiple pages or chapters. */
export interface MultiPageConfig {
  enabled: boolean;
  // How additional pages are discovered; each value selects a fetch strategy.
  strategy: 'url-pattern' | 'next-link' | 'chapter-navigation' | 'chapter-dropdown' | 'table-of-contents' | 'api-based';
  nextPageSelector?: string;
  // Query parameter carrying the page number (see UrlParser.buildPageUrl).
  pageParam?: string;
  maxPages?: number;
  chapterListSelector?: string;
  chapterSelector?: string;
  // URL template with a {page} placeholder (see UrlParser.buildPageUrl).
  urlPattern?: string;
  tocSelector?: string;
  // api-based strategy options — endpoint patterns and whether auth is needed.
  requiresAuth?: boolean;
  apiPattern?: string;
  tocApiPattern?: string;
}

/** Pagination of an author's story listing. */
export interface PaginationConfig {
  enabled: boolean;
  nextPageSelector: string;
}

/** Click-and-wait step needed before an author page exposes its content. */
export interface NavigationConfig {
  enabled: boolean;
  // Visible text of the element to click.
  clickText: string;
  // How long to wait after clicking, in milliseconds.
  waitMs: number;
}

/** How to pull structured metadata off an author listing entry. */
export interface MetadataConfig {
  strategy: string;
  metadataSelector: string;
  // Regex/pattern used to parse the selected text — TODO confirm format.
  parsePattern: string;
}

/** Extra per-story stats scraped from an author listing entry. */
export interface AdditionalInfoConfig {
  strategy: string;
  statsSelector: string;
  // Names of the stats to extract (site-specific keys).
  extractStats: string[];
}
|
||||
|
||||
/** A fully scraped story, ready to be imported. */
export interface ScrapedStory {
  title: string;
  author: string;
  // Story body (HTML or text depending on the extraction strategy used).
  content: string;
  summary?: string;
  coverImage?: string;
  tags?: string[];
  // The URL the story was scraped from.
  sourceUrl: string;
}

/** A story entry discovered on an author's listing page (not yet scraped). */
export interface ScrapedAuthorStory {
  url: string;
  title: string;
  author: string;
  summary?: string;
}

/** Top-level shape of the sites.json configuration file. */
export interface SitesConfig {
  // Keyed by bare domain (no "www." — see UrlParser.getDomain).
  sites: Record<string, SiteConfig>;
  // Human-readable strategy documentation, keyed by strategy name.
  strategies: Record<string, StrategyDescription>;
  globalOptions: GlobalOptions;
  siteNotes?: Record<string, SiteNotes>;
}

/** Human-readable description of an extraction strategy (documentation only). */
export interface StrategyDescription {
  description: string;
  implementation: string;
}

/** Options applied to every site: HTTP behavior, retries, rate limiting, caching. */
export interface GlobalOptions {
  userAgent: string;
  // Request timeout in milliseconds — TODO confirm unit against the fetcher.
  timeout: number;
  retryAttempts: number;
  // Minimum delay between requests, in milliseconds (see RateLimiter).
  rateLimitMs: number;
  // Cache TTL in milliseconds (see ScraperCache).
  cacheDuration?: number;
  javascriptTimeout?: number;
}

/** Free-form operator notes about a site: warnings, rate limits, auth caveats. */
export interface SiteNotes {
  warning?: string;
  note?: string;
  rateLimit?: string;
  requiresAuth?: string;
}
|
||||
|
||||
// Strategy-specific interfaces
// Each narrows SelectorStrategy to one named algorithm plus its options.

/** Regex extraction over raw HTML (see extractByTextPattern). */
export interface TextPatternStrategy extends SelectorStrategy {
  strategy: 'text-pattern';
  pattern: string;
  // Capture group to return; defaults to 1.
  group?: number;
  // Optional literal markers narrowing the searched region.
  searchAfter?: string;
  searchBefore?: string;
}

/** Pick a link whose href contains a path fragment (e.g. "/authors/"). */
export interface LinkWithPathStrategy extends SelectorStrategy {
  strategy: 'link-with-path';
  pathContains: string;
  searchWithin?: string;
}

/** Heuristic large-text-block content extraction (see extractTextBlocks). */
export interface TextBlockStrategy extends SelectorStrategy {
  strategy: 'text-blocks';
  // Minimum own-text length for a block to qualify; defaults to 500.
  minLength?: number;
  // Class/id substrings that mark likely content containers.
  containerHints?: string[];
  // Elements removed before scanning (scripts, nav, sidebars, ...).
  excludeSelectors?: string[];
}

/** Pick links whose href matches a regex pattern. */
export interface HrefPatternStrategy extends SelectorStrategy {
  strategy: 'href-pattern';
  pattern: string;
  searchWithin?: string;
}

/** Slice HTML between two literal markers (see extractHtmlBetween). */
export interface HtmlBetweenStrategy extends SelectorStrategy {
  strategy: 'html-between';
  startMarker: string;
  endMarker: string;
  // Keep the start marker in the returned slice.
  includeStart?: boolean;
}

/** Collect story content from chapter elements. */
export interface ChaptersStrategy extends SelectorStrategy {
  strategy: 'chapters';
  chapterSelector: string;
  chaptersWrapper?: string;
  singleChapter?: string;
}

/** Try several selectors keyed by content type — TODO confirm key semantics. */
export interface MultipleTypesStrategy extends SelectorStrategy {
  strategy: 'multiple-types';
  selectors: Record<string, string>;
}

/** Find a link near given text phrases (see extractLinkText). */
export interface LinkTextStrategy extends SelectorStrategy {
  strategy: 'link-text';
  nearText: string[];
  searchWithin?: string;
}

/** First <img> within a scope; read `attribute` (see extractFirstImage). */
export interface FirstImageStrategy extends SelectorStrategy {
  strategy: 'first-image';
  searchWithin: string;
  attribute: string;
}

/** Read a property from schema.org structured data, with a CSS fallback. */
export interface SchemaOrgStrategy extends SelectorStrategy {
  strategy: 'schema-org';
  schemaType: string;
  property: string;
  fallbackSelector?: string;
}

/** Content rendered client-side by React; requires a JS-capable fetcher. */
export interface ReactContentStrategy extends SelectorStrategy {
  strategy: 'react-content';
  contentClass: string;
  paragraphSelector: string;
  requiresJavaScript: boolean;
}

/** Pick the largest candidate from a srcset (see extractResponsiveImage). */
export interface ResponsiveImageStrategy extends SelectorStrategy {
  strategy: 'responsive-image';
  selector: string;
  srcsetAttribute: string;
  selectLargest: boolean;
}

/** Lazy-loaded image: read `attribute` (e.g. data-src), falling back to src. */
export interface LazyLoadedStrategy extends SelectorStrategy {
  strategy: 'lazy-loaded';
  selector: string;
  attribute: string;
}

/** Chapter body extraction with optional cleanup of unwanted elements. */
export interface ChapterContentStrategy extends SelectorStrategy {
  strategy: 'chapter-content';
  selector: string;
  cleanupSelectors?: string[];
}

/** Read stats exposed as data attributes on a stats element. */
export interface DataAttributesStrategy extends SelectorStrategy {
  strategy: 'data-attributes';
  statsSelector: string;
  extractStats: string[];
}

/** Parse metadata out of text adjacent to a selected element. */
export interface SiblingTextStrategy extends SelectorStrategy {
  strategy: 'sibling-text';
  metadataSelector: string;
  parsePattern: string;
}

/** Fetch content through the site's API instead of scraping HTML. */
export interface ApiBasedStrategy extends SelectorStrategy {
  strategy: 'api-based';
  apiPattern: string;
  tocApiPattern?: string;
  requiresAuth: boolean;
}

/** Listing that loads more entries via an API as the user scrolls. */
export interface InfiniteScrollStrategy extends SelectorStrategy {
  strategy: 'infinite-scroll';
  initialSelector: string;
  apiEndpoint: string;
  requiresJavaScript: boolean;
}
|
||||
|
||||
/**
 * Error thrown by the scraper. Carries the URL that failed and, when the
 * failure wraps a lower-level exception, the original error for diagnostics.
 */
export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    // Make the error identifiable by name in logs and catch blocks.
    this.name = 'ScraperError';
  }
}
|
||||
35
frontend/src/lib/scraper/utils/cache.ts
Normal file
35
frontend/src/lib/scraper/utils/cache.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
export class ScraperCache {
|
||||
private cache: Map<string, { data: any; timestamp: number }> = new Map();
|
||||
private ttl: number;
|
||||
|
||||
constructor(ttlMs: number = 300000) { // 5 minutes default
|
||||
this.ttl = ttlMs;
|
||||
}
|
||||
|
||||
get(key: string): any | null {
|
||||
const entry = this.cache.get(key);
|
||||
if (!entry) return null;
|
||||
|
||||
if (Date.now() - entry.timestamp > this.ttl) {
|
||||
this.cache.delete(key);
|
||||
return null;
|
||||
}
|
||||
|
||||
return entry.data;
|
||||
}
|
||||
|
||||
set(key: string, data: any): void {
|
||||
this.cache.set(key, {
|
||||
data,
|
||||
timestamp: Date.now()
|
||||
});
|
||||
}
|
||||
|
||||
clear(): void {
|
||||
this.cache.clear();
|
||||
}
|
||||
|
||||
size(): number {
|
||||
return this.cache.size;
|
||||
}
|
||||
}
|
||||
23
frontend/src/lib/scraper/utils/rateLimit.ts
Normal file
23
frontend/src/lib/scraper/utils/rateLimit.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
export class RateLimiter {
|
||||
private lastRequest: number = 0;
|
||||
private minDelay: number;
|
||||
|
||||
constructor(minDelayMs: number = 1000) {
|
||||
this.minDelay = minDelayMs;
|
||||
}
|
||||
|
||||
async throttle(): Promise<void> {
|
||||
const now = Date.now();
|
||||
const timeSinceLastRequest = now - this.lastRequest;
|
||||
|
||||
if (timeSinceLastRequest < this.minDelay) {
|
||||
await this.delay(this.minDelay - timeSinceLastRequest);
|
||||
}
|
||||
|
||||
this.lastRequest = Date.now();
|
||||
}
|
||||
|
||||
private delay(ms: number): Promise<void> {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
}
|
||||
61
frontend/src/lib/scraper/utils/urlParser.ts
Normal file
61
frontend/src/lib/scraper/utils/urlParser.ts
Normal file
@@ -0,0 +1,61 @@
|
||||
export class UrlParser {
|
||||
static getDomain(url: string): string {
|
||||
try {
|
||||
const urlObj = new URL(url);
|
||||
return urlObj.hostname.replace(/^www\./, '');
|
||||
} catch (error) {
|
||||
throw new Error(`Invalid URL: ${url}`);
|
||||
}
|
||||
}
|
||||
|
||||
static validateUrl(url: string): boolean {
|
||||
try {
|
||||
const urlObj = new URL(url);
|
||||
return urlObj.protocol === 'http:' || urlObj.protocol === 'https:';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static buildPageUrl(baseUrl: string, pageNum: number, config: any): string {
|
||||
try {
|
||||
const urlObj = new URL(baseUrl);
|
||||
if (config.pageParam) {
|
||||
urlObj.searchParams.set(config.pageParam, pageNum.toString());
|
||||
} else if (config.urlPattern) {
|
||||
// Replace {page} or similar patterns in URL
|
||||
return config.urlPattern.replace(/\{page\}/g, pageNum.toString());
|
||||
}
|
||||
return urlObj.toString();
|
||||
} catch (error) {
|
||||
throw new Error(`Failed to build page URL: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
static normalizeUrl(url: string, baseUrl?: string): string {
|
||||
try {
|
||||
if (url.startsWith('http://') || url.startsWith('https://')) {
|
||||
return url;
|
||||
}
|
||||
|
||||
if (baseUrl) {
|
||||
return new URL(url, baseUrl).toString();
|
||||
}
|
||||
|
||||
return url;
|
||||
} catch (error) {
|
||||
throw new Error(`Failed to normalize URL: ${url}`);
|
||||
}
|
||||
}
|
||||
|
||||
static extractDomainConfig(url: string, sitesConfig: any): any {
|
||||
const domain = this.getDomain(url);
|
||||
const config = sitesConfig.sites[domain];
|
||||
|
||||
if (!config) {
|
||||
throw new Error(`Unsupported site: ${domain}`);
|
||||
}
|
||||
|
||||
return config;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user