scraping and improvements

This commit is contained in:
Stefan Hardegger
2025-07-28 13:52:09 +02:00
parent f95d7aa8bb
commit fcad028959
31 changed files with 3788 additions and 118 deletions

storycove-scraper-spec.md Normal file

@@ -0,0 +1,474 @@
# StoryCove Web Scraper Feature Specification
## Overview
The Web Scraper feature allows users to import stories from external websites into StoryCove by providing URLs. The scraper extracts story metadata and content using configurable selectors defined in a JSON configuration file.
## Feature Requirements
### Core Functionality
1. **Single Story Import**: Users can provide a story URL and the scraper will extract:
   - Title (required)
   - Author (required)
   - Content (required)
   - Summary (optional)
   - Cover Image (optional)
   - Tags (optional)
2. **Author Page Scanning**: Users can provide an author page URL to:
   - Discover all stories by that author
   - Present a selectable list of stories
   - Allow bulk import of selected stories
3. **Multi-page Story Support**: Handle stories split across multiple pages by:
   - Detecting pagination
   - Fetching all pages
   - Merging content in the correct order
### User Interface Flow
1. **Add Story View Enhancement**:
   ```
   [Manual Entry] | [Import from URL]

   When "Import from URL" is selected:
   - URL input field
   - "Fetch" button
   - Loading indicator during fetch
   - Pre-filled form with scraped data
   - Ability to edit before saving
   ```
2. **Bulk Import View** (future enhancement):
   ```
   - URL input for author page
   - "Scan for Stories" button
   - Checkbox list of discovered stories
   - "Import Selected" button
   - Progress indicator
   ```
## Technical Implementation
### Architecture
```
/lib/scraper/
├── config/
│   └── sites.json        # Site configurations
├── scraper.ts            # Main scraper class
├── strategies/           # Strategy implementations
│   ├── index.ts
│   ├── textExtractor.ts
│   ├── linkExtractor.ts
│   └── contentCleaner.ts
├── utils/
│   ├── rateLimit.ts
│   ├── cache.ts
│   └── urlParser.ts
└── types.ts              # TypeScript definitions
```
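Each strategy lives in its own module and is re-exported through `strategies/index.ts`. A minimal barrel file, assuming the function names used later in this spec (the `contentCleaner` export name is a guess), could be:
```typescript
// /lib/scraper/strategies/index.ts (sketch)
export { extractByTextPattern, extractTextBlocks } from './textExtractor';
export { extractLinkWithPath } from './linkExtractor';
export { cleanContent } from './contentCleaner'; // assumed export name
```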
### API Routes
```typescript
// /app/api/scrape/story/route.ts
POST /api/scrape/story
Body: { url: string }
Response: {
  title: string,
  author: string,
  content: string,
  summary?: string,
  coverImage?: string,
  tags?: string[]
}

// /app/api/scrape/author/route.ts
POST /api/scrape/author
Body: { url: string }
Response: {
  stories: Array<{
    url: string,
    title: string,
    author: string,
    summary?: string
  }>
}
```
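For reference, the Add Story view's "Fetch" button could call the story endpoint as sketched below. The two callbacks standing in for UI state (loading indicator, pre-filled form) are illustrative, not part of the contract:
```typescript
// Illustrative client-side call to POST /api/scrape/story
async function fetchStoryFromUrl(
  url: string,
  onLoading: (loading: boolean) => void,               // e.g. toggle the loading indicator
  onScraped: (story: Record<string, unknown>) => void  // e.g. pre-fill the edit form
): Promise<void> {
  onLoading(true);
  try {
    const res = await fetch('/api/scrape/story', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ url })
    });
    if (!res.ok) throw new Error(`Scrape failed: HTTP ${res.status}`);
    onScraped(await res.json());
  } finally {
    onLoading(false);
  }
}
```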
### Core Classes
```typescript
// /lib/scraper/types.ts
export interface SiteConfig {
  story: StorySelectors;
  authorPage: AuthorPageSelectors;
}

export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  multiPage?: MultiPageConfig;
}

export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: {
    enabled: boolean;
    nextPageSelector: string;
  };
}

export interface MultiPageConfig {
  enabled: boolean;
  strategy: 'url-pattern' | 'next-link';
  nextPageSelector?: string;
  pageParam?: string;
  maxPages?: number;
}

export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}

export interface ScrapedStory {
  title: string;
  author: string;
  content: string;
  summary?: string;
  coverImage?: string;
  tags?: string[];
  sourceUrl: string;
}
```
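A plain string is treated as a CSS selector; an object selects a named strategy, with any additional keys passed through as strategy-specific options. For example (values illustrative):
```json
{
  "author": {
    "strategy": "text-pattern",
    "pattern": "by\\s+([A-Za-z0-9_-]+)",
    "group": 1
  }
}
```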
### Main Scraper Implementation
```typescript
// /lib/scraper/scraper.ts
import * as cheerio from 'cheerio';
import { SiteConfig, StorySelectors, SelectorStrategy, ScrapedStory } from './types';
import { extractByTextPattern, extractTextBlocks, extractLinkWithPath } from './strategies';
import { RateLimiter } from './utils/rateLimit';
import sitesConfig from './config/sites.json';

export class StoryScraper {
  private config: Record<string, SiteConfig>;
  private cache: Map<string, string>;
  private rateLimiter: RateLimiter;

  constructor() {
    this.config = sitesConfig.sites as Record<string, SiteConfig>;
    this.cache = new Map();
    this.rateLimiter = new RateLimiter();
  }

  async scrapeStory(url: string): Promise<ScrapedStory> {
    const domain = this.getDomain(url);
    const siteConfig = this.config[domain];
    if (!siteConfig) {
      throw new Error(`Unsupported site: ${domain}`);
    }

    const html = await this.fetchWithCache(url);
    const $ = cheerio.load(html);

    const story: ScrapedStory = {
      title: await this.extractField($, siteConfig.story.title, html),
      author: await this.extractField($, siteConfig.story.author, html),
      content: await this.extractContent($, siteConfig.story, url),
      sourceUrl: url
    };

    // Extract optional fields
    if (siteConfig.story.summary) {
      story.summary = await this.extractField($, siteConfig.story.summary, html);
    }

    return story;
  }

  private async extractField(
    $: cheerio.CheerioAPI,
    selector: string | SelectorStrategy,
    html: string
  ): Promise<string> {
    if (typeof selector === 'string') {
      // Simple CSS selector
      return $(selector).first().text().trim();
    }
    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }

  private async executeStrategy(
    $: cheerio.CheerioAPI,
    strategy: SelectorStrategy,
    html: string
  ): Promise<string> {
    // Strategy objects carry strategy-specific options (pattern, hints, ...)
    switch (strategy.strategy) {
      case 'text-pattern':
        return extractByTextPattern(html, strategy as any);
      case 'link-with-path':
        return extractLinkWithPath($, strategy as any);
      case 'text-blocks':
        return extractTextBlocks($, strategy as any);
      // ... other strategies
      default:
        throw new Error(`Unknown extraction strategy: ${strategy.strategy}`);
    }
  }
}
```
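`scrapeStory` relies on two helpers, `getDomain` and `fetchWithCache`, that this spec does not define. A minimal sketch, assuming the user agent and timeout come from `globalOptions` in the configuration file, might be:
```typescript
// /lib/scraper/scraper.ts (hypothetical helper sketch)
private getDomain(url: string): string {
  // Normalize to a bare hostname so lookups match config keys like "domain.com"
  return new URL(url).hostname.replace(/^www\./, '');
}

private async fetchWithCache(url: string): Promise<string> {
  const cached = this.cache.get(url);
  if (cached) return cached;

  // Throttle every real network hit; cached responses skip the wait
  await this.rateLimiter.throttle();
  const response = await fetch(url, {
    headers: { 'User-Agent': 'Mozilla/5.0...' }, // globalOptions.userAgent
    signal: AbortSignal.timeout(30_000)          // globalOptions.timeout
  });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} fetching ${url}`);
  }
  const html = await response.text();
  this.cache.set(url, html);
  return html;
}
```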
### Strategy Implementations
```typescript
// /lib/scraper/strategies/textExtractor.ts
import * as cheerio from 'cheerio';

interface TextPatternStrategy {
  pattern: string;
  group?: number;
}

interface TextBlockStrategy {
  minLength?: number;
  containerHints?: string[];
}

export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  const regex = new RegExp(config.pattern, 'i');
  const match = html.match(regex);
  // Default to capture group 1; ?? keeps an explicit group 0 (whole match) usable
  return match ? (match[config.group ?? 1] ?? '').trim() : '';
}

export function extractTextBlocks(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  const blocks: Array<{ element: any; text: string }> = [];

  $('*').each((_, elem) => {
    const $elem = $(elem);
    // Direct text only: clone, strip children, measure what remains
    const text = $elem.clone().children().remove().end().text().trim();
    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });

  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    const hasHints = config.containerHints?.some(hint =>
      $(block.element).attr('class')?.includes(hint) ||
      $(block.element).attr('id')?.includes(hint)
    );
    return hasHints || blocks.length === 1;
  });

  return storyBlock ? $(storyBlock.element).html() || '' : '';
}
```
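`executeStrategy` also dispatches to a `link-with-path` strategy that is never shown. A plausible sketch, where the `pathContains` and `attribute` options are assumptions, would live in `linkExtractor.ts`:
```typescript
// /lib/scraper/strategies/linkExtractor.ts (hypothetical sketch)
import * as cheerio from 'cheerio';

interface LinkWithPathStrategy {
  pathContains: string;        // assumed option: substring the href must contain
  attribute?: 'text' | 'href'; // assumed option: what to return from the match
}

export function extractLinkWithPath(
  $: cheerio.CheerioAPI,
  config: LinkWithPathStrategy
): string {
  // First anchor whose href contains the fragment,
  // e.g. pathContains "/author/" to read an author name from a profile link
  const link = $('a[href]')
    .filter((_, el) => ($(el).attr('href') || '').includes(config.pathContains))
    .first();
  if (!link.length) return '';
  return config.attribute === 'href'
    ? link.attr('href') || ''
    : link.text().trim();
}
```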
### Rate Limiting
```typescript
// /lib/scraper/utils/rateLimit.ts
export class RateLimiter {
  private lastRequest: number = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  async throttle(): Promise<void> {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequest;
    if (timeSinceLastRequest < this.minDelay) {
      await this.delay(this.minDelay - timeSinceLastRequest);
    }
    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
```
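Usage is a single `await` before each outbound request:
```typescript
// Enforce at least one second between fetches made through this limiter
const limiter = new RateLimiter(1000);

async function politeFetch(url: string): Promise<string> {
  await limiter.throttle(); // waits out the remainder of the window if needed
  const response = await fetch(url);
  return response.text();
}
```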
### Multi-page Story Handling
```typescript
// /lib/scraper/scraper.ts (addition)
private async extractContent(
  $: cheerio.CheerioAPI,
  storyConfig: StorySelectors,
  url: string
): Promise<string> {
  let content = await this.extractField($, storyConfig.content, $.html());

  if (storyConfig.multiPage?.enabled) {
    const additionalPages = await this.fetchAdditionalPages(
      $,
      url,
      storyConfig.multiPage
    );
    for (const pageHtml of additionalPages) {
      const $page = cheerio.load(pageHtml);
      const pageContent = await this.extractField(
        $page,
        storyConfig.content,
        pageHtml
      );
      content += '\n\n' + pageContent;
    }
  }

  return content;
}

private async fetchAdditionalPages(
  $: cheerio.CheerioAPI,
  baseUrl: string,
  config: MultiPageConfig
): Promise<string[]> {
  const pages: string[] = [];
  let $current = $; // DOM of the most recently fetched page
  let currentUrl = baseUrl;
  let pageNum = 2;

  while (pageNum <= (config.maxPages || 20)) {
    let nextUrl: string | null = null;

    if (config.strategy === 'url-pattern') {
      nextUrl = this.buildPageUrl(baseUrl, pageNum, config);
    } else if (config.nextPageSelector) {
      // Follow the "next" link from the page just fetched, not the first page
      const nextLink = $current(config.nextPageSelector).attr('href');
      if (nextLink) {
        nextUrl = new URL(nextLink, currentUrl).href;
      }
    }

    if (!nextUrl) break;

    try {
      await this.rateLimiter.throttle();
      const html = await this.fetchWithCache(nextUrl);
      pages.push(html);
      $current = cheerio.load(html);
      currentUrl = nextUrl;
      pageNum++;
    } catch (error) {
      console.error(`Failed to fetch page ${pageNum}:`, error);
      break;
    }
  }

  return pages;
}
```
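`buildPageUrl` is referenced above but not specified. Assuming the `pageParam` option from the configuration file (e.g. `"page"`), a minimal sketch could be:
```typescript
// /lib/scraper/scraper.ts (hypothetical helper sketch)
private buildPageUrl(
  baseUrl: string,
  pageNum: number,
  config: MultiPageConfig
): string {
  // Set or overwrite the page query parameter, e.g. ?page=2
  const url = new URL(baseUrl);
  url.searchParams.set(config.pageParam || 'page', String(pageNum));
  return url.href;
}
```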
### Error Handling
```typescript
// /lib/scraper/scraper.ts (addition)
async scrapeStory(url: string): Promise<ScrapedStory> {
  try {
    // ... existing implementation
  } catch (error) {
    if (error instanceof Error) {
      throw new ScraperError(
        `Failed to scrape ${url}: ${error.message}`,
        url,
        error
      );
    }
    throw error;
  }
}

export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}
```
## Configuration File Structure
```json
{
  "sites": {
    "domain.com": {
      "story": {
        "title": "selector or strategy object",
        "author": "selector or strategy object",
        "content": "selector or strategy object",
        "summary": "optional selector or strategy",
        "coverImage": "optional selector or strategy",
        "tags": "optional selector or strategy",
        "multiPage": {
          "enabled": true,
          "strategy": "url-pattern|next-link",
          "nextPageSelector": "a.next-page",
          "pageParam": "page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "selector or strategy object",
        "pagination": {
          "enabled": true,
          "nextPageSelector": "a.next"
        }
      }
    }
  },
  "globalOptions": {
    "userAgent": "Mozilla/5.0...",
    "timeout": 30000,
    "retryAttempts": 3,
    "rateLimitMs": 1000
  }
}
```
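As an illustration, a filled-in entry for a hypothetical site (all selectors invented) might look like:
```json
{
  "sites": {
    "example-fiction.com": {
      "story": {
        "title": "h1.story-title",
        "author": { "strategy": "link-with-path", "pathContains": "/author/" },
        "content": { "strategy": "text-blocks", "minLength": 500, "containerHints": ["story", "chapter"] },
        "summary": "div.story-summary",
        "multiPage": {
          "enabled": true,
          "strategy": "next-link",
          "nextPageSelector": "a.next-page"
        }
      },
      "authorPage": {
        "storyLinks": "a.story-link"
      }
    }
  }
}
```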
## Usage Example
```typescript
// In a Next.js API route
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    const story = await scraper.scrapeStory(url);
    return Response.json(story);
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json(
        { error: error.message },
        { status: 400 }
      );
    }
    return Response.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
```
## Testing Considerations
1. **Unit Tests**: Test individual strategies and extractors
2. **Integration Tests**: Test against saved HTML samples
3. **Mock External Requests**: Use saved HTML fixtures to avoid hitting real sites
4. **Edge Cases**: Empty content, missing fields, malformed HTML
5. **Rate Limiting**: Verify delays are properly applied
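
For example, an integration test against a saved fixture might look like this (assuming Vitest and a local `fixtures/` directory, both of which are assumptions):
```typescript
// /lib/scraper/__tests__/scraper.test.ts (sketch)
import { describe, it, expect, vi } from 'vitest';
import { readFileSync } from 'fs';
import { StoryScraper } from '../scraper';

describe('StoryScraper', () => {
  it('extracts required fields from a saved HTML sample', async () => {
    const html = readFileSync(
      new URL('./fixtures/sample-story.html', import.meta.url),
      'utf-8'
    );
    // Stub the network layer so the test never hits a real site
    vi.spyOn(StoryScraper.prototype as any, 'fetchWithCache').mockResolvedValue(html);

    const story = await new StoryScraper().scrapeStory('https://domain.com/story/1');
    expect(story.title).not.toBe('');
    expect(story.author).not.toBe('');
    expect(story.content.length).toBeGreaterThan(0);
  });
});
```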
## Security Considerations
1. **URL Validation**: Only accept HTTP/HTTPS URLs
2. **Domain Allowlist**: Restrict to configured domains
3. **Content Sanitization**: Clean HTML before storage
4. **Request Timeouts**: Prevent hanging on slow sites
5. **Rate Limiting**: Prevent abuse of the scraping endpoint
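
The first two points can be enforced at the API boundary before any fetch happens; a minimal sketch reusing the site configuration as the allowlist:
```typescript
// Validate scheme and domain before handing a URL to the scraper (sketch)
import sitesConfig from '@/lib/scraper/config/sites.json';

export function validateScrapeUrl(raw: string): URL {
  let url: URL;
  try {
    url = new URL(raw);
  } catch {
    throw new Error('Invalid URL');
  }
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    throw new Error('Only HTTP/HTTPS URLs are accepted');
  }
  const domain = url.hostname.replace(/^www\./, '');
  if (!(domain in sitesConfig.sites)) {
    throw new Error(`Domain not in allowlist: ${domain}`);
  }
  return url;
}
```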
## Future Enhancements
1. **Browser Automation**: Use Playwright for JavaScript-rendered content
2. **AI Content Extraction**: Use LLMs for sites without clear patterns
3. **User-Submitted Configurations**: Allow users to define selectors
4. **Scheduled Imports**: Periodic author page checking
5. **Import History**: Track what has been imported to avoid duplicates