# StoryCove Web Scraper Feature Specification

## Overview

The Web Scraper feature allows users to import stories from external websites into StoryCove by providing URLs. The scraper extracts story metadata and content using configurable selectors defined in a JSON configuration file.

## Feature Requirements

### Core Functionality

1. **Single Story Import**: Users can provide a story URL and the scraper will extract:
   - Title (required)
   - Author (required)
   - Content (required)
   - Summary (optional)
   - Cover Image (optional)
   - Tags (optional)

2. **Author Page Scanning**: Users can provide an author page URL to:
   - Discover all stories by that author
   - Present a selectable list of stories
   - Allow bulk import of selected stories

3. **Multi-page Story Support**: Handle stories split across multiple pages by:
   - Detecting pagination
   - Fetching all pages
   - Merging content in correct order

### User Interface Flow

1. **Add Story View Enhancement** (a client-side sketch of this flow follows the list below):

   ```
   [Manual Entry] | [Import from URL]

   When "Import from URL" selected:
   - URL input field
   - "Fetch" button
   - Loading indicator during fetch
   - Pre-filled form with scraped data
   - Ability to edit before saving
   ```

2. **Bulk Import View** (future enhancement):

   ```
   - URL input for author page
   - "Scan for Stories" button
   - Checkbox list of discovered stories
   - "Import Selected" button
   - Progress indicator
   ```
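
For the client side, a minimal sketch of the "Import from URL" handler, assuming a React client component in the Add Story view; the function and callback names here are illustrative, not part of the spec:

```typescript
// Illustrative client-side handler (names are hypothetical, not defined by this spec)
async function handleFetchFromUrl(
  url: string,
  prefillForm: (story: unknown) => void
): Promise<void> {
  const res = await fetch('/api/scrape/story', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url }),
  });

  if (!res.ok) {
    const { error } = await res.json();
    throw new Error(error ?? 'Failed to import story');
  }

  // Pre-fill the Add Story form; the user can still edit before saving
  prefillForm(await res.json());
}
```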

## Technical Implementation

### Architecture

```
/lib/scraper/
├── config/
│   └── sites.json        # Site configurations
├── scraper.ts            # Main scraper class
├── strategies/           # Strategy implementations
│   ├── index.ts
│   ├── textExtractor.ts
│   ├── linkExtractor.ts
│   └── contentCleaner.ts
├── utils/
│   ├── rateLimit.ts
│   ├── cache.ts
│   └── urlParser.ts
└── types.ts              # TypeScript definitions
```

### API Routes

```typescript
// /app/api/scrape/story/route.ts
POST /api/scrape/story
Body: { url: string }
Response: {
  title: string,
  author: string,
  content: string,
  summary?: string,
  coverImage?: string,
  tags?: string[]
}

// /app/api/scrape/author/route.ts
POST /api/scrape/author
Body: { url: string }
Response: {
  stories: Array<{
    url: string,
    title: string,
    author: string,
    summary?: string
  }>
}
```
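
The author route would mirror the story route; a sketch of its handler is below, assuming a `scrapeAuthorPage` method on `StoryScraper` that this spec does not yet define:

```typescript
// /app/api/scrape/author/route.ts (sketch; scrapeAuthorPage is an assumed method name)
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    // Assumed method: returns the story list shape documented above
    const stories = await scraper.scrapeAuthorPage(url);

    return Response.json({ stories });
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json({ error: error.message }, { status: 400 });
    }
    return Response.json({ error: 'Internal server error' }, { status: 500 });
  }
}
```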

### Core Classes

```typescript
// /lib/scraper/types.ts
export interface SiteConfig {
  story: StorySelectors;
  authorPage: AuthorPageSelectors;
}

export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  multiPage?: MultiPageConfig;
}

export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}

export interface ScrapedStory {
  title: string;
  author: string;
  content: string;
  summary?: string;
  coverImage?: string;
  tags?: string[];
  sourceUrl: string;
}
```
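
`AuthorPageSelectors` and `MultiPageConfig` are referenced above but not spelled out; a plausible sketch, with field names taken from the configuration file structure later in this document:

```typescript
// /lib/scraper/types.ts (sketch; field names mirror the sites.json structure below)
export interface MultiPageConfig {
  enabled: boolean;
  strategy: 'url-pattern' | 'next-link';
  nextPageSelector?: string;  // used by the "next-link" strategy
  pageParam?: string;         // used by the "url-pattern" strategy
  maxPages?: number;
}

export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: {
    enabled: boolean;
    nextPageSelector: string;
  };
}
```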

### Main Scraper Implementation

```typescript
// /lib/scraper/scraper.ts
import * as cheerio from 'cheerio';
import {
  SiteConfig,
  StorySelectors,
  SelectorStrategy,
  MultiPageConfig,
  ScrapedStory
} from './types';
import { RateLimiter } from './utils/rateLimit';
import sitesConfig from './config/sites.json';

export class StoryScraper {
  private config: Record<string, SiteConfig>;
  private cache: Map<string, any>;
  private rateLimiter: RateLimiter;

  constructor() {
    this.config = sitesConfig.sites;
    this.cache = new Map();
    // Shared limiter used by all outbound requests (see Rate Limiting below)
    this.rateLimiter = new RateLimiter(sitesConfig.globalOptions?.rateLimitMs);
  }

  async scrapeStory(url: string): Promise<ScrapedStory> {
    const domain = this.getDomain(url);
    const siteConfig = this.config[domain];

    if (!siteConfig) {
      throw new Error(`Unsupported site: ${domain}`);
    }

    const html = await this.fetchWithCache(url);
    const $ = cheerio.load(html);

    const story: ScrapedStory = {
      title: await this.extractField($, siteConfig.story.title, html),
      author: await this.extractField($, siteConfig.story.author, html),
      content: await this.extractContent($, siteConfig.story, url),
      sourceUrl: url
    };

    // Extract optional fields (coverImage and tags follow the same pattern as summary)
    if (siteConfig.story.summary) {
      story.summary = await this.extractField($, siteConfig.story.summary, html);
    }

    return story;
  }

  private async extractField(
    $: cheerio.CheerioAPI,
    selector: string | SelectorStrategy,
    html: string
  ): Promise<string> {
    if (typeof selector === 'string') {
      // Simple CSS selector
      return $(selector).first().text().trim();
    }

    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }

  private async executeStrategy(
    $: cheerio.CheerioAPI,
    strategy: SelectorStrategy,
    html: string
  ): Promise<string> {
    // These delegate to the functions in /lib/scraper/strategies
    switch (strategy.strategy) {
      case 'text-pattern':
        return this.extractByTextPattern(html, strategy);
      case 'link-with-path':
        return this.extractLinkWithPath($, strategy);
      case 'text-blocks':
        return this.extractTextBlocks($, strategy);
      // ... other strategies
      default:
        throw new Error(`Unknown extraction strategy: ${strategy.strategy}`);
    }
  }
}
```
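
`getDomain` and `fetchWithCache` are referenced above but not shown; a minimal sketch, assuming the `globalOptions` block from `sites.json` and the in-memory cache and rate limiter created in the constructor:

```typescript
// /lib/scraper/scraper.ts (sketch of the helpers referenced above)
private getDomain(url: string): string {
  // Normalize "www." so "www.domain.com" and "domain.com" share one config entry
  return new URL(url).hostname.replace(/^www\./, '');
}

private async fetchWithCache(url: string): Promise<string> {
  const cached = this.cache.get(url);
  if (cached) return cached;

  await this.rateLimiter.throttle();

  const response = await fetch(url, {
    headers: { 'User-Agent': sitesConfig.globalOptions.userAgent },
    signal: AbortSignal.timeout(sitesConfig.globalOptions.timeout)
  });
  if (!response.ok) {
    throw new Error(`Request failed with status ${response.status}`);
  }

  const html = await response.text();
  this.cache.set(url, html);
  return html;
}
```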

### Strategy Implementations

```typescript
// /lib/scraper/strategies/textExtractor.ts
import * as cheerio from 'cheerio';
import { TextPatternStrategy, TextBlockStrategy } from '../types'; // strategy-specific configs (sketched after this block)

export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  const regex = new RegExp(config.pattern, 'i');
  const match = html.match(regex);
  // Guard against patterns that matched but lack the expected capture group
  return match?.[config.group ?? 1]?.trim() ?? '';
}

export function extractTextBlocks(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  const blocks: Array<{ element: any; text: string }> = [];

  $('*').each((_, elem) => {
    const $elem = $(elem);
    // Direct text only: clone the node, strip its children, then read what remains
    const text = $elem.clone().children().remove().end().text().trim();

    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });

  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    const hasHints = config.containerHints?.some(hint =>
      $(block.element).attr('class')?.includes(hint) ||
      $(block.element).attr('id')?.includes(hint)
    );
    return hasHints || blocks.length === 1;
  });

  return storyBlock ? $(storyBlock.element).html() || '' : '';
}
```
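
`TextPatternStrategy` and `TextBlockStrategy` are not declared in the types shown earlier; a plausible shape, inferred from how the extractors use their fields:

```typescript
// /lib/scraper/types.ts (sketch of the strategy-specific configs)
export interface TextPatternStrategy extends SelectorStrategy {
  strategy: 'text-pattern';
  pattern: string;   // regular expression applied to the raw HTML
  group?: number;    // capture group to return (defaults to 1)
}

export interface TextBlockStrategy extends SelectorStrategy {
  strategy: 'text-blocks';
  minLength?: number;        // minimum direct-text length to count as a block (default 500)
  containerHints?: string[]; // class/id fragments that mark the story container
}
```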

### Rate Limiting

```typescript
// /lib/scraper/utils/rateLimit.ts
export class RateLimiter {
  private lastRequest: number = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  async throttle(): Promise<void> {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequest;

    if (timeSinceLastRequest < this.minDelay) {
      await this.delay(this.minDelay - timeSinceLastRequest);
    }

    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
```
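
Usage is a single awaited call before each outbound request; an illustrative snippet:

```typescript
// Throttle before every outbound request (illustrative; inside an async function)
const limiter = new RateLimiter(1000);   // at most one request per second
await limiter.throttle();                // waits only if the last request was < 1s ago
const html = await (await fetch(url)).text();
```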

### Multi-page Story Handling

```typescript
// /lib/scraper/scraper.ts (addition)
private async extractContent(
  $: cheerio.CheerioAPI,
  storyConfig: StorySelectors,
  url: string
): Promise<string> {
  let content = await this.extractField($, storyConfig.content, $.html());

  if (storyConfig.multiPage?.enabled) {
    const additionalPages = await this.fetchAdditionalPages(
      $,
      url,
      storyConfig.multiPage
    );

    for (const pageHtml of additionalPages) {
      const $page = cheerio.load(pageHtml);
      const pageContent = await this.extractField(
        $page,
        storyConfig.content,
        pageHtml
      );
      content += '\n\n' + pageContent;
    }
  }

  return content;
}

private async fetchAdditionalPages(
  $: cheerio.CheerioAPI,
  baseUrl: string,
  config: MultiPageConfig
): Promise<string[]> {
  const pages: string[] = [];
  let $current = $;          // DOM of the most recently fetched page
  let currentUrl = baseUrl;
  let pageNum = 2;

  while (pageNum <= (config.maxPages || 20)) {
    let nextUrl: string | null = null;

    if (config.strategy === 'url-pattern') {
      nextUrl = this.buildPageUrl(baseUrl, pageNum, config);
    } else if (config.nextPageSelector) {
      // Follow the "next" link from the page fetched last, not from the first page
      const nextLink = $current(config.nextPageSelector).attr('href');
      if (nextLink) {
        nextUrl = new URL(nextLink, currentUrl).href;
      }
    }

    if (!nextUrl) break;

    try {
      await this.rateLimiter.throttle();
      const html = await this.fetchWithCache(nextUrl);
      pages.push(html);
      $current = cheerio.load(html);
      currentUrl = nextUrl;
      pageNum++;
    } catch (error) {
      console.error(`Failed to fetch page ${pageNum}:`, error);
      break;
    }
  }

  return pages;
}
```
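
`buildPageUrl` is referenced above but not defined; a minimal sketch, assuming the `pageParam` field from the `multiPage` configuration:

```typescript
// /lib/scraper/scraper.ts (sketch of the url-pattern helper)
private buildPageUrl(baseUrl: string, pageNum: number, config: MultiPageConfig): string {
  // e.g. https://domain.com/story/123 -> https://domain.com/story/123?page=2
  const url = new URL(baseUrl);
  url.searchParams.set(config.pageParam || 'page', String(pageNum));
  return url.href;
}
```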

### Error Handling

```typescript
// /lib/scraper/scraper.ts (addition)
async scrapeStory(url: string): Promise<ScrapedStory> {
  try {
    // ... existing implementation
  } catch (error) {
    if (error instanceof Error) {
      throw new ScraperError(
        `Failed to scrape ${url}: ${error.message}`,
        url,
        error
      );
    }
    throw error;
  }
}

export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}
```

## Configuration File Structure

```json
{
  "sites": {
    "domain.com": {
      "story": {
        "title": "selector or strategy object",
        "author": "selector or strategy object",
        "content": "selector or strategy object",
        "summary": "optional selector or strategy",
        "coverImage": "optional selector or strategy",
        "tags": "optional selector or strategy",
        "multiPage": {
          "enabled": true,
          "strategy": "url-pattern|next-link",
          "nextPageSelector": "a.next-page",
          "pageParam": "page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "selector or strategy object",
        "pagination": {
          "enabled": true,
          "nextPageSelector": "a.next"
        }
      }
    }
  },
  "globalOptions": {
    "userAgent": "Mozilla/5.0...",
    "timeout": 30000,
    "retryAttempts": 3,
    "rateLimitMs": 1000
  }
}
```
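
For illustration, a concrete entry mixing a plain CSS selector with strategy objects; the domain and selectors below are hypothetical placeholders, not a supported site:

```json
{
  "sites": {
    "example-fiction.com": {
      "story": {
        "title": "h1.story-title",
        "author": {
          "strategy": "text-pattern",
          "pattern": "by\\s+<a[^>]*class=\"author\"[^>]*>([^<]+)</a>",
          "group": 1
        },
        "content": {
          "strategy": "text-blocks",
          "minLength": 500,
          "containerHints": ["story", "chapter"]
        },
        "multiPage": {
          "enabled": true,
          "strategy": "next-link",
          "nextPageSelector": "a.next-page"
        }
      },
      "authorPage": {
        "storyLinks": "a.story-link"
      }
    }
  }
}
```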

## Usage Example

```typescript
// In a Next.js API route
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    const story = await scraper.scrapeStory(url);

    return Response.json(story);
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json(
        { error: error.message },
        { status: 400 }
      );
    }

    return Response.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
```

## Testing Considerations

1. **Unit Tests**: Test individual strategies and extractors
2. **Integration Tests**: Test against saved HTML samples
3. **Mock External Requests**: Use saved HTML fixtures to avoid hitting real sites (see the sketch after this list)
4. **Edge Cases**: Empty content, missing fields, malformed HTML
5. **Rate Limiting**: Verify delays are properly applied
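
As an example of points 2 and 3, a fixture-based test sketch; it assumes Vitest and a hypothetical saved fixture path:

```typescript
// Sketch: fixture-based test for the text-pattern extractor (Vitest assumed)
import { readFileSync } from 'node:fs';
import { describe, it, expect } from 'vitest';
import { extractByTextPattern } from '@/lib/scraper/strategies/textExtractor';

describe('extractByTextPattern', () => {
  it('extracts the author from a saved HTML sample', () => {
    const html = readFileSync('fixtures/sample-story.html', 'utf-8'); // hypothetical fixture
    const author = extractByTextPattern(html, {
      strategy: 'text-pattern',
      pattern: 'by\\s+<a[^>]*>([^<]+)</a>',
      group: 1,
    });
    expect(author.length).toBeGreaterThan(0);
  });
});
```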

## Security Considerations

1. **URL Validation**: Only accept HTTP/HTTPS URLs (see the sketch after this list)
2. **Domain Allowlist**: Restrict to configured domains
3. **Content Sanitization**: Clean HTML before storage
4. **Request Timeouts**: Prevent hanging on slow sites
5. **Rate Limiting**: Prevent abuse of the scraping endpoint
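
A sketch covering points 1 and 2, reusing the domains configured in sites.json as the allowlist; the helper name is illustrative:

```typescript
// Sketch: validate a user-supplied URL before scraping (helper name is illustrative)
import sitesConfig from '@/lib/scraper/config/sites.json';

export function assertScrapableUrl(raw: string): URL {
  const url = new URL(raw); // throws on malformed input
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    throw new Error('Only HTTP(S) URLs are accepted');
  }
  const domain = url.hostname.replace(/^www\./, '');
  if (!(domain in sitesConfig.sites)) {
    throw new Error(`Domain not in allowlist: ${domain}`);
  }
  return url;
}
```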

## Future Enhancements

1. **Browser Automation**: Use Playwright for JavaScript-rendered content
2. **AI Content Extraction**: Use LLMs for sites without clear patterns
3. **User-Submitted Configurations**: Allow users to define selectors
4. **Scheduled Imports**: Periodic author page checking
5. **Import History**: Track what has been imported to avoid duplicates