# StoryCove Web Scraper Feature Specification

## Overview

The Web Scraper feature allows users to import stories from external websites into StoryCove by providing URLs. The scraper extracts story metadata and content using configurable selectors defined in a JSON configuration file.

## Feature Requirements

### Core Functionality

1. **Single Story Import**: Users can provide a story URL and the scraper will extract:
   - Title (required)
   - Author (required)
   - Content (required)
   - Summary (optional)
   - Cover Image (optional)
   - Tags (optional)

2. **Author Page Scanning**: Users can provide an author page URL to:
   - Discover all stories by that author
   - Present a selectable list of stories
   - Allow bulk import of selected stories

3. **Multi-page Story Support**: Handle stories split across multiple pages by:
   - Detecting pagination
   - Fetching all pages
   - Merging content in the correct order

### User Interface Flow

1. **Add Story View Enhancement**:
   ```
   [Manual Entry] | [Import from URL]

   When "Import from URL" is selected:
   - URL input field
   - "Fetch" button
   - Loading indicator during fetch
   - Pre-filled form with scraped data
   - Ability to edit before saving
   ```

2. **Bulk Import View** (future enhancement):
   ```
   - URL input for author page
   - "Scan for Stories" button
   - Checkbox list of discovered stories
   - "Import Selected" button
   - Progress indicator
   ```

## Technical Implementation

### Architecture

```
/lib/scraper/
├── config/
│   └── sites.json          # Site configurations
├── scraper.ts              # Main scraper class
├── strategies/             # Strategy implementations
│   ├── index.ts
│   ├── textExtractor.ts
│   ├── linkExtractor.ts
│   └── contentCleaner.ts
├── utils/
│   ├── rateLimit.ts
│   ├── cache.ts
│   └── urlParser.ts
└── types.ts                # TypeScript definitions
```

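The `urlParser.ts` utility listed above is never shown in this spec, but the scraper calls `getDomain` to pick a site configuration. A minimal sketch, with the `www.`-stripping rule as an assumption:

```typescript
// /lib/scraper/utils/urlParser.ts (sketch)
export function getDomain(url: string): string {
  // Throws on malformed input, which callers surface as a scraping error
  const { hostname } = new URL(url);
  // Strip a leading "www." so config keys like "domain.com" match both forms
  return hostname.replace(/^www\./, '');
}
```
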
### API Routes

```typescript
// /app/api/scrape/story/route.ts
POST /api/scrape/story
Body: { url: string }
Response: {
  title: string,
  author: string,
  content: string,
  summary?: string,
  coverImage?: string,
  tags?: string[]
}

// /app/api/scrape/author/route.ts
POST /api/scrape/author
Body: { url: string }
Response: {
  stories: Array<{
    url: string,
    title: string,
    author: string,
    summary?: string
  }>
}
```

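The author endpoint has no handler shown later in this spec (the Usage Example only covers the story route). A sketch of what it could look like, assuming a hypothetical `scrapeAuthorPage` method symmetrical to `scrapeStory` — the spec defines this endpoint's config and response shape but not the method itself:

```typescript
// /app/api/scrape/author/route.ts (sketch; scrapeAuthorPage is an assumed method)
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    // Would use authorPage.storyLinks (and pagination) from the site config
    const stories = await scraper.scrapeAuthorPage(url);
    return Response.json({ stories });
  } catch (error) {
    const status = error instanceof ScraperError ? 400 : 500;
    return Response.json(
      { error: error instanceof Error ? error.message : 'Internal server error' },
      { status }
    );
  }
}
```
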
### Core Classes

```typescript
// /lib/scraper/types.ts
export interface SiteConfig {
  story: StorySelectors;
  authorPage: AuthorPageSelectors;
}

export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  multiPage?: MultiPageConfig;
}

// These two shapes mirror the "authorPage" and "multiPage" sections of sites.json
export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: {
    enabled: boolean;
    nextPageSelector: string;
  };
}

export interface MultiPageConfig {
  enabled: boolean;
  strategy: 'url-pattern' | 'next-link';
  nextPageSelector?: string;
  pageParam?: string;
  maxPages?: number;
}

export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}

export interface ScrapedStory {
  title: string;
  author: string;
  content: string;
  summary?: string;
  coverImage?: string;
  tags?: string[];
  sourceUrl: string;
}
```

### Main Scraper Implementation

```typescript
// /lib/scraper/scraper.ts
import * as cheerio from 'cheerio';
import { SiteConfig, ScrapedStory, StorySelectors, SelectorStrategy, MultiPageConfig } from './types';
import { RateLimiter } from './utils/rateLimit';
import { getDomain } from './utils/urlParser';
import sitesConfig from './config/sites.json';

export class StoryScraper {
  private config: Record<string, SiteConfig>;
  private cache: Map<string, string>;
  private rateLimiter: RateLimiter;

  constructor() {
    this.config = sitesConfig.sites;
    this.cache = new Map();
    this.rateLimiter = new RateLimiter(sitesConfig.globalOptions.rateLimitMs);
  }

  async scrapeStory(url: string): Promise<ScrapedStory> {
    const domain = getDomain(url);
    const siteConfig = this.config[domain];

    if (!siteConfig) {
      throw new Error(`Unsupported site: ${domain}`);
    }

    const html = await this.fetchWithCache(url);
    const $ = cheerio.load(html);

    const story: ScrapedStory = {
      title: await this.extractField($, siteConfig.story.title, html),
      author: await this.extractField($, siteConfig.story.author, html),
      content: await this.extractContent($, siteConfig.story, url),
      sourceUrl: url
    };

    // Extract optional fields
    if (siteConfig.story.summary) {
      story.summary = await this.extractField($, siteConfig.story.summary, html);
    }

    return story;
  }

  private async extractField(
    $: cheerio.CheerioAPI,
    selector: string | SelectorStrategy,
    html: string
  ): Promise<string> {
    if (typeof selector === 'string') {
      // Simple CSS selector
      return $(selector).first().text().trim();
    }

    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }

  private async executeStrategy(
    $: cheerio.CheerioAPI,
    strategy: SelectorStrategy,
    html: string
  ): Promise<string> {
    // Each case delegates to an implementation in /lib/scraper/strategies
    switch (strategy.strategy) {
      case 'text-pattern':
        return this.extractByTextPattern(html, strategy);
      case 'link-with-path':
        return this.extractLinkWithPath($, strategy);
      case 'text-blocks':
        return this.extractTextBlocks($, strategy);
      // ... other strategies
      default:
        throw new Error(`Unknown selector strategy: ${strategy.strategy}`);
    }
  }
}
```

### Strategy Implementations

```typescript
// /lib/scraper/strategies/textExtractor.ts
import * as cheerio from 'cheerio';
import { SelectorStrategy } from '../types';

// Concrete strategy shapes; the fields mirror what the functions below read
interface TextPatternStrategy extends SelectorStrategy {
  pattern: string;
  group?: number;
}

interface TextBlockStrategy extends SelectorStrategy {
  minLength?: number;
  containerHints?: string[];
}

export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  const regex = new RegExp(config.pattern, 'i');
  const match = html.match(regex);
  // Default to capture group 1; ?? (not ||) keeps group 0 selectable
  return match ? (match[config.group ?? 1] ?? '').trim() : '';
}

export function extractTextBlocks(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  const blocks: Array<{ element: any; text: string }> = [];

  // Collect elements whose own text (children excluded) is long enough
  // to plausibly be story prose
  $('*').each((_, elem) => {
    const $elem = $(elem);
    const text = $elem.clone().children().remove().end().text().trim();

    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });

  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    const hasHints = config.containerHints?.some(hint =>
      $(block.element).attr('class')?.includes(hint) ||
      $(block.element).attr('id')?.includes(hint)
    );
    return hasHints || blocks.length === 1;
  });

  return storyBlock ? $(storyBlock.element).html() || '' : '';
}
```

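`executeStrategy` also dispatches to a `link-with-path` strategy that is not shown. A plausible sketch for `/lib/scraper/strategies/linkExtractor.ts` — the field name `pathContains` and the matching rule are assumptions:

```typescript
// /lib/scraper/strategies/linkExtractor.ts (sketch; field names are assumptions)
import * as cheerio from 'cheerio';
import { SelectorStrategy } from '../types';

interface LinkWithPathStrategy extends SelectorStrategy {
  pathContains: string; // substring the link's href must contain, e.g. "/author/"
}

export function extractLinkWithPath(
  $: cheerio.CheerioAPI,
  config: LinkWithPathStrategy
): string {
  // Return the text of the first anchor whose href matches the path hint
  const link = $('a')
    .filter((_, el) => ($(el).attr('href') ?? '').includes(config.pathContains))
    .first();
  return link.text().trim();
}
```
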
### Rate Limiting

```typescript
// /lib/scraper/utils/rateLimit.ts
export class RateLimiter {
  private lastRequest: number = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  async throttle(): Promise<void> {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequest;

    if (timeSinceLastRequest < this.minDelay) {
      await this.delay(this.minDelay - timeSinceLastRequest);
    }

    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
```

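Both `scrapeStory` and `fetchAdditionalPages` call `fetchWithCache`, which this spec never defines. A minimal sketch in the spec's own "(addition)" style, pulling `userAgent` and `timeout` from `globalOptions`; the exact shape is an assumption:

```typescript
// /lib/scraper/scraper.ts (sketch of the undefined fetchWithCache helper)
private async fetchWithCache(url: string): Promise<string> {
  const cached = this.cache.get(url);
  if (cached) return cached;

  // Respect the per-site delay before every network request
  await this.rateLimiter.throttle();

  const { userAgent, timeout } = sitesConfig.globalOptions;
  const response = await fetch(url, {
    headers: { 'User-Agent': userAgent },
    signal: AbortSignal.timeout(timeout), // abort slow sites (see Security Considerations)
  });

  if (!response.ok) {
    throw new Error(`HTTP ${response.status} fetching ${url}`);
  }

  const html = await response.text();
  this.cache.set(url, html);
  return html;
}
```
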
### Multi-page Story Handling

```typescript
// /lib/scraper/scraper.ts (addition)
private async extractContent(
  $: cheerio.CheerioAPI,
  storyConfig: StorySelectors,
  url: string
): Promise<string> {
  let content = await this.extractField($, storyConfig.content, $.html());

  if (storyConfig.multiPage?.enabled) {
    const additionalPages = await this.fetchAdditionalPages(
      $,
      url,
      storyConfig.multiPage
    );

    for (const pageHtml of additionalPages) {
      const $page = cheerio.load(pageHtml);
      const pageContent = await this.extractField(
        $page,
        storyConfig.content,
        pageHtml
      );
      content += '\n\n' + pageContent;
    }
  }

  return content;
}

private async fetchAdditionalPages(
  $: cheerio.CheerioAPI,
  baseUrl: string,
  config: MultiPageConfig
): Promise<string[]> {
  const pages: string[] = [];
  let $current = $; // always query the most recently fetched page
  let currentUrl = baseUrl;
  let pageNum = 2;

  while (pageNum <= (config.maxPages || 20)) {
    let nextUrl: string | null = null;

    if (config.strategy === 'url-pattern') {
      nextUrl = this.buildPageUrl(baseUrl, pageNum, config);
    } else if (config.nextPageSelector) {
      // Read the "next" link from the current page, not the first page,
      // so pagination can advance past page 2
      const nextLink = $current(config.nextPageSelector).attr('href');
      if (nextLink) {
        nextUrl = new URL(nextLink, currentUrl).href;
      }
    }

    if (!nextUrl) break;

    try {
      await this.rateLimiter.throttle();
      const html = await this.fetchWithCache(nextUrl);
      pages.push(html);
      $current = cheerio.load(html);
      currentUrl = nextUrl;
      pageNum++;
    } catch (error) {
      console.error(`Failed to fetch page ${pageNum}:`, error);
      break;
    }
  }

  return pages;
}
```

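`buildPageUrl` is referenced by the `url-pattern` branch but never defined. A minimal sketch using `pageParam` from `MultiPageConfig`; the query-parameter scheme is an assumption (some sites paginate with path segments instead):

```typescript
// /lib/scraper/scraper.ts (sketch of the undefined buildPageUrl helper)
private buildPageUrl(baseUrl: string, pageNum: number, config: MultiPageConfig): string {
  // Append or overwrite the page query parameter, e.g. ?page=2
  const url = new URL(baseUrl);
  url.searchParams.set(config.pageParam || 'page', String(pageNum));
  return url.href;
}
```
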
### Error Handling

```typescript
// /lib/scraper/scraper.ts (addition)
async scrapeStory(url: string): Promise<ScrapedStory> {
  try {
    // ... existing implementation
  } catch (error) {
    if (error instanceof Error) {
      throw new ScraperError(
        `Failed to scrape ${url}: ${error.message}`,
        url,
        error
      );
    }
    throw error;
  }
}

export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}
```

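The configuration below declares `retryAttempts`, but no retry logic appears in this spec. One way to honor it, as a sketch; the exponential backoff is an added assumption:

```typescript
// /lib/scraper/scraper.ts (sketch; retryAttempts comes from globalOptions)
private async withRetry<T>(fn: () => Promise<T>): Promise<T> {
  const attempts = sitesConfig.globalOptions.retryAttempts ?? 3;
  let lastError: unknown;

  for (let i = 0; i < attempts; i++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      // Back off briefly before retrying; the delay doubles each attempt
      await new Promise(r => setTimeout(r, 500 * 2 ** i));
    }
  }
  throw lastError;
}
```

`fetchWithCache` could then wrap its network call as `await this.withRetry(() => fetch(...))`.
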
## Configuration File Structure

```json
{
  "sites": {
    "domain.com": {
      "story": {
        "title": "selector or strategy object",
        "author": "selector or strategy object",
        "content": "selector or strategy object",
        "summary": "optional selector or strategy",
        "coverImage": "optional selector or strategy",
        "tags": "optional selector or strategy",
        "multiPage": {
          "enabled": true,
          "strategy": "url-pattern|next-link",
          "nextPageSelector": "a.next-page",
          "pageParam": "page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "selector or strategy object",
        "pagination": {
          "enabled": true,
          "nextPageSelector": "a.next"
        }
      }
    }
  },
  "globalOptions": {
    "userAgent": "Mozilla/5.0...",
    "timeout": 30000,
    "retryAttempts": 3,
    "rateLimitMs": 1000
  }
}
```

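A concrete entry might look like the following; the domain and every selector here are invented for illustration. String values are plain CSS selectors, while object values pick a named strategy:

```json
{
  "sites": {
    "example-fiction.com": {
      "story": {
        "title": "h1.story-title",
        "author": "a.author-name",
        "content": {
          "strategy": "text-blocks",
          "minLength": 500,
          "containerHints": ["story", "chapter", "content"]
        },
        "summary": "div.story-description",
        "tags": "ul.tag-list li",
        "multiPage": {
          "enabled": true,
          "strategy": "next-link",
          "nextPageSelector": "a.next-page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "div.story-list a.story-link"
      }
    }
  }
}
```
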
## Usage Example

```typescript
// In a Next.js API route
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    const story = await scraper.scrapeStory(url);

    return Response.json(story);
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json(
        { error: error.message },
        { status: 400 }
      );
    }

    return Response.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
```

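On the client, the Add Story view's "Fetch" flow can call this route with a plain `fetch` and use the result to pre-fill the form. The function name and error shape here are illustrative only:

```typescript
// Inside the Add Story view (sketch; names are hypothetical)
async function fetchStoryFromUrl(url: string) {
  const res = await fetch('/api/scrape/story', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url }),
  });

  if (!res.ok) {
    const { error } = await res.json();
    throw new Error(error ?? 'Scrape failed');
  }

  // Pre-fill the manual-entry form so the user can edit before saving
  return res.json();
}
```
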
## Testing Considerations

1. **Unit Tests**: Test individual strategies and extractors
2. **Integration Tests**: Test against saved HTML samples
3. **Mock External Requests**: Use saved HTML fixtures to avoid hitting real sites (see the sketch after this list)
4. **Edge Cases**: Empty content, missing fields, malformed HTML
5. **Rate Limiting**: Verify delays are properly applied

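A fixture-based unit test for one strategy might look like this; Vitest and the `fixtures/` path are assumptions, and any test runner works the same way:

```typescript
// textExtractor.test.ts (sketch; Vitest is an assumed test runner)
import { describe, it, expect } from 'vitest';
import { readFileSync } from 'node:fs';
import * as cheerio from 'cheerio';
import { extractTextBlocks } from '@/lib/scraper/strategies/textExtractor';

describe('extractTextBlocks', () => {
  it('finds the story body in a saved fixture', () => {
    // Saved HTML keeps tests offline and deterministic
    const html = readFileSync('fixtures/sample-story.html', 'utf8');
    const $ = cheerio.load(html);

    const content = extractTextBlocks($, {
      strategy: 'text-blocks',
      minLength: 500,
      containerHints: ['story', 'content'],
    });

    expect(content.length).toBeGreaterThan(500);
  });

  it('returns an empty string when nothing qualifies', () => {
    const $ = cheerio.load('<p>too short</p>');
    expect(extractTextBlocks($, { strategy: 'text-blocks' })).toBe('');
  });
});
```
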
## Security Considerations

1. **URL Validation**: Only accept HTTP/HTTPS URLs (see the validation sketch after this list)
2. **Domain Allowlist**: Restrict to configured domains
3. **Content Sanitization**: Clean HTML before storage
4. **Request Timeouts**: Prevent hanging on slow sites
5. **Rate Limiting**: Prevent abuse of the scraping endpoint

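Items 1 and 2 can be enforced with a small guard run before any scraping; this is a sketch, and the helper name is an assumption:

```typescript
// Sketch of a validation guard for the scrape routes (covers items 1-2 above)
import sitesConfig from '@/lib/scraper/config/sites.json';

export function assertScrapableUrl(raw: string): URL {
  let url: URL;
  try {
    url = new URL(raw);
  } catch {
    throw new Error('Invalid URL');
  }

  // Only plain HTTP(S); rejects file:, javascript:, data:, etc.
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    throw new Error('Only HTTP/HTTPS URLs are supported');
  }

  // Allowlist: the hostname must match a configured site
  const domain = url.hostname.replace(/^www\./, '');
  if (!(domain in sitesConfig.sites)) {
    throw new Error(`Unsupported site: ${domain}`);
  }

  return url;
}
```
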
## Future Enhancements

1. **Browser Automation**: Use Playwright for JavaScript-rendered content
2. **AI Content Extraction**: Use LLMs for sites without clear patterns
3. **User-Submitted Configurations**: Allow users to define selectors
4. **Scheduled Imports**: Periodic author page checking
5. **Import History**: Track what has been imported to avoid duplicates