scraping and improvements

This commit is contained in:
Stefan Hardegger
2025-07-28 13:52:09 +02:00
parent f95d7aa8bb
commit fcad028959
31 changed files with 3788 additions and 118 deletions

storycove-scraper-spec.md Normal file

@@ -0,0 +1,474 @@
# StoryCove Web Scraper Feature Specification
## Overview
The Web Scraper feature allows users to import stories from external websites into StoryCove by providing URLs. The scraper extracts story metadata and content using configurable selectors defined in a JSON configuration file.
## Feature Requirements
### Core Functionality
1. **Single Story Import**: Users can provide a story URL and the scraper will extract:
   - Title (required)
   - Author (required)
   - Content (required)
   - Summary (optional)
   - Cover Image (optional)
   - Tags (optional)
2. **Author Page Scanning**: Users can provide an author page URL to:
   - Discover all stories by that author
   - Present a selectable list of stories
   - Allow bulk import of selected stories
3. **Multi-page Story Support**: Handle stories split across multiple pages by:
   - Detecting pagination
   - Fetching all pages
   - Merging content in the correct order
### User Interface Flow
1. **Add Story View Enhancement**:
   ```
   [Manual Entry] | [Import from URL]

   When "Import from URL" is selected:
   - URL input field
   - "Fetch" button
   - Loading indicator during fetch
   - Pre-filled form with scraped data
   - Ability to edit before saving
   ```
2. **Bulk Import View** (future enhancement):
   ```
   - URL input for author page
   - "Scan for Stories" button
   - Checkbox list of discovered stories
   - "Import Selected" button
   - Progress indicator
   ```
## Technical Implementation
### Architecture
```
/lib/scraper/
├── config/
│   └── sites.json        # Site configurations
├── scraper.ts            # Main scraper class
├── strategies/           # Strategy implementations
│   ├── index.ts
│   ├── textExtractor.ts
│   ├── linkExtractor.ts
│   └── contentCleaner.ts
├── utils/
│   ├── rateLimit.ts
│   ├── cache.ts
│   └── urlParser.ts
└── types.ts              # TypeScript definitions
```
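Each strategy lives in its own module and is re-exported through `strategies/index.ts`. A minimal barrel file, assuming the function names used later in this spec (the `contentCleaner` export name is a guess), could be:
```typescript
// /lib/scraper/strategies/index.ts (sketch)
export { extractByTextPattern, extractTextBlocks } from './textExtractor';
export { extractLinkWithPath } from './linkExtractor';
export { cleanContent } from './contentCleaner'; // assumed export name
```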
### API Routes
```typescript
// /app/api/scrape/story/route.ts
POST /api/scrape/story
Body: { url: string }
Response: {
  title: string,
  author: string,
  content: string,
  summary?: string,
  coverImage?: string,
  tags?: string[]
}

// /app/api/scrape/author/route.ts
POST /api/scrape/author
Body: { url: string }
Response: {
  stories: Array<{
    url: string,
    title: string,
    author: string,
    summary?: string
  }>
}
```
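For reference, the Add Story view's "Fetch" button could call the story endpoint as sketched below. The two callbacks standing in for UI state (loading indicator, pre-filled form) are illustrative, not part of the contract:
```typescript
// Illustrative client-side call to POST /api/scrape/story
async function fetchStoryFromUrl(
  url: string,
  onLoading: (loading: boolean) => void,               // e.g. toggle the loading indicator
  onScraped: (story: Record<string, unknown>) => void  // e.g. pre-fill the edit form
): Promise<void> {
  onLoading(true);
  try {
    const res = await fetch('/api/scrape/story', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ url })
    });
    if (!res.ok) throw new Error(`Scrape failed: HTTP ${res.status}`);
    onScraped(await res.json());
  } finally {
    onLoading(false);
  }
}
```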
### Core Classes
```typescript
// /lib/scraper/types.ts
export interface SiteConfig {
  story: StorySelectors;
  authorPage: AuthorPageSelectors;
}

export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  multiPage?: MultiPageConfig;
}

export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: {
    enabled: boolean;
    nextPageSelector: string;
  };
}

export interface MultiPageConfig {
  enabled: boolean;
  strategy: 'url-pattern' | 'next-link';
  nextPageSelector?: string;
  pageParam?: string;
  maxPages?: number;
}

export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}

export interface ScrapedStory {
  title: string;
  author: string;
  content: string;
  summary?: string;
  coverImage?: string;
  tags?: string[];
  sourceUrl: string;
}
```
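A plain string is treated as a CSS selector; an object selects a named strategy, with any additional keys passed through as strategy-specific options. For example (values illustrative):
```json
{
  "author": {
    "strategy": "text-pattern",
    "pattern": "by\\s+([A-Za-z0-9_-]+)",
    "group": 1
  }
}
```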
### Main Scraper Implementation
```typescript
// /lib/scraper/scraper.ts
import * as cheerio from 'cheerio';
import { SiteConfig, StorySelectors, SelectorStrategy, ScrapedStory } from './types';
import { extractByTextPattern, extractTextBlocks, extractLinkWithPath } from './strategies';
import { RateLimiter } from './utils/rateLimit';
import sitesConfig from './config/sites.json';

export class StoryScraper {
  private config: Record<string, SiteConfig>;
  private cache: Map<string, string>;
  private rateLimiter: RateLimiter;

  constructor() {
    this.config = sitesConfig.sites as Record<string, SiteConfig>;
    this.cache = new Map();
    this.rateLimiter = new RateLimiter();
  }

  async scrapeStory(url: string): Promise<ScrapedStory> {
    const domain = this.getDomain(url);
    const siteConfig = this.config[domain];
    if (!siteConfig) {
      throw new Error(`Unsupported site: ${domain}`);
    }

    const html = await this.fetchWithCache(url);
    const $ = cheerio.load(html);

    const story: ScrapedStory = {
      title: await this.extractField($, siteConfig.story.title, html),
      author: await this.extractField($, siteConfig.story.author, html),
      content: await this.extractContent($, siteConfig.story, url),
      sourceUrl: url
    };

    // Extract optional fields
    if (siteConfig.story.summary) {
      story.summary = await this.extractField($, siteConfig.story.summary, html);
    }

    return story;
  }

  private async extractField(
    $: cheerio.CheerioAPI,
    selector: string | SelectorStrategy,
    html: string
  ): Promise<string> {
    if (typeof selector === 'string') {
      // Simple CSS selector
      return $(selector).first().text().trim();
    }
    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }

  private async executeStrategy(
    $: cheerio.CheerioAPI,
    strategy: SelectorStrategy,
    html: string
  ): Promise<string> {
    // Strategy objects carry strategy-specific options (pattern, hints, ...)
    switch (strategy.strategy) {
      case 'text-pattern':
        return extractByTextPattern(html, strategy as any);
      case 'link-with-path':
        return extractLinkWithPath($, strategy as any);
      case 'text-blocks':
        return extractTextBlocks($, strategy as any);
      // ... other strategies
      default:
        throw new Error(`Unknown extraction strategy: ${strategy.strategy}`);
    }
  }
}
```
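`scrapeStory` relies on two helpers, `getDomain` and `fetchWithCache`, that this spec does not define. A minimal sketch, assuming the user agent and timeout come from `globalOptions` in the configuration file, might be:
```typescript
// /lib/scraper/scraper.ts (hypothetical helper sketch)
private getDomain(url: string): string {
  // Normalize to a bare hostname so lookups match config keys like "domain.com"
  return new URL(url).hostname.replace(/^www\./, '');
}

private async fetchWithCache(url: string): Promise<string> {
  const cached = this.cache.get(url);
  if (cached) return cached;

  // Throttle every real network hit; cached responses skip the wait
  await this.rateLimiter.throttle();
  const response = await fetch(url, {
    headers: { 'User-Agent': 'Mozilla/5.0...' }, // globalOptions.userAgent
    signal: AbortSignal.timeout(30_000)          // globalOptions.timeout
  });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} fetching ${url}`);
  }
  const html = await response.text();
  this.cache.set(url, html);
  return html;
}
```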
### Strategy Implementations
```typescript
// /lib/scraper/strategies/textExtractor.ts
import * as cheerio from 'cheerio';

interface TextPatternStrategy {
  pattern: string;
  group?: number;
}

interface TextBlockStrategy {
  minLength?: number;
  containerHints?: string[];
}

export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  const regex = new RegExp(config.pattern, 'i');
  const match = html.match(regex);
  // Default to capture group 1; ?? keeps an explicit group 0 (whole match) usable
  return match ? (match[config.group ?? 1] ?? '').trim() : '';
}

export function extractTextBlocks(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  const blocks: Array<{ element: any; text: string }> = [];

  $('*').each((_, elem) => {
    const $elem = $(elem);
    // Direct text only: clone, strip children, measure what remains
    const text = $elem.clone().children().remove().end().text().trim();
    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });

  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    const hasHints = config.containerHints?.some(hint =>
      $(block.element).attr('class')?.includes(hint) ||
      $(block.element).attr('id')?.includes(hint)
    );
    return hasHints || blocks.length === 1;
  });

  return storyBlock ? $(storyBlock.element).html() || '' : '';
}
```
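`executeStrategy` also dispatches to a `link-with-path` strategy that is never shown. A plausible sketch, where the `pathContains` and `attribute` options are assumptions, would live in `linkExtractor.ts`:
```typescript
// /lib/scraper/strategies/linkExtractor.ts (hypothetical sketch)
import * as cheerio from 'cheerio';

interface LinkWithPathStrategy {
  pathContains: string;        // assumed option: substring the href must contain
  attribute?: 'text' | 'href'; // assumed option: what to return from the match
}

export function extractLinkWithPath(
  $: cheerio.CheerioAPI,
  config: LinkWithPathStrategy
): string {
  // First anchor whose href contains the fragment,
  // e.g. pathContains "/author/" to read an author name from a profile link
  const link = $('a[href]')
    .filter((_, el) => ($(el).attr('href') || '').includes(config.pathContains))
    .first();
  if (!link.length) return '';
  return config.attribute === 'href'
    ? link.attr('href') || ''
    : link.text().trim();
}
```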
### Rate Limiting
```typescript
// /lib/scraper/utils/rateLimit.ts
export class RateLimiter {
  private lastRequest: number = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  async throttle(): Promise<void> {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequest;
    if (timeSinceLastRequest < this.minDelay) {
      await this.delay(this.minDelay - timeSinceLastRequest);
    }
    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
```
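Usage is a single `await` before each outbound request:
```typescript
// Enforce at least one second between fetches made through this limiter
const limiter = new RateLimiter(1000);

async function politeFetch(url: string): Promise<string> {
  await limiter.throttle(); // waits out the remainder of the window if needed
  const response = await fetch(url);
  return response.text();
}
```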
### Multi-page Story Handling
```typescript
// /lib/scraper/scraper.ts (addition)
private async extractContent(
  $: cheerio.CheerioAPI,
  storyConfig: StorySelectors,
  url: string
): Promise<string> {
  let content = await this.extractField($, storyConfig.content, $.html());

  if (storyConfig.multiPage?.enabled) {
    const additionalPages = await this.fetchAdditionalPages(
      $,
      url,
      storyConfig.multiPage
    );
    for (const pageHtml of additionalPages) {
      const $page = cheerio.load(pageHtml);
      const pageContent = await this.extractField(
        $page,
        storyConfig.content,
        pageHtml
      );
      content += '\n\n' + pageContent;
    }
  }

  return content;
}

private async fetchAdditionalPages(
  $: cheerio.CheerioAPI,
  baseUrl: string,
  config: MultiPageConfig
): Promise<string[]> {
  const pages: string[] = [];
  let $current = $; // DOM of the most recently fetched page
  let currentUrl = baseUrl;
  let pageNum = 2;

  while (pageNum <= (config.maxPages || 20)) {
    let nextUrl: string | null = null;

    if (config.strategy === 'url-pattern') {
      nextUrl = this.buildPageUrl(baseUrl, pageNum, config);
    } else if (config.nextPageSelector) {
      // Follow the "next" link from the page just fetched, not the first page
      const nextLink = $current(config.nextPageSelector).attr('href');
      if (nextLink) {
        nextUrl = new URL(nextLink, currentUrl).href;
      }
    }

    if (!nextUrl) break;

    try {
      await this.rateLimiter.throttle();
      const html = await this.fetchWithCache(nextUrl);
      pages.push(html);
      $current = cheerio.load(html);
      currentUrl = nextUrl;
      pageNum++;
    } catch (error) {
      console.error(`Failed to fetch page ${pageNum}:`, error);
      break;
    }
  }

  return pages;
}
```
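`buildPageUrl` is referenced above but not specified. Assuming the `pageParam` option from the configuration file (e.g. `"page"`), a minimal sketch could be:
```typescript
// /lib/scraper/scraper.ts (hypothetical helper sketch)
private buildPageUrl(
  baseUrl: string,
  pageNum: number,
  config: MultiPageConfig
): string {
  // Set or overwrite the page query parameter, e.g. ?page=2
  const url = new URL(baseUrl);
  url.searchParams.set(config.pageParam || 'page', String(pageNum));
  return url.href;
}
```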
### Error Handling
```typescript
// /lib/scraper/scraper.ts (addition)
async scrapeStory(url: string): Promise<ScrapedStory> {
  try {
    // ... existing implementation
  } catch (error) {
    if (error instanceof Error) {
      throw new ScraperError(
        `Failed to scrape ${url}: ${error.message}`,
        url,
        error
      );
    }
    throw error;
  }
}

export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}
```
## Configuration File Structure
```json
{
  "sites": {
    "domain.com": {
      "story": {
        "title": "selector or strategy object",
        "author": "selector or strategy object",
        "content": "selector or strategy object",
        "summary": "optional selector or strategy",
        "coverImage": "optional selector or strategy",
        "tags": "optional selector or strategy",
        "multiPage": {
          "enabled": true,
          "strategy": "url-pattern|next-link",
          "nextPageSelector": "a.next-page",
          "pageParam": "page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "selector or strategy object",
        "pagination": {
          "enabled": true,
          "nextPageSelector": "a.next"
        }
      }
    }
  },
  "globalOptions": {
    "userAgent": "Mozilla/5.0...",
    "timeout": 30000,
    "retryAttempts": 3,
    "rateLimitMs": 1000
  }
}
```
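As an illustration, a filled-in entry for a hypothetical site (all selectors invented) might look like:
```json
{
  "sites": {
    "example-fiction.com": {
      "story": {
        "title": "h1.story-title",
        "author": { "strategy": "link-with-path", "pathContains": "/author/" },
        "content": { "strategy": "text-blocks", "minLength": 500, "containerHints": ["story", "chapter"] },
        "summary": "div.story-summary",
        "multiPage": {
          "enabled": true,
          "strategy": "next-link",
          "nextPageSelector": "a.next-page"
        }
      },
      "authorPage": {
        "storyLinks": "a.story-link"
      }
    }
  }
}
```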
## Usage Example
```typescript
// In a Next.js API route
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    const story = await scraper.scrapeStory(url);
    return Response.json(story);
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json(
        { error: error.message },
        { status: 400 }
      );
    }
    return Response.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
```
## Testing Considerations
1. **Unit Tests**: Test individual strategies and extractors
2. **Integration Tests**: Test against saved HTML samples
3. **Mock External Requests**: Use saved HTML fixtures to avoid hitting real sites
4. **Edge Cases**: Empty content, missing fields, malformed HTML
5. **Rate Limiting**: Verify delays are properly applied
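
For example, an integration test against a saved fixture might look like this (assuming Vitest and a local `fixtures/` directory, both of which are assumptions):
```typescript
// /lib/scraper/__tests__/scraper.test.ts (sketch)
import { describe, it, expect, vi } from 'vitest';
import { readFileSync } from 'fs';
import { StoryScraper } from '../scraper';

describe('StoryScraper', () => {
  it('extracts required fields from a saved HTML sample', async () => {
    const html = readFileSync(
      new URL('./fixtures/sample-story.html', import.meta.url),
      'utf-8'
    );
    // Stub the network layer so the test never hits a real site
    vi.spyOn(StoryScraper.prototype as any, 'fetchWithCache').mockResolvedValue(html);

    const story = await new StoryScraper().scrapeStory('https://domain.com/story/1');
    expect(story.title).not.toBe('');
    expect(story.author).not.toBe('');
    expect(story.content.length).toBeGreaterThan(0);
  });
});
```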
## Security Considerations
1. **URL Validation**: Only accept HTTP/HTTPS URLs
2. **Domain Allowlist**: Restrict to configured domains
3. **Content Sanitization**: Clean HTML before storage
4. **Request Timeouts**: Prevent hanging on slow sites
5. **Rate Limiting**: Prevent abuse of the scraping endpoint
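
The first two points can be enforced at the API boundary before any fetch happens; a minimal sketch reusing the site configuration as the allowlist:
```typescript
// Validate scheme and domain before handing a URL to the scraper (sketch)
import sitesConfig from '@/lib/scraper/config/sites.json';

export function validateScrapeUrl(raw: string): URL {
  let url: URL;
  try {
    url = new URL(raw);
  } catch {
    throw new Error('Invalid URL');
  }
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    throw new Error('Only HTTP/HTTPS URLs are accepted');
  }
  const domain = url.hostname.replace(/^www\./, '');
  if (!(domain in sitesConfig.sites)) {
    throw new Error(`Domain not in allowlist: ${domain}`);
  }
  return url;
}
```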
## Future Enhancements
1. **Browser Automation**: Use Playwright for JavaScript-rendered content
2. **AI Content Extraction**: Use LLMs for sites without clear patterns
3. **User-Submitted Configurations**: Allow users to define selectors
4. **Scheduled Imports**: Periodic author page checking
5. **Import History**: Track what has been imported to avoid duplicates