# StoryCove Web Scraper Feature Specification
## Overview
The Web Scraper feature allows users to import stories from external websites into StoryCove by providing URLs. The scraper extracts story metadata and content using configurable selectors defined in a JSON configuration file.
## Feature Requirements

### Core Functionality
- **Single Story Import**: Users can provide a story URL and the scraper will extract:
  - Title (required)
  - Author (required)
  - Content (required)
  - Summary (optional)
  - Cover Image (optional)
  - Tags (optional)
- **Author Page Scanning**: Users can provide an author page URL to:
  - Discover all stories by that author
  - Present a selectable list of stories
  - Allow bulk import of selected stories
- **Multi-page Story Support**: Handle stories split across multiple pages by:
  - Detecting pagination
  - Fetching all pages
  - Merging content in the correct order
### User Interface Flow

- **Add Story View Enhancement**: offers `[Manual Entry] | [Import from URL]`. When "Import from URL" is selected:
  - URL input field
  - "Fetch" button
  - Loading indicator during fetch
  - Pre-filled form with scraped data
  - Ability to edit before saving
- **Bulk Import View** (future enhancement):
  - URL input for author page
  - "Scan for Stories" button
  - Checkbox list of discovered stories
  - "Import Selected" button
  - Progress indicator
## Technical Implementation

### Architecture

```
/lib/scraper/
├── config/
│   └── sites.json        # Site configurations
├── scraper.ts            # Main scraper class
├── strategies/           # Strategy implementations
│   ├── index.ts
│   ├── textExtractor.ts
│   ├── linkExtractor.ts
│   └── contentCleaner.ts
├── utils/
│   ├── rateLimit.ts
│   ├── cache.ts
│   └── urlParser.ts
└── types.ts              # TypeScript definitions
```
### API Routes

```
// /app/api/scrape/story/route.ts
POST /api/scrape/story
Body: { url: string }
Response: {
  title: string,
  author: string,
  content: string,
  summary?: string,
  coverImage?: string,
  tags?: string[]
}

// /app/api/scrape/author/route.ts
POST /api/scrape/author
Body: { url: string }
Response: {
  stories: Array<{
    url: string,
    title: string,
    author: string,
    summary?: string
  }>
}
```
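For reference, a client-side call to the story endpoint might look like the following minimal sketch (`storyUrl` is a hypothetical placeholder; the response shape follows the route definition above):

```ts
const storyUrl = 'https://example.com/stories/123'; // hypothetical

const res = await fetch('/api/scrape/story', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ url: storyUrl }),
});

if (!res.ok) {
  throw new Error(`Scrape failed with status ${res.status}`);
}

const story = await res.json(); // { title, author, content, ...optional fields }
```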
### Core Classes

```ts
// /lib/scraper/types.ts
export interface SiteConfig {
  story: StorySelectors;
  authorPage: AuthorPageSelectors;
}

export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  multiPage?: MultiPageConfig;
}

export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}

// Shapes below mirror the configuration file structure documented later
export interface MultiPageConfig {
  enabled: boolean;
  strategy: 'url-pattern' | 'next-link';
  nextPageSelector?: string;
  pageParam?: string;
  maxPages?: number;
}

export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: {
    enabled: boolean;
    nextPageSelector: string;
  };
}

export interface ScrapedStory {
  title: string;
  author: string;
  content: string;
  summary?: string;
  coverImage?: string;
  tags?: string[];
  sourceUrl: string;
}
```
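As an illustration, a strategy-based selector is simply an object whose `strategy` field names one of the cases handled in `executeStrategy` below; the remaining fields are strategy-specific (the values here are invented for illustration):

```ts
// Hypothetical title selector using the text-pattern strategy
const titleSelector: SelectorStrategy = {
  strategy: 'text-pattern',
  pattern: '<h1[^>]*>([^<]+)</h1>',
  group: 1,
};
```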
### Main Scraper Implementation

```ts
// /lib/scraper/scraper.ts
import * as cheerio from 'cheerio';
import {
  SiteConfig,
  StorySelectors,
  SelectorStrategy,
  MultiPageConfig,
  ScrapedStory
} from './types';
import { RateLimiter } from './utils/rateLimit';
import sitesConfig from './config/sites.json';

export class StoryScraper {
  private config: Record<string, SiteConfig>;
  private cache: Map<string, string>;
  private rateLimiter: RateLimiter;

  constructor() {
    this.config = sitesConfig.sites;
    this.cache = new Map();
    this.rateLimiter = new RateLimiter(sitesConfig.globalOptions.rateLimitMs);
  }

  async scrapeStory(url: string): Promise<ScrapedStory> {
    const domain = this.getDomain(url);
    const siteConfig = this.config[domain];
    if (!siteConfig) {
      throw new Error(`Unsupported site: ${domain}`);
    }

    const html = await this.fetchWithCache(url);
    const $ = cheerio.load(html);

    const story: ScrapedStory = {
      title: await this.extractField($, siteConfig.story.title, html),
      author: await this.extractField($, siteConfig.story.author, html),
      content: await this.extractContent($, siteConfig.story, url),
      sourceUrl: url
    };

    // Extract optional fields (coverImage and tags follow the same pattern)
    if (siteConfig.story.summary) {
      story.summary = await this.extractField($, siteConfig.story.summary, html);
    }

    return story;
  }

  private async extractField(
    $: cheerio.CheerioAPI,
    selector: string | SelectorStrategy,
    html: string
  ): Promise<string> {
    if (typeof selector === 'string') {
      // Simple CSS selector
      return $(selector).first().text().trim();
    }
    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }

  private async executeStrategy(
    $: cheerio.CheerioAPI,
    strategy: SelectorStrategy,
    html: string
  ): Promise<string> {
    // Each case delegates to a function under ./strategies/
    switch (strategy.strategy) {
      case 'text-pattern':
        return this.extractByTextPattern(html, strategy);
      case 'link-with-path':
        return this.extractLinkWithPath($, strategy);
      case 'text-blocks':
        return this.extractTextBlocks($, strategy);
      // ... other strategies
      default:
        throw new Error(`Unknown strategy: ${strategy.strategy}`);
    }
  }
}
```
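`getDomain` and `fetchWithCache` are referenced above but not specified. A minimal sketch, assuming the global `fetch` API (Node 18+) and the `globalOptions` block from the configuration file at the end of this document:

```ts
// Hypothetical sketches of the helpers referenced above (methods on StoryScraper)
private getDomain(url: string): string {
  // Normalize "www.example.com" and "example.com" to the same config key
  return new URL(url).hostname.replace(/^www\./, '');
}

private async fetchWithCache(url: string): Promise<string> {
  const cached = this.cache.get(url);
  if (cached) return cached;

  await this.rateLimiter.throttle();
  const response = await fetch(url, {
    headers: { 'User-Agent': sitesConfig.globalOptions.userAgent },
    signal: AbortSignal.timeout(sitesConfig.globalOptions.timeout),
  });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} fetching ${url}`);
  }

  const html = await response.text();
  this.cache.set(url, html);
  return html;
}
```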
### Strategy Implementations

```ts
// /lib/scraper/strategies/textExtractor.ts
import * as cheerio from 'cheerio';

// Strategy-specific config shapes (specializations of SelectorStrategy)
interface TextPatternStrategy {
  pattern: string;
  group?: number;
}

interface TextBlockStrategy {
  minLength?: number;
  containerHints?: string[];
}

export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  const regex = new RegExp(config.pattern, 'i');
  const match = html.match(regex);
  // Optional chaining guards against patterns with no capture group
  return match?.[config.group ?? 1]?.trim() ?? '';
}

export function extractTextBlocks(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  const blocks: Array<{ element: any; text: string }> = [];

  $('*').each((_, elem) => {
    const $elem = $(elem);
    // Text directly inside this element, excluding its children
    const text = $elem.clone().children().remove().end().text().trim();
    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });

  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    const hasHints = config.containerHints?.some(hint =>
      $(block.element).attr('class')?.includes(hint) ||
      $(block.element).attr('id')?.includes(hint)
    );
    return hasHints || blocks.length === 1;
  });

  return storyBlock ? $(storyBlock.element).html() || '' : '';
}
```
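For example, a site whose author name only appears in a byline sentence could use the text-pattern strategy like this (the HTML fragment and pattern are hypothetical):

```ts
import { extractByTextPattern } from '@/lib/scraper/strategies/textExtractor';

// Hypothetical byline fragment
const html = '<div class="byline">Written by Jane Doe on 2024-01-01</div>';

const author = extractByTextPattern(html, {
  pattern: 'Written by ([^<]+?) on',
  group: 1,
});
// author === 'Jane Doe'
```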
### Rate Limiting

```ts
// /lib/scraper/utils/rateLimit.ts
export class RateLimiter {
  private lastRequest: number = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  async throttle(): Promise<void> {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequest;
    if (timeSinceLastRequest < this.minDelay) {
      await this.delay(this.minDelay - timeSinceLastRequest);
    }
    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
```
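In use, a single limiter instance is shared across all requests in a scrape job, as in this sketch (URLs are hypothetical):

```ts
const limiter = new RateLimiter(1000);
const pageUrls = ['https://example.com/story?page=2', 'https://example.com/story?page=3'];

for (const url of pageUrls) {
  await limiter.throttle(); // ensures requests are at least 1s apart
  const response = await fetch(url);
  // ... process response
}
```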
### Multi-page Story Handling

```ts
// /lib/scraper/scraper.ts (addition)
private async extractContent(
  $: cheerio.CheerioAPI,
  storyConfig: StorySelectors,
  url: string
): Promise<string> {
  let content = await this.extractField($, storyConfig.content, $.html());

  if (storyConfig.multiPage?.enabled) {
    const additionalPages = await this.fetchAdditionalPages(
      $,
      url,
      storyConfig.multiPage
    );

    for (const pageHtml of additionalPages) {
      const $page = cheerio.load(pageHtml);
      const pageContent = await this.extractField(
        $page,
        storyConfig.content,
        pageHtml
      );
      content += '\n\n' + pageContent;
    }
  }

  return content;
}

private async fetchAdditionalPages(
  $: cheerio.CheerioAPI,
  baseUrl: string,
  config: MultiPageConfig
): Promise<string[]> {
  const pages: string[] = [];
  let $current = $;          // DOM of the most recently fetched page
  let currentUrl = baseUrl;
  let pageNum = 2;

  while (pageNum <= (config.maxPages || 20)) {
    let nextUrl: string | null = null;

    if (config.strategy === 'url-pattern') {
      nextUrl = this.buildPageUrl(baseUrl, pageNum, config);
    } else if (config.nextPageSelector) {
      // Query the most recently fetched page, not the first page,
      // otherwise the "next" link never advances
      const nextLink = $current(config.nextPageSelector).attr('href');
      if (nextLink) {
        nextUrl = new URL(nextLink, currentUrl).href;
      }
    }

    if (!nextUrl) break;

    try {
      await this.rateLimiter.throttle();
      const html = await this.fetchWithCache(nextUrl);
      pages.push(html);
      $current = cheerio.load(html);
      currentUrl = nextUrl;
      pageNum++;
    } catch (error) {
      console.error(`Failed to fetch page ${pageNum}:`, error);
      break;
    }
  }

  return pages;
}
```
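`buildPageUrl` is referenced above but not defined; a plausible sketch for the url-pattern strategy, assuming the `pageParam` field from the configuration file structure below, could be:

```ts
// Hypothetical helper for the url-pattern strategy (method on StoryScraper)
private buildPageUrl(baseUrl: string, pageNum: number, config: MultiPageConfig): string {
  const url = new URL(baseUrl);
  url.searchParams.set(config.pageParam || 'page', String(pageNum));
  return url.href;
}
```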
### Error Handling

```ts
// /lib/scraper/scraper.ts (addition)
async scrapeStory(url: string): Promise<ScrapedStory> {
  try {
    // ... existing implementation
  } catch (error) {
    if (error instanceof Error) {
      throw new ScraperError(
        `Failed to scrape ${url}: ${error.message}`,
        url,
        error
      );
    }
    throw error;
  }
}

export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}
```
### Configuration File Structure

```json
{
  "sites": {
    "domain.com": {
      "story": {
        "title": "selector or strategy object",
        "author": "selector or strategy object",
        "content": "selector or strategy object",
        "summary": "optional selector or strategy",
        "coverImage": "optional selector or strategy",
        "tags": "optional selector or strategy",
        "multiPage": {
          "enabled": true,
          "strategy": "url-pattern|next-link",
          "nextPageSelector": "a.next-page",
          "pageParam": "page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "selector or strategy object",
        "pagination": {
          "enabled": true,
          "nextPageSelector": "a.next"
        }
      }
    }
  },
  "globalOptions": {
    "userAgent": "Mozilla/5.0...",
    "timeout": 30000,
    "retryAttempts": 3,
    "rateLimitMs": 1000
  }
}
```
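A concrete entry for a hypothetical site (all domain names and selectors invented for illustration) might look like:

```json
{
  "sites": {
    "examplefiction.com": {
      "story": {
        "title": "h1.story-title",
        "author": { "strategy": "text-pattern", "pattern": "by <a[^>]*>([^<]+)</a>", "group": 1 },
        "content": { "strategy": "text-blocks", "minLength": 500, "containerHints": ["story", "chapter"] },
        "summary": "div.story-summary",
        "multiPage": {
          "enabled": true,
          "strategy": "next-link",
          "nextPageSelector": "a.next-page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "a.story-link"
      }
    }
  }
}
```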
### Usage Example

```ts
// In a Next.js API route
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    const story = await scraper.scrapeStory(url);
    return Response.json(story);
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json(
        { error: error.message },
        { status: 400 }
      );
    }
    return Response.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
```
## Testing Considerations

- **Unit Tests**: Test individual strategies and extractors
- **Integration Tests**: Test against saved HTML samples
- **Mock External Requests**: Use saved HTML fixtures to avoid hitting real sites (see the sketch after this list)
- **Edge Cases**: Empty content, missing fields, malformed HTML
- **Rate Limiting**: Verify delays are properly applied
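As a sketch of the fixture-based approach (assuming Vitest; the fixture path is hypothetical):

```ts
import { describe, it, expect } from 'vitest';
import { readFileSync } from 'node:fs';
import * as cheerio from 'cheerio';
import { extractTextBlocks } from '@/lib/scraper/strategies/textExtractor';

describe('extractTextBlocks', () => {
  it('finds the story container in a saved page', () => {
    // Saved sample avoids hitting the real site in tests
    const html = readFileSync('tests/fixtures/sample-story.html', 'utf8');
    const $ = cheerio.load(html);
    const content = extractTextBlocks($, { minLength: 500, containerHints: ['story'] });
    expect(content.length).toBeGreaterThan(500);
  });

  it('returns an empty string when nothing matches', () => {
    const $ = cheerio.load('<p>short</p>');
    expect(extractTextBlocks($, {})).toBe('');
  });
});
```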
## Security Considerations

- **URL Validation**: Only accept HTTP/HTTPS URLs (see the sketch after this list)
- **Domain Allowlist**: Restrict requests to configured domains
- **Content Sanitization**: Clean HTML before storage
- **Request Timeouts**: Prevent hanging on slow sites
- **Rate Limiting**: Prevent abuse of the scraping endpoint
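A minimal guard covering the first two points, assuming the allowlist is derived from the keys of `sites.json`:

```ts
import sitesConfig from '@/lib/scraper/config/sites.json';

// Hypothetical request guard: HTTP(S)-only URLs on configured domains
export function validateScrapeUrl(raw: string): URL {
  let url: URL;
  try {
    url = new URL(raw);
  } catch {
    throw new Error('Invalid URL');
  }
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    throw new Error('Only HTTP(S) URLs are accepted');
  }
  const domain = url.hostname.replace(/^www\./, '');
  if (!(domain in sitesConfig.sites)) {
    throw new Error(`Domain not in allowlist: ${domain}`);
  }
  return url;
}
```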
## Future Enhancements

- **Browser Automation**: Use Playwright for JavaScript-rendered content
- **AI Content Extraction**: Use LLMs for sites without clear patterns
- **User-Submitted Configurations**: Allow users to define selectors
- **Scheduled Imports**: Periodic author page checking
- **Import History**: Track what has been imported to avoid duplicates