# StoryCove Web Scraper Feature Specification

## Overview

The Web Scraper feature allows users to import stories from external websites into StoryCove by providing URLs. The scraper extracts story metadata and content using configurable selectors defined in a JSON configuration file.

## Feature Requirements

### Core Functionality

1. **Single Story Import**: Users can provide a story URL and the scraper will extract:
   - Title (required)
   - Author (required)
   - Content (required)
   - Summary (optional)
   - Cover Image (optional)
   - Tags (optional)

2. **Author Page Scanning**: Users can provide an author page URL to:
   - Discover all stories by that author
   - Present a selectable list of stories
   - Allow bulk import of selected stories

3. **Multi-page Story Support**: Handle stories split across multiple pages by:
   - Detecting pagination
   - Fetching all pages
   - Merging content in correct order

### User Interface Flow

1. **Add Story View Enhancement** (a client-side sketch of this flow follows the list below):

   ```
   [Manual Entry] | [Import from URL]

   When "Import from URL" selected:
   - URL input field
   - "Fetch" button
   - Loading indicator during fetch
   - Pre-filled form with scraped data
   - Ability to edit before saving
   ```

2. **Bulk Import View** (future enhancement):

   ```
   - URL input for author page
   - "Scan for Stories" button
   - Checkbox list of discovered stories
   - "Import Selected" button
   - Progress indicator
   ```
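
For the client side, a minimal sketch of the "Import from URL" handler, assuming a React client component in the Add Story view; the function and callback names here are illustrative, not part of the spec:

```typescript
// Illustrative client-side handler (names are hypothetical, not defined by this spec)
async function handleFetchFromUrl(
  url: string,
  prefillForm: (story: unknown) => void
): Promise<void> {
  const res = await fetch('/api/scrape/story', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ url }),
  });

  if (!res.ok) {
    const { error } = await res.json();
    throw new Error(error ?? 'Failed to import story');
  }

  // Pre-fill the Add Story form; the user can still edit before saving
  prefillForm(await res.json());
}
```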

## Technical Implementation

### Architecture

```
/lib/scraper/
├── config/
│   └── sites.json        # Site configurations
├── scraper.ts            # Main scraper class
├── strategies/           # Strategy implementations
│   ├── index.ts
│   ├── textExtractor.ts
│   ├── linkExtractor.ts
│   └── contentCleaner.ts
├── utils/
│   ├── rateLimit.ts
│   ├── cache.ts
│   └── urlParser.ts
└── types.ts              # TypeScript definitions
```

### API Routes

```typescript
// /app/api/scrape/story/route.ts
POST /api/scrape/story
Body: { url: string }
Response: {
  title: string,
  author: string,
  content: string,
  summary?: string,
  coverImage?: string,
  tags?: string[]
}

// /app/api/scrape/author/route.ts
POST /api/scrape/author
Body: { url: string }
Response: {
  stories: Array<{
    url: string,
    title: string,
    author: string,
    summary?: string
  }>
}
```
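
The author route would mirror the story route; a sketch of its handler is below, assuming a `scrapeAuthorPage` method on `StoryScraper` that this spec does not yet define:

```typescript
// /app/api/scrape/author/route.ts (sketch; scrapeAuthorPage is an assumed method name)
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    // Assumed method: returns the story list shape documented above
    const stories = await scraper.scrapeAuthorPage(url);

    return Response.json({ stories });
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json({ error: error.message }, { status: 400 });
    }
    return Response.json({ error: 'Internal server error' }, { status: 500 });
  }
}
```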

### Core Classes

```typescript
// /lib/scraper/types.ts
export interface SiteConfig {
  story: StorySelectors;
  authorPage: AuthorPageSelectors;
}

export interface StorySelectors {
  title: string | SelectorStrategy;
  author: string | SelectorStrategy;
  content: string | SelectorStrategy;
  summary?: string | SelectorStrategy;
  coverImage?: string | SelectorStrategy;
  tags?: string | SelectorStrategy;
  multiPage?: MultiPageConfig;
}

export interface SelectorStrategy {
  strategy: string;
  [key: string]: any;
}

export interface ScrapedStory {
  title: string;
  author: string;
  content: string;
  summary?: string;
  coverImage?: string;
  tags?: string[];
  sourceUrl: string;
}
```
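
`AuthorPageSelectors` and `MultiPageConfig` are referenced above but not spelled out; a plausible sketch, with field names taken from the configuration file structure later in this document:

```typescript
// /lib/scraper/types.ts (sketch; field names mirror the sites.json structure below)
export interface MultiPageConfig {
  enabled: boolean;
  strategy: 'url-pattern' | 'next-link';
  nextPageSelector?: string;  // used by the "next-link" strategy
  pageParam?: string;         // used by the "url-pattern" strategy
  maxPages?: number;
}

export interface AuthorPageSelectors {
  storyLinks: string | SelectorStrategy;
  pagination?: {
    enabled: boolean;
    nextPageSelector: string;
  };
}
```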

### Main Scraper Implementation

```typescript
// /lib/scraper/scraper.ts
import * as cheerio from 'cheerio';
import {
  SiteConfig,
  StorySelectors,
  SelectorStrategy,
  MultiPageConfig,
  ScrapedStory
} from './types';
import { RateLimiter } from './utils/rateLimit';
import sitesConfig from './config/sites.json';

export class StoryScraper {
  private config: Record<string, SiteConfig>;
  private cache: Map<string, any>;
  private rateLimiter: RateLimiter;

  constructor() {
    this.config = sitesConfig.sites;
    this.cache = new Map();
    // Shared limiter used by all outbound requests (see Rate Limiting below)
    this.rateLimiter = new RateLimiter(sitesConfig.globalOptions?.rateLimitMs);
  }

  async scrapeStory(url: string): Promise<ScrapedStory> {
    const domain = this.getDomain(url);
    const siteConfig = this.config[domain];

    if (!siteConfig) {
      throw new Error(`Unsupported site: ${domain}`);
    }

    const html = await this.fetchWithCache(url);
    const $ = cheerio.load(html);

    const story: ScrapedStory = {
      title: await this.extractField($, siteConfig.story.title, html),
      author: await this.extractField($, siteConfig.story.author, html),
      content: await this.extractContent($, siteConfig.story, url),
      sourceUrl: url
    };

    // Extract optional fields (coverImage and tags follow the same pattern as summary)
    if (siteConfig.story.summary) {
      story.summary = await this.extractField($, siteConfig.story.summary, html);
    }

    return story;
  }

  private async extractField(
    $: cheerio.CheerioAPI,
    selector: string | SelectorStrategy,
    html: string
  ): Promise<string> {
    if (typeof selector === 'string') {
      // Simple CSS selector
      return $(selector).first().text().trim();
    }

    // Strategy-based extraction
    return await this.executeStrategy($, selector, html);
  }

  private async executeStrategy(
    $: cheerio.CheerioAPI,
    strategy: SelectorStrategy,
    html: string
  ): Promise<string> {
    // These delegate to the functions in /lib/scraper/strategies
    switch (strategy.strategy) {
      case 'text-pattern':
        return this.extractByTextPattern(html, strategy);
      case 'link-with-path':
        return this.extractLinkWithPath($, strategy);
      case 'text-blocks':
        return this.extractTextBlocks($, strategy);
      // ... other strategies
      default:
        throw new Error(`Unknown extraction strategy: ${strategy.strategy}`);
    }
  }
}
```
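
`getDomain` and `fetchWithCache` are referenced above but not shown; a minimal sketch, assuming the `globalOptions` block from `sites.json` and the in-memory cache and rate limiter created in the constructor:

```typescript
// /lib/scraper/scraper.ts (sketch of the helpers referenced above)
private getDomain(url: string): string {
  // Normalize "www." so "www.domain.com" and "domain.com" share one config entry
  return new URL(url).hostname.replace(/^www\./, '');
}

private async fetchWithCache(url: string): Promise<string> {
  const cached = this.cache.get(url);
  if (cached) return cached;

  await this.rateLimiter.throttle();

  const response = await fetch(url, {
    headers: { 'User-Agent': sitesConfig.globalOptions.userAgent },
    signal: AbortSignal.timeout(sitesConfig.globalOptions.timeout)
  });
  if (!response.ok) {
    throw new Error(`Request failed with status ${response.status}`);
  }

  const html = await response.text();
  this.cache.set(url, html);
  return html;
}
```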

### Strategy Implementations

```typescript
// /lib/scraper/strategies/textExtractor.ts
import * as cheerio from 'cheerio';
import { TextPatternStrategy, TextBlockStrategy } from '../types'; // strategy-specific configs (sketched after this block)

export function extractByTextPattern(
  html: string,
  config: TextPatternStrategy
): string {
  const regex = new RegExp(config.pattern, 'i');
  const match = html.match(regex);
  // Guard against patterns that matched but lack the expected capture group
  return match?.[config.group ?? 1]?.trim() ?? '';
}

export function extractTextBlocks(
  $: cheerio.CheerioAPI,
  config: TextBlockStrategy
): string {
  const blocks: Array<{ element: any; text: string }> = [];

  $('*').each((_, elem) => {
    const $elem = $(elem);
    // Direct text only: clone the node, strip its children, then read what remains
    const text = $elem.clone().children().remove().end().text().trim();

    if (text.length >= (config.minLength || 500)) {
      blocks.push({ element: elem, text });
    }
  });

  // Find the block that likely contains story content
  const storyBlock = blocks.find(block => {
    const hasHints = config.containerHints?.some(hint =>
      $(block.element).attr('class')?.includes(hint) ||
      $(block.element).attr('id')?.includes(hint)
    );
    return hasHints || blocks.length === 1;
  });

  return storyBlock ? $(storyBlock.element).html() || '' : '';
}
```
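
`TextPatternStrategy` and `TextBlockStrategy` are not declared in the types shown earlier; a plausible shape, inferred from how the extractors use their fields:

```typescript
// /lib/scraper/types.ts (sketch of the strategy-specific configs)
export interface TextPatternStrategy extends SelectorStrategy {
  strategy: 'text-pattern';
  pattern: string;   // regular expression applied to the raw HTML
  group?: number;    // capture group to return (defaults to 1)
}

export interface TextBlockStrategy extends SelectorStrategy {
  strategy: 'text-blocks';
  minLength?: number;        // minimum direct-text length to count as a block (default 500)
  containerHints?: string[]; // class/id fragments that mark the story container
}
```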

### Rate Limiting

```typescript
// /lib/scraper/utils/rateLimit.ts
export class RateLimiter {
  private lastRequest: number = 0;
  private minDelay: number;

  constructor(minDelayMs: number = 1000) {
    this.minDelay = minDelayMs;
  }

  async throttle(): Promise<void> {
    const now = Date.now();
    const timeSinceLastRequest = now - this.lastRequest;

    if (timeSinceLastRequest < this.minDelay) {
      await this.delay(this.minDelay - timeSinceLastRequest);
    }

    this.lastRequest = Date.now();
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
```
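
Usage is a single awaited call before each outbound request; an illustrative snippet:

```typescript
// Throttle before every outbound request (illustrative; inside an async function)
const limiter = new RateLimiter(1000);   // at most one request per second
await limiter.throttle();                // waits only if the last request was < 1s ago
const html = await (await fetch(url)).text();
```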

### Multi-page Story Handling

```typescript
// /lib/scraper/scraper.ts (addition)
private async extractContent(
  $: cheerio.CheerioAPI,
  storyConfig: StorySelectors,
  url: string
): Promise<string> {
  let content = await this.extractField($, storyConfig.content, $.html());

  if (storyConfig.multiPage?.enabled) {
    const additionalPages = await this.fetchAdditionalPages(
      $,
      url,
      storyConfig.multiPage
    );

    for (const pageHtml of additionalPages) {
      const $page = cheerio.load(pageHtml);
      const pageContent = await this.extractField(
        $page,
        storyConfig.content,
        pageHtml
      );
      content += '\n\n' + pageContent;
    }
  }

  return content;
}

private async fetchAdditionalPages(
  $: cheerio.CheerioAPI,
  baseUrl: string,
  config: MultiPageConfig
): Promise<string[]> {
  const pages: string[] = [];
  let $current = $;          // DOM of the most recently fetched page
  let currentUrl = baseUrl;
  let pageNum = 2;

  while (pageNum <= (config.maxPages || 20)) {
    let nextUrl: string | null = null;

    if (config.strategy === 'url-pattern') {
      nextUrl = this.buildPageUrl(baseUrl, pageNum, config);
    } else if (config.nextPageSelector) {
      // Follow the "next" link from the page fetched last, not from the first page
      const nextLink = $current(config.nextPageSelector).attr('href');
      if (nextLink) {
        nextUrl = new URL(nextLink, currentUrl).href;
      }
    }

    if (!nextUrl) break;

    try {
      await this.rateLimiter.throttle();
      const html = await this.fetchWithCache(nextUrl);
      pages.push(html);
      $current = cheerio.load(html);
      currentUrl = nextUrl;
      pageNum++;
    } catch (error) {
      console.error(`Failed to fetch page ${pageNum}:`, error);
      break;
    }
  }

  return pages;
}
```
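
`buildPageUrl` is referenced above but not defined; a minimal sketch, assuming the `pageParam` field from the `multiPage` configuration:

```typescript
// /lib/scraper/scraper.ts (sketch of the url-pattern helper)
private buildPageUrl(baseUrl: string, pageNum: number, config: MultiPageConfig): string {
  // e.g. https://domain.com/story/123 -> https://domain.com/story/123?page=2
  const url = new URL(baseUrl);
  url.searchParams.set(config.pageParam || 'page', String(pageNum));
  return url.href;
}
```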

### Error Handling

```typescript
// /lib/scraper/scraper.ts (addition)
async scrapeStory(url: string): Promise<ScrapedStory> {
  try {
    // ... existing implementation
  } catch (error) {
    if (error instanceof Error) {
      throw new ScraperError(
        `Failed to scrape ${url}: ${error.message}`,
        url,
        error
      );
    }
    throw error;
  }
}

export class ScraperError extends Error {
  constructor(
    message: string,
    public url: string,
    public originalError?: Error
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}
```

## Configuration File Structure

```json
{
  "sites": {
    "domain.com": {
      "story": {
        "title": "selector or strategy object",
        "author": "selector or strategy object",
        "content": "selector or strategy object",
        "summary": "optional selector or strategy",
        "coverImage": "optional selector or strategy",
        "tags": "optional selector or strategy",
        "multiPage": {
          "enabled": true,
          "strategy": "url-pattern|next-link",
          "nextPageSelector": "a.next-page",
          "pageParam": "page",
          "maxPages": 20
        }
      },
      "authorPage": {
        "storyLinks": "selector or strategy object",
        "pagination": {
          "enabled": true,
          "nextPageSelector": "a.next"
        }
      }
    }
  },
  "globalOptions": {
    "userAgent": "Mozilla/5.0...",
    "timeout": 30000,
    "retryAttempts": 3,
    "rateLimitMs": 1000
  }
}
```
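
For illustration, a concrete entry mixing a plain CSS selector with strategy objects; the domain and selectors below are hypothetical placeholders, not a supported site:

```json
{
  "sites": {
    "example-fiction.com": {
      "story": {
        "title": "h1.story-title",
        "author": {
          "strategy": "text-pattern",
          "pattern": "by\\s+<a[^>]*class=\"author\"[^>]*>([^<]+)</a>",
          "group": 1
        },
        "content": {
          "strategy": "text-blocks",
          "minLength": 500,
          "containerHints": ["story", "chapter"]
        },
        "multiPage": {
          "enabled": true,
          "strategy": "next-link",
          "nextPageSelector": "a.next-page"
        }
      },
      "authorPage": {
        "storyLinks": "a.story-link"
      }
    }
  }
}
```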

## Usage Example

```typescript
// In a Next.js API route
import { StoryScraper, ScraperError } from '@/lib/scraper/scraper';

export async function POST(request: Request) {
  const { url } = await request.json();

  try {
    const scraper = new StoryScraper();
    const story = await scraper.scrapeStory(url);

    return Response.json(story);
  } catch (error) {
    if (error instanceof ScraperError) {
      return Response.json(
        { error: error.message },
        { status: 400 }
      );
    }

    return Response.json(
      { error: 'Internal server error' },
      { status: 500 }
    );
  }
}
```

## Testing Considerations

1. **Unit Tests**: Test individual strategies and extractors
2. **Integration Tests**: Test against saved HTML samples
3. **Mock External Requests**: Use saved HTML fixtures to avoid hitting real sites (see the sketch after this list)
4. **Edge Cases**: Empty content, missing fields, malformed HTML
5. **Rate Limiting**: Verify delays are properly applied
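
As an example of points 2 and 3, a fixture-based test sketch; it assumes Vitest and a hypothetical saved fixture path:

```typescript
// Sketch: fixture-based test for the text-pattern extractor (Vitest assumed)
import { readFileSync } from 'node:fs';
import { describe, it, expect } from 'vitest';
import { extractByTextPattern } from '@/lib/scraper/strategies/textExtractor';

describe('extractByTextPattern', () => {
  it('extracts the author from a saved HTML sample', () => {
    const html = readFileSync('fixtures/sample-story.html', 'utf-8'); // hypothetical fixture
    const author = extractByTextPattern(html, {
      strategy: 'text-pattern',
      pattern: 'by\\s+<a[^>]*>([^<]+)</a>',
      group: 1,
    });
    expect(author.length).toBeGreaterThan(0);
  });
});
```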

## Security Considerations

1. **URL Validation**: Only accept HTTP/HTTPS URLs (see the sketch after this list)
2. **Domain Allowlist**: Restrict to configured domains
3. **Content Sanitization**: Clean HTML before storage
4. **Request Timeouts**: Prevent hanging on slow sites
5. **Rate Limiting**: Prevent abuse of the scraping endpoint
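
A sketch covering points 1 and 2, reusing the domains configured in sites.json as the allowlist; the helper name is illustrative:

```typescript
// Sketch: validate a user-supplied URL before scraping (helper name is illustrative)
import sitesConfig from '@/lib/scraper/config/sites.json';

export function assertScrapableUrl(raw: string): URL {
  const url = new URL(raw); // throws on malformed input
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    throw new Error('Only HTTP(S) URLs are accepted');
  }
  const domain = url.hostname.replace(/^www\./, '');
  if (!(domain in sitesConfig.sites)) {
    throw new Error(`Domain not in allowlist: ${domain}`);
  }
  return url;
}
```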

## Future Enhancements

1. **Browser Automation**: Use Playwright for JavaScript-rendered content
2. **AI Content Extraction**: Use LLMs for sites without clear patterns
3. **User-Submitted Configurations**: Allow users to define selectors
4. **Scheduled Imports**: Periodic author page checking
5. **Import History**: Track what has been imported to avoid duplicates